Diffstat (limited to 'mm')

 mm/compaction.c      |  61
 mm/hugetlb.c         |  46
 mm/hwpoison-inject.c |   2
 mm/internal.h        |   4
 mm/ksm.c             | 121
 mm/memblock.c        | 387
 mm/memcontrol.c      |  17
 mm/memory-failure.c  |  10
 mm/memory.c          |  16
 mm/memory_hotplug.c  |   4
 mm/migrate.c         |  89
 mm/mlock.c           |  18
 mm/mmap.c            |  46
 mm/mprotect.c        |   3
 mm/nobootmem.c       |  10
 mm/nommu.c           |   1
 mm/oom_kill.c        |  51
 mm/page_alloc.c      |  89
 mm/page_cgroup.c     |   5
 mm/percpu.c          |  38
 mm/rmap.c            | 580
 mm/sparse-vmemmap.c  |   6
 mm/sparse.c          |  27
 mm/swap.c            | 278
 mm/util.c            |  36
 mm/vmalloc.c         |  20

 26 files changed, 1145 insertions, 820 deletions
diff --git a/mm/compaction.c b/mm/compaction.c
index f58bcd016f43..3a91a2ea3d34 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -459,6 +459,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 	unsigned long flags;
 	bool locked = false;
 	struct page *page = NULL, *valid_page = NULL;
+	bool skipped_async_unsuitable = false;
 
 	/*
 	 * Ensure that there are not too many pages isolated from the LRU
@@ -534,6 +535,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 		if (!cc->sync && last_pageblock_nr != pageblock_nr &&
 		    !migrate_async_suitable(get_pageblock_migratetype(page))) {
 			cc->finished_update_migrate = true;
+			skipped_async_unsuitable = true;
 			goto next_pageblock;
 		}
 
@@ -627,8 +629,13 @@ next_pageblock:
 	if (locked)
 		spin_unlock_irqrestore(&zone->lru_lock, flags);
 
-	/* Update the pageblock-skip if the whole pageblock was scanned */
-	if (low_pfn == end_pfn)
+	/*
+	 * Update the pageblock-skip information and cached scanner pfn,
+	 * if the whole pageblock was scanned without isolating any page.
+	 * This is not done when pageblock was skipped due to being unsuitable
+	 * for async compaction, so that eventual sync compaction can try.
+	 */
+	if (low_pfn == end_pfn && !skipped_async_unsuitable)
 		update_pageblock_skip(cc, valid_page, nr_isolated, true);
 
 	trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
@@ -660,7 +667,7 @@ static void isolate_freepages(struct zone *zone,
 	 * is the end of the pageblock the migration scanner is using.
 	 */
 	pfn = cc->free_pfn;
-	low_pfn = cc->migrate_pfn + pageblock_nr_pages;
+	low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages);
 
 	/*
 	 * Take care that if the migration scanner is at the end of the zone
@@ -676,7 +683,7 @@ static void isolate_freepages(struct zone *zone,
 	 * pages on cc->migratepages. We stop searching if the migrate
 	 * and free page scanners meet or enough free pages are isolated.
 	 */
-	for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages;
+	for (; pfn >= low_pfn && cc->nr_migratepages > nr_freepages;
 					pfn -= pageblock_nr_pages) {
 		unsigned long isolated;
 
@@ -738,7 +745,14 @@ static void isolate_freepages(struct zone *zone,
 	/* split_free_page does not map the pages */
 	map_pages(freelist);
 
-	cc->free_pfn = high_pfn;
+	/*
+	 * If we crossed the migrate scanner, we want to keep it that way
+	 * so that compact_finished() may detect this
+	 */
+	if (pfn < low_pfn)
+		cc->free_pfn = max(pfn, zone->zone_start_pfn);
+	else
+		cc->free_pfn = high_pfn;
 	cc->nr_freepages = nr_freepages;
 }
 
@@ -837,6 +851,10 @@ static int compact_finished(struct zone *zone,
 
 	/* Compaction run completes if the migrate and free scanner meet */
 	if (cc->free_pfn <= cc->migrate_pfn) {
+		/* Let the next compaction start anew. */
+		zone->compact_cached_migrate_pfn = zone->zone_start_pfn;
+		zone->compact_cached_free_pfn = zone_end_pfn(zone);
+
 		/*
 		 * Mark that the PG_migrate_skip information should be cleared
 		 * by kswapd when it goes to sleep. kswapd does not set the
@@ -947,6 +965,14 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 	}
 
 	/*
+	 * Clear pageblock skip if there were failures recently and compaction
+	 * is about to be retried after being deferred. kswapd does not do
+	 * this reset as it'll reset the cached information when going to sleep.
+	 */
+	if (compaction_restarting(zone, cc->order) && !current_is_kswapd())
+		__reset_isolation_suitable(zone);
+
+	/*
 	 * Setup to move all movable pages to the end of the zone. Used cached
 	 * information on where the scanners should start but check that it
 	 * is initialised by ensuring the values are within zone boundaries.
@@ -962,13 +988,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 		zone->compact_cached_migrate_pfn = cc->migrate_pfn;
 	}
 
-	/*
-	 * Clear pageblock skip if there were failures recently and compaction
-	 * is about to be retried after being deferred. kswapd does not do
-	 * this reset as it'll reset the cached information when going to sleep.
-	 */
-	if (compaction_restarting(zone, cc->order) && !current_is_kswapd())
-		__reset_isolation_suitable(zone);
+	trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn);
 
 	migrate_prep_local();
 
@@ -1003,7 +1023,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 		if (err) {
 			putback_movable_pages(&cc->migratepages);
 			cc->nr_migratepages = 0;
-			if (err == -ENOMEM) {
+			/*
+			 * migrate_pages() may return -ENOMEM when scanners meet
+			 * and we want compact_finished() to detect it
+			 */
+			if (err == -ENOMEM && cc->free_pfn > cc->migrate_pfn) {
 				ret = COMPACT_PARTIAL;
 				goto out;
 			}
@@ -1015,6 +1039,8 @@ out:
 	cc->nr_freepages -= release_freepages(&cc->freepages);
 	VM_BUG_ON(cc->nr_freepages != 0);
 
+	trace_mm_compaction_end(ret);
+
 	return ret;
 }
 
@@ -1120,12 +1146,11 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
 		compact_zone(zone, cc);
 
 		if (cc->order > 0) {
-			int ok = zone_watermark_ok(zone, cc->order,
-						low_wmark_pages(zone), 0, 0);
-			if (ok && cc->order >= zone->compact_order_failed)
-				zone->compact_order_failed = cc->order + 1;
+			if (zone_watermark_ok(zone, cc->order,
+						low_wmark_pages(zone), 0, 0))
+				compaction_defer_reset(zone, cc->order, false);
 			/* Currently async compaction is never deferred. */
-			else if (!ok && cc->sync)
+			else if (cc->sync)
 				defer_compaction(zone, cc->order);
 		}
 
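[Editorial note, not part of the commit] The mm/compaction.c hunks above change how the free-page scanner handles crossing the migrate scanner: low_pfn is rounded up to the first pageblock past the migrate scanner, and when the scan walks past it, free_pfn is clamped so the later "scanners met" test (free_pfn <= migrate_pfn) still fires. The following is a minimal standalone C sketch of that bookkeeping under simplified assumptions; struct cc_model and align_up() are stand-ins for the kernel's compact_control and ALIGN(), and the isolation work itself is omitted.

/* Simplified userspace model of the free/migrate scanner bookkeeping. */
#include <stdio.h>

#define PAGEBLOCK_NR_PAGES 512UL

/* Round up to the next pageblock boundary, as ALIGN() does in the kernel. */
static unsigned long align_up(unsigned long pfn, unsigned long step)
{
	return (pfn + step - 1) / step * step;
}

struct cc_model {
	unsigned long migrate_pfn;
	unsigned long free_pfn;
	unsigned long zone_start_pfn;
};

static void isolate_freepages_model(struct cc_model *cc)
{
	/* First pageblock strictly above the migrate scanner. */
	unsigned long low_pfn = align_up(cc->migrate_pfn + 1, PAGEBLOCK_NR_PAGES);
	unsigned long pfn = cc->free_pfn;

	/* Pretend no pageblock yielded enough free pages, so the scan runs out. */
	for (; pfn >= low_pfn; pfn -= PAGEBLOCK_NR_PAGES)
		;

	/* Crossed the migrate scanner: keep it that way (clamped to the zone). */
	if (pfn < low_pfn)
		cc->free_pfn = pfn > cc->zone_start_pfn ? pfn : cc->zone_start_pfn;
	else
		cc->free_pfn = pfn;
}

int main(void)
{
	struct cc_model cc = { .migrate_pfn = 2048, .free_pfn = 2560,
			       .zone_start_pfn = 0 };

	isolate_freepages_model(&cc);
	printf("free_pfn=%lu migrate_pfn=%lu met=%d\n", cc.free_pfn,
	       cc.migrate_pfn, cc.free_pfn <= cc.migrate_pfn);
	return 0;
}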
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index dee6cf4e6d34..04306b9de90d 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -690,15 +690,11 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order)
  */
 int PageHuge(struct page *page)
 {
-	compound_page_dtor *dtor;
-
 	if (!PageCompound(page))
 		return 0;
 
 	page = compound_head(page);
-	dtor = get_compound_page_dtor(page);
-
-	return dtor == free_huge_page;
+	return get_compound_page_dtor(page) == free_huge_page;
 }
 EXPORT_SYMBOL_GPL(PageHuge);
 
@@ -708,16 +704,11 @@ EXPORT_SYMBOL_GPL(PageHuge);
  */
 int PageHeadHuge(struct page *page_head)
 {
-	compound_page_dtor *dtor;
-
 	if (!PageHead(page_head))
 		return 0;
 
-	dtor = get_compound_page_dtor(page_head);
-
-	return dtor == free_huge_page;
+	return get_compound_page_dtor(page_head) == free_huge_page;
 }
-EXPORT_SYMBOL_GPL(PageHeadHuge);
 
 pgoff_t __basepage_index(struct page *page)
 {
@@ -1280,9 +1271,9 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
 	for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
 		void *addr;
 
-		addr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
-				huge_page_size(h), huge_page_size(h), 0);
-
+		addr = memblock_virt_alloc_try_nid_nopanic(
+				huge_page_size(h), huge_page_size(h),
+				0, BOOTMEM_ALLOC_ACCESSIBLE, node);
 		if (addr) {
 			/*
 			 * Use the beginning of the huge page to store the
@@ -1322,8 +1313,8 @@ static void __init gather_bootmem_prealloc(void)
 
 #ifdef CONFIG_HIGHMEM
 		page = pfn_to_page(m->phys >> PAGE_SHIFT);
-		free_bootmem_late((unsigned long)m,
-				  sizeof(struct huge_bootmem_page));
+		memblock_free_late(__pa(m),
+				   sizeof(struct huge_bootmem_page));
 #else
 		page = virt_to_page(m);
 #endif
@@ -2355,17 +2346,27 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 	int cow;
 	struct hstate *h = hstate_vma(vma);
 	unsigned long sz = huge_page_size(h);
+	unsigned long mmun_start;	/* For mmu_notifiers */
+	unsigned long mmun_end;		/* For mmu_notifiers */
+	int ret = 0;
 
 	cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
 
+	mmun_start = vma->vm_start;
+	mmun_end = vma->vm_end;
+	if (cow)
+		mmu_notifier_invalidate_range_start(src, mmun_start, mmun_end);
+
 	for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
 		spinlock_t *src_ptl, *dst_ptl;
 		src_pte = huge_pte_offset(src, addr);
 		if (!src_pte)
 			continue;
 		dst_pte = huge_pte_alloc(dst, addr, sz);
-		if (!dst_pte)
-			goto nomem;
+		if (!dst_pte) {
+			ret = -ENOMEM;
+			break;
+		}
 
 		/* If the pagetables are shared don't copy or take references */
 		if (dst_pte == src_pte)
@@ -2386,10 +2387,11 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 		spin_unlock(src_ptl);
 		spin_unlock(dst_ptl);
 	}
-	return 0;
 
-nomem:
-	return -ENOMEM;
+	if (cow)
+		mmu_notifier_invalidate_range_end(src, mmun_start, mmun_end);
+
+	return ret;
 }
 
 static int is_hugetlb_entry_migration(pte_t pte)
@@ -3079,7 +3081,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 same_page:
 	if (pages) {
 		pages[i] = mem_map_offset(page, pfn_offset);
-		get_page(pages[i]);
+		get_page_foll(pages[i]);
 	}
 
 	if (vmas)
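[Editorial note, not part of the commit] The copy_hugetlb_page_range() hunks above bracket the copy loop with mmu_notifier_invalidate_range_start()/end(), which is why the old "goto nomem; return -ENOMEM" had to become "ret = -ENOMEM; break" so the paired end call always runs. The sketch below models that error-handling shape in plain C under stated assumptions: begin_batch() and end_batch() are hypothetical stand-ins for the notifier calls, and the malloc() failure stands in for huge_pte_alloc() failing.

#include <stdio.h>
#include <stdlib.h>

static void begin_batch(void) { puts("begin"); }
static void end_batch(void)   { puts("end"); }

static int copy_range(size_t n, long fail_at)
{
	int ret = 0;
	size_t i;

	begin_batch();
	for (i = 0; i < n; i++) {
		/* Simulate an allocation failure at index fail_at. */
		void *p = (fail_at >= 0 && (size_t)fail_at == i) ? NULL : malloc(64);

		if (!p) {
			ret = -1;	/* was: goto nomem; return -ENOMEM */
			break;		/* still fall through to end_batch() */
		}
		free(p);
	}
	end_batch();			/* always runs, even on failure */

	return ret;
}

int main(void)
{
	printf("ret=%d\n", copy_range(4, 2));
	return 0;
}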
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index 4c84678371eb..95487c71cad5 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -55,7 +55,7 @@ static int hwpoison_inject(void *data, u64 val)
 	return 0;
 
 inject:
-	printk(KERN_INFO "Injecting memory failure at pfn %lx\n", pfn);
+	pr_info("Injecting memory failure at pfn %#lx\n", pfn);
 	return memory_failure(pfn, 18, MF_COUNT_INCREASED);
 }
 
diff --git a/mm/internal.h b/mm/internal.h
index 684f7aa9692a..a346ba120e42 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -47,11 +47,9 @@ static inline void __get_page_tail_foll(struct page *page,
 	 * page_cache_get_speculative()) on tail pages.
 	 */
 	VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0);
-	VM_BUG_ON(atomic_read(&page->_count) != 0);
-	VM_BUG_ON(page_mapcount(page) < 0);
 	if (get_page_head)
 		atomic_inc(&page->first_page->_count);
-	atomic_inc(&page->_mapcount);
+	get_huge_page_tail(page);
 }
 
 /*
diff --git a/mm/ksm.c b/mm/ksm.c
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1891,21 +1891,24 @@ struct page *ksm_might_need_to_copy(struct page *page,
 	return new_page;
 }
 
-int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg,
-			unsigned long *vm_flags)
+int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
 {
 	struct stable_node *stable_node;
 	struct rmap_item *rmap_item;
-	unsigned int mapcount = page_mapcount(page);
-	int referenced = 0;
+	int ret = SWAP_AGAIN;
 	int search_new_forks = 0;
 
 	VM_BUG_ON(!PageKsm(page));
+
+	/*
+	 * Rely on the page lock to protect against concurrent modifications
+	 * to that page's node of the stable tree.
+	 */
 	VM_BUG_ON(!PageLocked(page));
 
 	stable_node = page_stable_node(page);
 	if (!stable_node)
-		return 0;
+		return ret;
 again:
 	hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
 		struct anon_vma *anon_vma = rmap_item->anon_vma;
@@ -1928,113 +1931,16 @@ again:
 			if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
 				continue;
 
-			if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
-				continue;
-
-			referenced += page_referenced_one(page, vma,
-				rmap_item->address, &mapcount, vm_flags);
-			if (!search_new_forks || !mapcount)
-				break;
-		}
-		anon_vma_unlock_read(anon_vma);
-		if (!mapcount)
-			goto out;
-	}
-	if (!search_new_forks++)
-		goto again;
-out:
-	return referenced;
-}
-
-int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
-{
-	struct stable_node *stable_node;
-	struct rmap_item *rmap_item;
-	int ret = SWAP_AGAIN;
-	int search_new_forks = 0;
-
-	VM_BUG_ON(!PageKsm(page));
-	VM_BUG_ON(!PageLocked(page));
-
-	stable_node = page_stable_node(page);
-	if (!stable_node)
-		return SWAP_FAIL;
-again:
-	hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
-		struct anon_vma *anon_vma = rmap_item->anon_vma;
-		struct anon_vma_chain *vmac;
-		struct vm_area_struct *vma;
-
-		anon_vma_lock_read(anon_vma);
-		anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
-					       0, ULONG_MAX) {
-			vma = vmac->vma;
-			if (rmap_item->address < vma->vm_start ||
-			    rmap_item->address >= vma->vm_end)
-				continue;
-			/*
-			 * Initially we examine only the vma which covers this
-			 * rmap_item; but later, if there is still work to do,
-			 * we examine covering vmas in other mms: in case they
-			 * were forked from the original since ksmd passed.
-			 */
-			if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
+			if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
 				continue;
 
-			ret = try_to_unmap_one(page, vma,
-					rmap_item->address, flags);
-			if (ret != SWAP_AGAIN || !page_mapped(page)) {
+			ret = rwc->rmap_one(page, vma,
+					rmap_item->address, rwc->arg);
+			if (ret != SWAP_AGAIN) {
 				anon_vma_unlock_read(anon_vma);
 				goto out;
 			}
-		}
-		anon_vma_unlock_read(anon_vma);
-	}
-	if (!search_new_forks++)
-		goto again;
-out:
-	return ret;
-}
-
-#ifdef CONFIG_MIGRATION
-int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *,
-		struct vm_area_struct *, unsigned long, void *), void *arg)
-{
-	struct stable_node *stable_node;
-	struct rmap_item *rmap_item;
-	int ret = SWAP_AGAIN;
-	int search_new_forks = 0;
-
-	VM_BUG_ON(!PageKsm(page));
-	VM_BUG_ON(!PageLocked(page));
-
-	stable_node = page_stable_node(page);
-	if (!stable_node)
-		return ret;
-again:
-	hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
-		struct anon_vma *anon_vma = rmap_item->anon_vma;
-		struct anon_vma_chain *vmac;
-		struct vm_area_struct *vma;
-
-		anon_vma_lock_read(anon_vma);
-		anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
-					       0, ULONG_MAX) {
-			vma = vmac->vma;
-			if (rmap_item->address < vma->vm_start ||
-			    rmap_item->address >= vma->vm_end)
-				continue;
-			/*
-			 * Initially we examine only the vma which covers this
-			 * rmap_item; but later, if there is still work to do,
-			 * we examine covering vmas in other mms: in case they
-			 * were forked from the original since ksmd passed.
-			 */
-			if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
-				continue;
-
-			ret = rmap_one(page, vma, rmap_item->address, arg);
-			if (ret != SWAP_AGAIN) {
+			if (rwc->done && rwc->done(page)) {
 				anon_vma_unlock_read(anon_vma);
 				goto out;
 			}
@@ -2047,6 +1953,7 @@ out:
 	return ret;
 }
 
+#ifdef CONFIG_MIGRATION
 void ksm_migrate_page(struct page *newpage, struct page *oldpage)
 {
 	struct stable_node *stable_node;
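[Editorial note, not part of the commit] The mm/ksm.c hunks above collapse three near-identical walkers (page_referenced_ksm, try_to_unmap_ksm, the old rmap_walk_ksm) into one walker driven by an rmap_walk_control of callbacks. The standalone C sketch below is a loose model of that callback-struct pattern, not the kernel interface: struct vma here is a toy type, and in the kernel rmap_one() and done() operate on the page rather than the generic arg shown here.

#include <stdio.h>

#define SWAP_AGAIN 1

struct vma { int id; };

struct rmap_walk_control {
	void *arg;
	int (*rmap_one)(struct vma *vma, unsigned long addr, void *arg);
	int (*done)(void *arg);                          /* optional early stop */
	int (*invalid_vma)(struct vma *vma, void *arg);  /* optional filter */
};

/* One generic walker replaces several special-purpose ones. */
static int rmap_walk(struct vma *vmas, int nr, struct rmap_walk_control *rwc)
{
	int ret = SWAP_AGAIN;
	int i;

	for (i = 0; i < nr; i++) {
		if (rwc->invalid_vma && rwc->invalid_vma(&vmas[i], rwc->arg))
			continue;
		ret = rwc->rmap_one(&vmas[i], 0x1000, rwc->arg);
		if (ret != SWAP_AGAIN)
			break;
		if (rwc->done && rwc->done(rwc->arg))
			break;
	}
	return ret;
}

/* Example client: count visited mappings and stop after two. */
static int count_one(struct vma *vma, unsigned long addr, void *arg)
{
	(*(int *)arg)++;
	printf("visited vma %d at %#lx\n", vma->id, addr);
	return SWAP_AGAIN;
}

static int count_done(void *arg)
{
	return *(int *)arg >= 2;
}

int main(void)
{
	struct vma vmas[3] = { {1}, {2}, {3} };
	int count = 0;
	struct rmap_walk_control rwc = {
		.arg = &count, .rmap_one = count_one, .done = count_done,
	};

	rmap_walk(vmas, 3, &rwc);
	printf("count=%d\n", count);
	return 0;
}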
diff --git a/mm/memblock.c b/mm/memblock.c
index 53e477bb5558..1c2ef2c7edab 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -21,6 +21,9 @@
 #include <linux/memblock.h>
 
 #include <asm-generic/sections.h>
+#include <linux/io.h>
+
+#include "internal.h"
 
 static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
 static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
@@ -39,6 +42,9 @@ struct memblock memblock __initdata_memblock = {
 };
 
 int memblock_debug __initdata_memblock;
+#ifdef CONFIG_MOVABLE_NODE
+bool movable_node_enabled __initdata_memblock = false;
+#endif
 static int memblock_can_resize __initdata_memblock;
 static int memblock_memory_in_slab __initdata_memblock = 0;
 static int memblock_reserved_in_slab __initdata_memblock = 0;
@@ -91,7 +97,7 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
  * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
  * @size: size of free area to find
  * @align: alignment of free area to find
- * @nid: nid of the free area to find, %MAX_NUMNODES for any node
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
  *
  * Utility called from memblock_find_in_range_node(), find free area bottom-up.
  *
@@ -123,7 +129,7 @@ __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
 * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
 * @size: size of free area to find
 * @align: alignment of free area to find
- * @nid: nid of the free area to find, %MAX_NUMNODES for any node
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
 *
 * Utility called from memblock_find_in_range_node(), find free area top-down.
 *
@@ -154,11 +160,11 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
 
 /**
  * memblock_find_in_range_node - find free area in given range and node
- * @start: start of candidate range
- * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
  * @size: size of free area to find
  * @align: alignment of free area to find
- * @nid: nid of the free area to find, %MAX_NUMNODES for any node
+ * @start: start of candidate range
+ * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
 *
 * Find @size free area aligned to @align in the specified range and node.
 *
@@ -173,9 +179,9 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
 * RETURNS:
 * Found address on success, 0 on failure.
 */
-phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
-					phys_addr_t end, phys_addr_t size,
-					phys_addr_t align, int nid)
+phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
+					phys_addr_t align, phys_addr_t start,
+					phys_addr_t end, int nid)
 {
 	int ret;
 	phys_addr_t kernel_end;
@@ -238,8 +244,8 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
 					phys_addr_t end, phys_addr_t size,
 					phys_addr_t align)
 {
-	return memblock_find_in_range_node(start, end, size, align,
-					   MAX_NUMNODES);
+	return memblock_find_in_range_node(size, align, start, end,
+					   NUMA_NO_NODE);
 }
 
 static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r)
@@ -255,6 +261,7 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u
 		type->cnt = 1;
 		type->regions[0].base = 0;
 		type->regions[0].size = 0;
+		type->regions[0].flags = 0;
 		memblock_set_region_node(&type->regions[0], MAX_NUMNODES);
 	}
 }
@@ -265,6 +272,19 @@ phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info(
 	if (memblock.reserved.regions == memblock_reserved_init_regions)
 		return 0;
 
+	/*
+	 * Don't allow nobootmem allocator to free reserved memory regions
+	 * array if
+	 *  - CONFIG_DEBUG_FS is enabled;
+	 *  - CONFIG_ARCH_DISCARD_MEMBLOCK is not enabled;
+	 *  - reserved memory regions array have been resized during boot.
+	 * Otherwise debug_fs entry "sys/kernel/debug/memblock/reserved"
+	 * will show garbage instead of state of memory reservations.
+	 */
+	if (IS_ENABLED(CONFIG_DEBUG_FS) &&
+	    !IS_ENABLED(CONFIG_ARCH_DISCARD_MEMBLOCK))
+		return 0;
+
 	*addr = __pa(memblock.reserved.regions);
 
 	return PAGE_ALIGN(sizeof(struct memblock_region) *
@@ -405,7 +425,8 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type)
 
 		if (this->base + this->size != next->base ||
 		    memblock_get_region_node(this) !=
-		    memblock_get_region_node(next)) {
+		    memblock_get_region_node(next) ||
+		    this->flags != next->flags) {
 			BUG_ON(this->base + this->size > next->base);
 			i++;
 			continue;
@@ -425,13 +446,15 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type)
  * @base: base address of the new region
  * @size: size of the new region
  * @nid: node id of the new region
+ * @flags: flags of the new region
 *
 * Insert new memblock region [@base,@base+@size) into @type at @idx.
 * @type must already have extra room to accomodate the new region.
 */
 static void __init_memblock memblock_insert_region(struct memblock_type *type,
						   int idx, phys_addr_t base,
-						   phys_addr_t size, int nid)
+						   phys_addr_t size,
+						   int nid, unsigned long flags)
 {
 	struct memblock_region *rgn = &type->regions[idx];
 
@@ -439,6 +462,7 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type,
 	memmove(rgn + 1, rgn, (type->cnt - idx) * sizeof(*rgn));
 	rgn->base = base;
 	rgn->size = size;
+	rgn->flags = flags;
 	memblock_set_region_node(rgn, nid);
 	type->cnt++;
 	type->total_size += size;
@@ -450,6 +474,7 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type,
 * @base: base address of the new region
 * @size: size of the new region
 * @nid: nid of the new region
+ * @flags: flags of the new region
 *
 * Add new memblock region [@base,@base+@size) into @type.  The new region
 * is allowed to overlap with existing ones - overlaps don't affect already
@@ -460,7 +485,8 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type,
 * 0 on success, -errno on failure.
 */
 static int __init_memblock memblock_add_region(struct memblock_type *type,
-				phys_addr_t base, phys_addr_t size, int nid)
+				phys_addr_t base, phys_addr_t size,
+				int nid, unsigned long flags)
 {
 	bool insert = false;
 	phys_addr_t obase = base;
@@ -475,6 +501,7 @@ static int __init_memblock memblock_add_region(struct memblock_type *type,
 		WARN_ON(type->cnt != 1 || type->total_size);
 		type->regions[0].base = base;
 		type->regions[0].size = size;
+		type->regions[0].flags = flags;
 		memblock_set_region_node(&type->regions[0], nid);
 		type->total_size = size;
 		return 0;
@@ -505,7 +532,8 @@ repeat:
 			nr_new++;
 			if (insert)
 				memblock_insert_region(type, i++, base,
-						       rbase - base, nid);
+						       rbase - base, nid,
+						       flags);
 		}
 		/* area below @rend is dealt with, forget about it */
 		base = min(rend, end);
@@ -515,7 +543,8 @@ repeat:
 	if (base < end) {
 		nr_new++;
 		if (insert)
-			memblock_insert_region(type, i, base, end - base, nid);
+			memblock_insert_region(type, i, base, end - base,
+					       nid, flags);
 	}
 
 	/*
@@ -537,12 +566,13 @@ repeat:
 int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size,
				       int nid)
 {
-	return memblock_add_region(&memblock.memory, base, size, nid);
+	return memblock_add_region(&memblock.memory, base, size, nid, 0);
 }
 
 int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
 {
-	return memblock_add_region(&memblock.memory, base, size, MAX_NUMNODES);
+	return memblock_add_region(&memblock.memory, base, size,
+				   MAX_NUMNODES, 0);
 }
 
 /**
@@ -597,7 +627,8 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
 			rgn->size -= base - rbase;
 			type->total_size -= base - rbase;
 			memblock_insert_region(type, i, rbase, base - rbase,
-					       memblock_get_region_node(rgn));
+					       memblock_get_region_node(rgn),
+					       rgn->flags);
 		} else if (rend > end) {
 			/*
 			 * @rgn intersects from above. Split and redo the
@@ -607,7 +638,8 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
 			rgn->size -= end - rbase;
 			type->total_size -= end - rbase;
 			memblock_insert_region(type, i--, rbase, end - rbase,
-					       memblock_get_region_node(rgn));
+					       memblock_get_region_node(rgn),
+					       rgn->flags);
 		} else {
 			/* @rgn is fully contained, record it */
 			if (!*end_rgn)
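[Editorial note, not part of the commit] The memblock_merge_regions() hunk above adds a third condition to the merge rule: adjacent regions must now also carry identical flags (so a MEMBLOCK_HOTPLUG region is never folded into a plain one). A toy standalone C model of that rule, with struct region as a simplified stand-in for struct memblock_region:

#include <stdio.h>

struct region {
	unsigned long long base;
	unsigned long long size;
	int nid;
	unsigned long flags;
};

static int can_merge(const struct region *this, const struct region *next)
{
	return this->base + this->size == next->base &&
	       this->nid == next->nid &&
	       this->flags == next->flags;	/* new criterion in the diff */
}

int main(void)
{
	struct region a = { 0x0,    0x1000, 0, 0x0 };
	struct region b = { 0x1000, 0x1000, 0, 0x1 };	/* hotplug-like flag */
	struct region c = { 0x2000, 0x1000, 0, 0x1 };

	printf("a+b mergeable: %d\n", can_merge(&a, &b));	/* 0: flags differ */
	printf("b+c mergeable: %d\n", can_merge(&b, &c));	/* 1 */
	return 0;
}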
@@ -643,28 +675,89 @@ int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size)
 {
 	memblock_dbg("   memblock_free: [%#016llx-%#016llx] %pF\n",
 		     (unsigned long long)base,
-		     (unsigned long long)base + size,
+		     (unsigned long long)base + size - 1,
 		     (void *)_RET_IP_);
 
 	return __memblock_remove(&memblock.reserved, base, size);
 }
 
-int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
+static int __init_memblock memblock_reserve_region(phys_addr_t base,
						    phys_addr_t size,
						    int nid,
						    unsigned long flags)
 {
 	struct memblock_type *_rgn = &memblock.reserved;
 
-	memblock_dbg("memblock_reserve: [%#016llx-%#016llx] %pF\n",
+	memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n",
 		     (unsigned long long)base,
-		     (unsigned long long)base + size,
-		     (void *)_RET_IP_);
+		     (unsigned long long)base + size - 1,
+		     flags, (void *)_RET_IP_);
 
-	return memblock_add_region(_rgn, base, size, MAX_NUMNODES);
+	return memblock_add_region(_rgn, base, size, nid, flags);
+}
+
+int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
+{
+	return memblock_reserve_region(base, size, MAX_NUMNODES, 0);
+}
+
+/**
+ * memblock_mark_hotplug - Mark hotpluggable memory with flag MEMBLOCK_HOTPLUG.
+ * @base: the base phys addr of the region
+ * @size: the size of the region
+ *
+ * This function isolates region [@base, @base + @size), and mark it with flag
+ * MEMBLOCK_HOTPLUG.
+ *
+ * Return 0 on succees, -errno on failure.
+ */
+int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size)
+{
+	struct memblock_type *type = &memblock.memory;
+	int i, ret, start_rgn, end_rgn;
+
+	ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn);
+	if (ret)
+		return ret;
+
+	for (i = start_rgn; i < end_rgn; i++)
+		memblock_set_region_flags(&type->regions[i], MEMBLOCK_HOTPLUG);
+
+	memblock_merge_regions(type);
+	return 0;
+}
+
+/**
+ * memblock_clear_hotplug - Clear flag MEMBLOCK_HOTPLUG for a specified region.
+ * @base: the base phys addr of the region
+ * @size: the size of the region
+ *
+ * This function isolates region [@base, @base + @size), and clear flag
+ * MEMBLOCK_HOTPLUG for the isolated regions.
+ *
+ * Return 0 on succees, -errno on failure.
+ */
+int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
+{
+	struct memblock_type *type = &memblock.memory;
+	int i, ret, start_rgn, end_rgn;
+
+	ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn);
+	if (ret)
+		return ret;
+
+	for (i = start_rgn; i < end_rgn; i++)
+		memblock_clear_region_flags(&type->regions[i],
+					    MEMBLOCK_HOTPLUG);
+
+	memblock_merge_regions(type);
+	return 0;
 }
 
 /**
 * __next_free_mem_range - next function for for_each_free_mem_range()
 * @idx: pointer to u64 loop variable
- * @nid: node selector, %MAX_NUMNODES for all nodes
+ * @nid: node selector, %NUMA_NO_NODE for all nodes
 * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
 * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
 * @out_nid: ptr to int for nid of the range, can be %NULL
@@ -693,13 +786,16 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid,
 	int mi = *idx & 0xffffffff;
 	int ri = *idx >> 32;
 
+	if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
+		nid = NUMA_NO_NODE;
+
 	for ( ; mi < mem->cnt; mi++) {
 		struct memblock_region *m = &mem->regions[mi];
 		phys_addr_t m_start = m->base;
 		phys_addr_t m_end = m->base + m->size;
 
 		/* only memory regions are associated with nodes, check it */
-		if (nid != MAX_NUMNODES && nid != memblock_get_region_node(m))
+		if (nid != NUMA_NO_NODE && nid != memblock_get_region_node(m))
 			continue;
 
 		/* scan areas before each reservation for intersection */
@@ -740,12 +836,17 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid,
 /**
 * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse()
 * @idx: pointer to u64 loop variable
- * @nid: nid: node selector, %MAX_NUMNODES for all nodes
+ * @nid: nid: node selector, %NUMA_NO_NODE for all nodes
 * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
 * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
 * @out_nid: ptr to int for nid of the range, can be %NULL
 *
 * Reverse of __next_free_mem_range().
+ *
+ * Linux kernel cannot migrate pages used by itself. Memory hotplug users won't
+ * be able to hot-remove hotpluggable memory used by the kernel. So this
+ * function skip hotpluggable regions if needed when allocating memory for the
+ * kernel.
 */
 void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid,
					   phys_addr_t *out_start,
@@ -756,6 +857,9 @@ void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid,
 	int mi = *idx & 0xffffffff;
 	int ri = *idx >> 32;
 
+	if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
+		nid = NUMA_NO_NODE;
+
 	if (*idx == (u64)ULLONG_MAX) {
 		mi = mem->cnt - 1;
 		ri = rsv->cnt;
@@ -767,7 +871,11 @@ void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid,
 		phys_addr_t m_end = m->base + m->size;
 
 		/* only memory regions are associated with nodes, check it */
-		if (nid != MAX_NUMNODES && nid != memblock_get_region_node(m))
+		if (nid != NUMA_NO_NODE && nid != memblock_get_region_node(m))
+			continue;
+
+		/* skip hotpluggable memory regions if needed */
+		if (movable_node_is_enabled() && memblock_is_hotpluggable(m))
 			continue;
 
 		/* scan areas before each reservation for intersection */
@@ -837,18 +945,18 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid,
 * memblock_set_node - set node ID on memblock regions
 * @base: base of area to set node ID for
 * @size: size of area to set node ID for
+ * @type: memblock type to set node ID for
 * @nid: node ID to set
 *
- * Set the nid of memblock memory regions in [@base,@base+@size) to @nid.
+ * Set the nid of memblock @type regions in [@base,@base+@size) to @nid.
 * Regions which cross the area boundaries are split as necessary.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
 int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
-				      int nid)
+				      struct memblock_type *type, int nid)
 {
-	struct memblock_type *type = &memblock.memory;
 	int start_rgn, end_rgn;
 	int i, ret;
 
@@ -870,13 +978,13 @@ static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size,
 {
 	phys_addr_t found;
 
-	if (WARN_ON(!align))
-		align = __alignof__(long long);
+	if (!align)
+		align = SMP_CACHE_BYTES;
 
 	/* align @size to avoid excessive fragmentation on reserved array */
 	size = round_up(size, align);
 
-	found = memblock_find_in_range_node(0, max_addr, size, align, nid);
+	found = memblock_find_in_range_node(size, align, 0, max_addr, nid);
 	if (found && !memblock_reserve(found, size))
 		return found;
 
@@ -890,7 +998,7 @@ phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int n
 
 phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
 {
-	return memblock_alloc_base_nid(size, align, max_addr, MAX_NUMNODES);
+	return memblock_alloc_base_nid(size, align, max_addr, NUMA_NO_NODE);
 }
 
 phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
@@ -920,6 +1028,207 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, i | |||
920 | return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); | 1028 | return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); |
921 | } | 1029 | } |
922 | 1030 | ||
1031 | /** | ||
1032 | * memblock_virt_alloc_internal - allocate boot memory block | ||
1033 | * @size: size of memory block to be allocated in bytes | ||
1034 | * @align: alignment of the region and block's size | ||
1035 | * @min_addr: the lower bound of the memory region to allocate (phys address) | ||
1036 | * @max_addr: the upper bound of the memory region to allocate (phys address) | ||
1037 | * @nid: nid of the free area to find, %NUMA_NO_NODE for any node | ||
1038 | * | ||
1039 | * The @min_addr limit is dropped if it can not be satisfied and the allocation | ||
1040 | * will fall back to memory below @min_addr. Also, allocation may fall back | ||
1041 | * to any node in the system if the specified node can not | ||
1042 | * hold the requested memory. | ||
1043 | * | ||
1044 | * The allocation is performed from memory region limited by | ||
1045 | * memblock.current_limit if @max_addr == %BOOTMEM_ALLOC_ACCESSIBLE. | ||
1046 | * | ||
1047 | * The memory block is aligned on SMP_CACHE_BYTES if @align == 0. | ||
1048 | * | ||
1049 | * The phys address of allocated boot memory block is converted to virtual and | ||
1050 | * allocated memory is reset to 0. | ||
1051 | * | ||
1052 | * In addition, function sets the min_count to 0 using kmemleak_alloc for | ||
1053 | * allocated boot memory block, so that it is never reported as leaks. | ||
1054 | * | ||
1055 | * RETURNS: | ||
1056 | * Virtual address of allocated memory block on success, NULL on failure. | ||
1057 | */ | ||
1058 | static void * __init memblock_virt_alloc_internal( | ||
1059 | phys_addr_t size, phys_addr_t align, | ||
1060 | phys_addr_t min_addr, phys_addr_t max_addr, | ||
1061 | int nid) | ||
1062 | { | ||
1063 | phys_addr_t alloc; | ||
1064 | void *ptr; | ||
1065 | |||
1066 | if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) | ||
1067 | nid = NUMA_NO_NODE; | ||
1068 | |||
1069 | /* | ||
1070 | * Detect any accidental use of these APIs after slab is ready, as at | ||
1071 | * this moment memblock may be deinitialized already and its | ||
1072 | * internal data may be destroyed (after execution of free_all_bootmem) | ||
1073 | */ | ||
1074 | if (WARN_ON_ONCE(slab_is_available())) | ||
1075 | return kzalloc_node(size, GFP_NOWAIT, nid); | ||
1076 | |||
1077 | if (!align) | ||
1078 | align = SMP_CACHE_BYTES; | ||
1079 | |||
1080 | /* align @size to avoid excessive fragmentation on reserved array */ | ||
1081 | size = round_up(size, align); | ||
1082 | |||
1083 | again: | ||
1084 | alloc = memblock_find_in_range_node(size, align, min_addr, max_addr, | ||
1085 | nid); | ||
1086 | if (alloc) | ||
1087 | goto done; | ||
1088 | |||
1089 | if (nid != NUMA_NO_NODE) { | ||
1090 | alloc = memblock_find_in_range_node(size, align, min_addr, | ||
1091 | max_addr, NUMA_NO_NODE); | ||
1092 | if (alloc) | ||
1093 | goto done; | ||
1094 | } | ||
1095 | |||
1096 | if (min_addr) { | ||
1097 | min_addr = 0; | ||
1098 | goto again; | ||
1099 | } else { | ||
1100 | goto error; | ||
1101 | } | ||
1102 | |||
1103 | done: | ||
1104 | memblock_reserve(alloc, size); | ||
1105 | ptr = phys_to_virt(alloc); | ||
1106 | memset(ptr, 0, size); | ||
1107 | |||
1108 | /* | ||
1109 | * The min_count is set to 0 so that bootmem allocated blocks | ||
1110 | * are never reported as leaks. This is because many of these blocks | ||
1111 | * are only referred via the physical address which is not | ||
1112 | * looked up by kmemleak. | ||
1113 | */ | ||
1114 | kmemleak_alloc(ptr, size, 0, 0); | ||
1115 | |||
1116 | return ptr; | ||
1117 | |||
1118 | error: | ||
1119 | return NULL; | ||
1120 | } | ||
1121 | |||
1122 | /** | ||
1123 | * memblock_virt_alloc_try_nid_nopanic - allocate boot memory block | ||
1124 | * @size: size of memory block to be allocated in bytes | ||
1125 | * @align: alignment of the region and block's size | ||
1126 | * @min_addr: the lower bound of the memory region from where the allocation | ||
1127 | * is preferred (phys address) | ||
1128 | * @max_addr: the upper bound of the memory region from where the allocation | ||
1129 | * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to | ||
1130 | * allocate only from memory limited by memblock.current_limit value | ||
1131 | * @nid: nid of the free area to find, %NUMA_NO_NODE for any node | ||
1132 | * | ||
1133 | * Public version of _memblock_virt_alloc_try_nid_nopanic() which provides | ||
1134 | * additional debug information (including caller info), if enabled. | ||
1135 | * | ||
1136 | * RETURNS: | ||
1137 | * Virtual address of allocated memory block on success, NULL on failure. | ||
1138 | */ | ||
1139 | void * __init memblock_virt_alloc_try_nid_nopanic( | ||
1140 | phys_addr_t size, phys_addr_t align, | ||
1141 | phys_addr_t min_addr, phys_addr_t max_addr, | ||
1142 | int nid) | ||
1143 | { | ||
1144 | memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n", | ||
1145 | __func__, (u64)size, (u64)align, nid, (u64)min_addr, | ||
1146 | (u64)max_addr, (void *)_RET_IP_); | ||
1147 | return memblock_virt_alloc_internal(size, align, min_addr, | ||
1148 | max_addr, nid); | ||
1149 | } | ||
1150 | |||
1151 | /** | ||
1152 | * memblock_virt_alloc_try_nid - allocate boot memory block with panicking | ||
1153 | * @size: size of memory block to be allocated in bytes | ||
1154 | * @align: alignment of the region and block's size | ||
1155 | * @min_addr: the lower bound of the memory region from where the allocation | ||
1156 | * is preferred (phys address) | ||
1157 | * @max_addr: the upper bound of the memory region from where the allocation | ||
1158 | * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to | ||
1159 | * allocate only from memory limited by memblock.current_limit value | ||
1160 | * @nid: nid of the free area to find, %NUMA_NO_NODE for any node | ||
1161 | * | ||
1162 | * Public panicking version of _memblock_virt_alloc_try_nid_nopanic() | ||
1163 | * which provides debug information (including caller info), if enabled, | ||
1164 | * and panics if the request can not be satisfied. | ||
1165 | * | ||
1166 | * RETURNS: | ||
1167 | * Virtual address of allocated memory block on success, NULL on failure. | ||
1168 | */ | ||
1169 | void * __init memblock_virt_alloc_try_nid( | ||
1170 | phys_addr_t size, phys_addr_t align, | ||
1171 | phys_addr_t min_addr, phys_addr_t max_addr, | ||
1172 | int nid) | ||
1173 | { | ||
1174 | void *ptr; | ||
1175 | |||
1176 | memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n", | ||
1177 | __func__, (u64)size, (u64)align, nid, (u64)min_addr, | ||
1178 | (u64)max_addr, (void *)_RET_IP_); | ||
1179 | ptr = memblock_virt_alloc_internal(size, align, | ||
1180 | min_addr, max_addr, nid); | ||
1181 | if (ptr) | ||
1182 | return ptr; | ||
1183 | |||
1184 | panic("%s: Failed to allocate %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx\n", | ||
1185 | __func__, (u64)size, (u64)align, nid, (u64)min_addr, | ||
1186 | (u64)max_addr); | ||
1187 | return NULL; | ||
1188 | } | ||
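By contrast, the panicking variant lets callers skip the NULL check entirely, which is the point of having both. A sketch, assuming a boot-critical allocation; example_buf and example_consume() are made up.

    static void __init example_alloc_essential(unsigned long bytes)
    {
            void *example_buf;

            example_buf = memblock_virt_alloc_try_nid(bytes, SMP_CACHE_BYTES,
                            0, BOOTMEM_ALLOC_ACCESSIBLE, NUMA_NO_NODE);
            /*
             * No NULL check needed: a failure has already panicked, and the
             * block comes back zeroed (see the memset in the internal helper).
             */
            example_consume(example_buf, bytes);            /* hypothetical consumer */
    }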
1189 | |||
1190 | /** | ||
1191 | * __memblock_free_early - free boot memory block | ||
1192 | * @base: phys starting address of the boot memory block | ||
1193 | * @size: size of the boot memory block in bytes | ||
1194 | * | ||
1195 | * Free boot memory block previously allocated by memblock_virt_alloc_xx() API. | ||
1196 | * The freed memory will not be released to the buddy allocator. | ||
1197 | */ | ||
1198 | void __init __memblock_free_early(phys_addr_t base, phys_addr_t size) | ||
1199 | { | ||
1200 | memblock_dbg("%s: [%#016llx-%#016llx] %pF\n", | ||
1201 | __func__, (u64)base, (u64)base + size - 1, | ||
1202 | (void *)_RET_IP_); | ||
1203 | kmemleak_free_part(__va(base), size); | ||
1204 | __memblock_remove(&memblock.reserved, base, size); | ||
1205 | } | ||
1206 | |||
1207 | /* | ||
1208 | * __memblock_free_late - free bootmem block pages directly to buddy allocator | ||
1209 | * @addr: phys starting address of the boot memory block | ||
1210 | * @size: size of the boot memory block in bytes | ||
1211 | * | ||
1212 | * This is only useful when the bootmem allocator has already been torn | ||
1213 | * down, but we are still initializing the system. Pages are released directly | ||
1214 | * to the buddy allocator, no bootmem metadata is updated because it is gone. | ||
1215 | */ | ||
1216 | void __init __memblock_free_late(phys_addr_t base, phys_addr_t size) | ||
1217 | { | ||
1218 | u64 cursor, end; | ||
1219 | |||
1220 | memblock_dbg("%s: [%#016llx-%#016llx] %pF\n", | ||
1221 | __func__, (u64)base, (u64)base + size - 1, | ||
1222 | (void *)_RET_IP_); | ||
1223 | kmemleak_free_part(__va(base), size); | ||
1224 | cursor = PFN_UP(base); | ||
1225 | end = PFN_DOWN(base + size); | ||
1226 | |||
1227 | for (; cursor < end; cursor++) { | ||
1228 | __free_pages_bootmem(pfn_to_page(cursor), 0); | ||
1229 | totalram_pages++; | ||
1230 | } | ||
1231 | } | ||
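The two free helpers cover different boot phases. A sketch of when each applies; the buddy_ready flag is hypothetical, real callers simply know which phase they are in.

    static void __init example_release(phys_addr_t base, phys_addr_t size,
                                       bool buddy_ready)
    {
            if (!buddy_ready)
                    /* early boot: just drop the memblock reservation */
                    __memblock_free_early(base, size);
            else
                    /* bootmem already torn down: hand pages to the buddy allocator */
                    __memblock_free_late(base, size);
    }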
923 | 1232 | ||
924 | /* | 1233 | /* |
925 | * Remaining API functions | 1234 | * Remaining API functions |
@@ -1101,6 +1410,7 @@ void __init_memblock memblock_set_current_limit(phys_addr_t limit) | |||
1101 | static void __init_memblock memblock_dump(struct memblock_type *type, char *name) | 1410 | static void __init_memblock memblock_dump(struct memblock_type *type, char *name) |
1102 | { | 1411 | { |
1103 | unsigned long long base, size; | 1412 | unsigned long long base, size; |
1413 | unsigned long flags; | ||
1104 | int i; | 1414 | int i; |
1105 | 1415 | ||
1106 | pr_info(" %s.cnt = 0x%lx\n", name, type->cnt); | 1416 | pr_info(" %s.cnt = 0x%lx\n", name, type->cnt); |
@@ -1111,13 +1421,14 @@ static void __init_memblock memblock_dump(struct memblock_type *type, char *name | |||
1111 | 1421 | ||
1112 | base = rgn->base; | 1422 | base = rgn->base; |
1113 | size = rgn->size; | 1423 | size = rgn->size; |
1424 | flags = rgn->flags; | ||
1114 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | 1425 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
1115 | if (memblock_get_region_node(rgn) != MAX_NUMNODES) | 1426 | if (memblock_get_region_node(rgn) != MAX_NUMNODES) |
1116 | snprintf(nid_buf, sizeof(nid_buf), " on node %d", | 1427 | snprintf(nid_buf, sizeof(nid_buf), " on node %d", |
1117 | memblock_get_region_node(rgn)); | 1428 | memblock_get_region_node(rgn)); |
1118 | #endif | 1429 | #endif |
1119 | pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s\n", | 1430 | pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s flags: %#lx\n", |
1120 | name, i, base, base + size - 1, size, nid_buf); | 1431 | name, i, base, base + size - 1, size, nid_buf, flags); |
1121 | } | 1432 | } |
1122 | } | 1433 | } |
1123 | 1434 | ||
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7caff36180cd..67dd2a881433 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -1688,13 +1688,13 @@ static void move_unlock_mem_cgroup(struct mem_cgroup *memcg, | |||
1688 | */ | 1688 | */ |
1689 | void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) | 1689 | void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) |
1690 | { | 1690 | { |
1691 | struct cgroup *task_cgrp; | ||
1692 | struct cgroup *mem_cgrp; | ||
1693 | /* | 1691 | /* |
1694 | * Need a buffer in BSS, can't rely on allocations. The code relies | 1692 | * protects memcg_name and makes sure that parallel ooms do not |
1695 | * on the assumption that OOM is serialized for memory controller. | 1693 | * interleave |
1696 | * If this assumption is broken, revisit this code. | ||
1697 | */ | 1694 | */ |
1695 | static DEFINE_SPINLOCK(oom_info_lock); | ||
1696 | struct cgroup *task_cgrp; | ||
1697 | struct cgroup *mem_cgrp; | ||
1698 | static char memcg_name[PATH_MAX]; | 1698 | static char memcg_name[PATH_MAX]; |
1699 | int ret; | 1699 | int ret; |
1700 | struct mem_cgroup *iter; | 1700 | struct mem_cgroup *iter; |
@@ -1703,6 +1703,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) | |||
1703 | if (!p) | 1703 | if (!p) |
1704 | return; | 1704 | return; |
1705 | 1705 | ||
1706 | spin_lock(&oom_info_lock); | ||
1706 | rcu_read_lock(); | 1707 | rcu_read_lock(); |
1707 | 1708 | ||
1708 | mem_cgrp = memcg->css.cgroup; | 1709 | mem_cgrp = memcg->css.cgroup; |
@@ -1771,6 +1772,7 @@ done: | |||
1771 | 1772 | ||
1772 | pr_cont("\n"); | 1773 | pr_cont("\n"); |
1773 | } | 1774 | } |
1775 | spin_unlock(&oom_info_lock); | ||
1774 | } | 1776 | } |
1775 | 1777 | ||
1776 | /* | 1778 | /* |
@@ -3000,7 +3002,8 @@ static DEFINE_MUTEX(set_limit_mutex); | |||
3000 | static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) | 3002 | static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) |
3001 | { | 3003 | { |
3002 | return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) && | 3004 | return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) && |
3003 | (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK); | 3005 | (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK) == |
3006 | KMEM_ACCOUNTED_MASK; | ||
3004 | } | 3007 | } |
3005 | 3008 | ||
3006 | /* | 3009 | /* |
@@ -3126,7 +3129,7 @@ int memcg_cache_id(struct mem_cgroup *memcg) | |||
3126 | * But when we create a new cache, we can call this as well if its parent | 3129 | * But when we create a new cache, we can call this as well if its parent |
3127 | * is kmem-limited. That will have to hold set_limit_mutex as well. | 3130 | * is kmem-limited. That will have to hold set_limit_mutex as well. |
3128 | */ | 3131 | */ |
3129 | int memcg_update_cache_sizes(struct mem_cgroup *memcg) | 3132 | static int memcg_update_cache_sizes(struct mem_cgroup *memcg) |
3130 | { | 3133 | { |
3131 | int num, ret; | 3134 | int num, ret; |
3132 | 3135 | ||
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index fabe55046c1d..b25ed321e667 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -611,7 +611,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) | |||
611 | } | 611 | } |
612 | 612 | ||
613 | /* | 613 | /* |
614 | * Dirty cache page page | 614 | * Dirty pagecache page |
615 | * Issues: when the error hit a hole page the error is not properly | 615 | * Issues: when the error hit a hole page the error is not properly |
616 | * propagated. | 616 | * propagated. |
617 | */ | 617 | */ |
@@ -1585,7 +1585,13 @@ static int __soft_offline_page(struct page *page, int flags) | |||
1585 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, | 1585 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, |
1586 | MIGRATE_SYNC, MR_MEMORY_FAILURE); | 1586 | MIGRATE_SYNC, MR_MEMORY_FAILURE); |
1587 | if (ret) { | 1587 | if (ret) { |
1588 | putback_lru_pages(&pagelist); | 1588 | if (!list_empty(&pagelist)) { |
1589 | list_del(&page->lru); | ||
1590 | dec_zone_page_state(page, NR_ISOLATED_ANON + | ||
1591 | page_is_file_cache(page)); | ||
1592 | putback_lru_page(page); | ||
1593 | } | ||
1594 | |||
1589 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", | 1595 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", |
1590 | pfn, ret, page->flags); | 1596 | pfn, ret, page->flags); |
1591 | if (ret > 0) | 1597 | if (ret > 0) |
diff --git a/mm/memory.c b/mm/memory.c index 6768ce9e57d2..86487dfa5e59 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -59,6 +59,7 @@ | |||
59 | #include <linux/gfp.h> | 59 | #include <linux/gfp.h> |
60 | #include <linux/migrate.h> | 60 | #include <linux/migrate.h> |
61 | #include <linux/string.h> | 61 | #include <linux/string.h> |
62 | #include <linux/dma-debug.h> | ||
62 | 63 | ||
63 | #include <asm/io.h> | 64 | #include <asm/io.h> |
64 | #include <asm/pgalloc.h> | 65 | #include <asm/pgalloc.h> |
@@ -2559,6 +2560,8 @@ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, | |||
2559 | 2560 | ||
2560 | static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) | 2561 | static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) |
2561 | { | 2562 | { |
2563 | debug_dma_assert_idle(src); | ||
2564 | |||
2562 | /* | 2565 | /* |
2563 | * If the source page was a PFN mapping, we don't have | 2566 | * If the source page was a PFN mapping, we don't have |
2564 | * a "struct page" for it. We do a best-effort copy by | 2567 | * a "struct page" for it. We do a best-effort copy by |
@@ -4272,11 +4275,20 @@ void copy_user_huge_page(struct page *dst, struct page *src, | |||
4272 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ | 4275 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ |
4273 | 4276 | ||
4274 | #if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS | 4277 | #if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS |
4278 | |||
4279 | static struct kmem_cache *page_ptl_cachep; | ||
4280 | |||
4281 | void __init ptlock_cache_init(void) | ||
4282 | { | ||
4283 | page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0, | ||
4284 | SLAB_PANIC, NULL); | ||
4285 | } | ||
4286 | |||
4275 | bool ptlock_alloc(struct page *page) | 4287 | bool ptlock_alloc(struct page *page) |
4276 | { | 4288 | { |
4277 | spinlock_t *ptl; | 4289 | spinlock_t *ptl; |
4278 | 4290 | ||
4279 | ptl = kmalloc(sizeof(spinlock_t), GFP_KERNEL); | 4291 | ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL); |
4280 | if (!ptl) | 4292 | if (!ptl) |
4281 | return false; | 4293 | return false; |
4282 | page->ptl = ptl; | 4294 | page->ptl = ptl; |
@@ -4285,6 +4297,6 @@ bool ptlock_alloc(struct page *page) | |||
4285 | 4297 | ||
4286 | void ptlock_free(struct page *page) | 4298 | void ptlock_free(struct page *page) |
4287 | { | 4299 | { |
4288 | kfree(page->ptl); | 4300 | kmem_cache_free(page_ptl_cachep, page->ptl); |
4289 | } | 4301 | } |
4290 | #endif | 4302 | #endif |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 489f235502db..cc2ab37220b7 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -9,7 +9,6 @@ | |||
9 | #include <linux/swap.h> | 9 | #include <linux/swap.h> |
10 | #include <linux/interrupt.h> | 10 | #include <linux/interrupt.h> |
11 | #include <linux/pagemap.h> | 11 | #include <linux/pagemap.h> |
12 | #include <linux/bootmem.h> | ||
13 | #include <linux/compiler.h> | 12 | #include <linux/compiler.h> |
14 | #include <linux/export.h> | 13 | #include <linux/export.h> |
15 | #include <linux/pagevec.h> | 14 | #include <linux/pagevec.h> |
@@ -269,7 +268,7 @@ static void fix_zone_id(struct zone *zone, unsigned long start_pfn, | |||
269 | } | 268 | } |
270 | 269 | ||
271 | /* Can fail with -ENOMEM from allocating a wait table with vmalloc() or | 270 | /* Can fail with -ENOMEM from allocating a wait table with vmalloc() or |
272 | * alloc_bootmem_node_nopanic() */ | 271 | * alloc_bootmem_node_nopanic()/memblock_virt_alloc_node_nopanic() */ |
273 | static int __ref ensure_zone_is_initialized(struct zone *zone, | 272 | static int __ref ensure_zone_is_initialized(struct zone *zone, |
274 | unsigned long start_pfn, unsigned long num_pages) | 273 | unsigned long start_pfn, unsigned long num_pages) |
275 | { | 274 | { |
@@ -1446,6 +1445,7 @@ static int __init cmdline_parse_movable_node(char *p) | |||
1446 | * the kernel away from hotpluggable memory. | 1445 | * the kernel away from hotpluggable memory. |
1447 | */ | 1446 | */ |
1448 | memblock_set_bottom_up(true); | 1447 | memblock_set_bottom_up(true); |
1448 | movable_node_enabled = true; | ||
1449 | #else | 1449 | #else |
1450 | pr_warn("movable_node option not supported\n"); | 1450 | pr_warn("movable_node option not supported\n"); |
1451 | #endif | 1451 | #endif |
diff --git a/mm/migrate.c b/mm/migrate.c index 9194375b2307..a8025befc323 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -72,28 +72,12 @@ int migrate_prep_local(void) | |||
72 | } | 72 | } |
73 | 73 | ||
74 | /* | 74 | /* |
75 | * Add isolated pages on the list back to the LRU under page lock | ||
76 | * to avoid leaking evictable pages back onto unevictable list. | ||
77 | */ | ||
78 | void putback_lru_pages(struct list_head *l) | ||
79 | { | ||
80 | struct page *page; | ||
81 | struct page *page2; | ||
82 | |||
83 | list_for_each_entry_safe(page, page2, l, lru) { | ||
84 | list_del(&page->lru); | ||
85 | dec_zone_page_state(page, NR_ISOLATED_ANON + | ||
86 | page_is_file_cache(page)); | ||
87 | putback_lru_page(page); | ||
88 | } | ||
89 | } | ||
90 | |||
91 | /* | ||
92 | * Put previously isolated pages back onto the appropriate lists | 75 | * Put previously isolated pages back onto the appropriate lists |
93 | * from where they were once taken off for compaction/migration. | 76 | * from where they were once taken off for compaction/migration. |
94 | * | 77 | * |
95 | * This function shall be used instead of putback_lru_pages(), | 78 | * This function shall be used whenever the isolated pageset has been |
96 | * whenever the isolated pageset has been built by isolate_migratepages_range() | 79 | * built from lru, balloon, hugetlbfs page. See isolate_migratepages_range() |
80 | * and isolate_huge_page(). | ||
97 | */ | 81 | */ |
98 | void putback_movable_pages(struct list_head *l) | 82 | void putback_movable_pages(struct list_head *l) |
99 | { | 83 | { |
@@ -199,7 +183,12 @@ out: | |||
199 | */ | 183 | */ |
200 | static void remove_migration_ptes(struct page *old, struct page *new) | 184 | static void remove_migration_ptes(struct page *old, struct page *new) |
201 | { | 185 | { |
202 | rmap_walk(new, remove_migration_pte, old); | 186 | struct rmap_walk_control rwc = { |
187 | .rmap_one = remove_migration_pte, | ||
188 | .arg = old, | ||
189 | }; | ||
190 | |||
191 | rmap_walk(new, &rwc); | ||
203 | } | 192 | } |
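The hunk above is the pattern for the new rmap_walk() interface used throughout this series: the caller packs the per-page callback and an opaque cookie into an rmap_walk_control instead of passing them as separate arguments. A minimal sketch, assuming the callback keeps remove_migration_pte()'s signature; my_rmap_one and the cookie are hypothetical.

    static int my_rmap_one(struct page *page, struct vm_area_struct *vma,
                           unsigned long addr, void *arg)
    {
            /* ... inspect or rewrite the mapping of 'page' at 'addr' ... */
            return SWAP_AGAIN;              /* keep walking the remaining vmas */
    }

    static void example_walk(struct page *page, void *cookie)
    {
            struct rmap_walk_control rwc = {
                    .rmap_one = my_rmap_one,
                    .arg = cookie,
            };

            rmap_walk(page, &rwc);
    }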
204 | 193 | ||
205 | /* | 194 | /* |
@@ -563,14 +552,6 @@ void migrate_page_copy(struct page *newpage, struct page *page) | |||
563 | * Migration functions | 552 | * Migration functions |
564 | ***********************************************************/ | 553 | ***********************************************************/ |
565 | 554 | ||
566 | /* Always fail migration. Used for mappings that are not movable */ | ||
567 | int fail_migrate_page(struct address_space *mapping, | ||
568 | struct page *newpage, struct page *page) | ||
569 | { | ||
570 | return -EIO; | ||
571 | } | ||
572 | EXPORT_SYMBOL(fail_migrate_page); | ||
573 | |||
574 | /* | 555 | /* |
575 | * Common logic to directly migrate a single page suitable for | 556 | * Common logic to directly migrate a single page suitable for |
576 | * pages that do not use PagePrivate/PagePrivate2. | 557 | * pages that do not use PagePrivate/PagePrivate2. |
@@ -1008,7 +989,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, | |||
1008 | { | 989 | { |
1009 | int rc = 0; | 990 | int rc = 0; |
1010 | int *result = NULL; | 991 | int *result = NULL; |
1011 | struct page *new_hpage = get_new_page(hpage, private, &result); | 992 | struct page *new_hpage; |
1012 | struct anon_vma *anon_vma = NULL; | 993 | struct anon_vma *anon_vma = NULL; |
1013 | 994 | ||
1014 | /* | 995 | /* |
@@ -1018,9 +999,12 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, | |||
1018 | * tables or check whether the hugepage is pmd-based or not before | 999 | * tables or check whether the hugepage is pmd-based or not before |
1019 | * kicking migration. | 1000 | * kicking migration. |
1020 | */ | 1001 | */ |
1021 | if (!hugepage_migration_support(page_hstate(hpage))) | 1002 | if (!hugepage_migration_support(page_hstate(hpage))) { |
1003 | putback_active_hugepage(hpage); | ||
1022 | return -ENOSYS; | 1004 | return -ENOSYS; |
1005 | } | ||
1023 | 1006 | ||
1007 | new_hpage = get_new_page(hpage, private, &result); | ||
1024 | if (!new_hpage) | 1008 | if (!new_hpage) |
1025 | return -ENOMEM; | 1009 | return -ENOMEM; |
1026 | 1010 | ||
@@ -1120,7 +1104,12 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, | |||
1120 | nr_succeeded++; | 1104 | nr_succeeded++; |
1121 | break; | 1105 | break; |
1122 | default: | 1106 | default: |
1123 | /* Permanent failure */ | 1107 | /* |
1108 | * Permanent failure (-EBUSY, -ENOSYS, etc.): | ||
1109 | * unlike -EAGAIN case, the failed page is | ||
1110 | * removed from migration page list and not | ||
1111 | * retried in the next outer loop. | ||
1112 | */ | ||
1124 | nr_failed++; | 1113 | nr_failed++; |
1125 | break; | 1114 | break; |
1126 | } | 1115 | } |
@@ -1594,31 +1583,38 @@ bool migrate_ratelimited(int node) | |||
1594 | } | 1583 | } |
1595 | 1584 | ||
1596 | /* Returns true if the node is migrate rate-limited after the update */ | 1585 | /* Returns true if the node is migrate rate-limited after the update */ |
1597 | bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages) | 1586 | static bool numamigrate_update_ratelimit(pg_data_t *pgdat, |
1587 | unsigned long nr_pages) | ||
1598 | { | 1588 | { |
1599 | bool rate_limited = false; | ||
1600 | |||
1601 | /* | 1589 | /* |
1602 | * Rate-limit the amount of data that is being migrated to a node. | 1590 | * Rate-limit the amount of data that is being migrated to a node. |
1603 | * Optimal placement is no good if the memory bus is saturated and | 1591 | * Optimal placement is no good if the memory bus is saturated and |
1604 | * all the time is being spent migrating! | 1592 | * all the time is being spent migrating! |
1605 | */ | 1593 | */ |
1606 | spin_lock(&pgdat->numabalancing_migrate_lock); | ||
1607 | if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) { | 1594 | if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) { |
1595 | spin_lock(&pgdat->numabalancing_migrate_lock); | ||
1608 | pgdat->numabalancing_migrate_nr_pages = 0; | 1596 | pgdat->numabalancing_migrate_nr_pages = 0; |
1609 | pgdat->numabalancing_migrate_next_window = jiffies + | 1597 | pgdat->numabalancing_migrate_next_window = jiffies + |
1610 | msecs_to_jiffies(migrate_interval_millisecs); | 1598 | msecs_to_jiffies(migrate_interval_millisecs); |
1599 | spin_unlock(&pgdat->numabalancing_migrate_lock); | ||
1611 | } | 1600 | } |
1612 | if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) | 1601 | if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) { |
1613 | rate_limited = true; | 1602 | trace_mm_numa_migrate_ratelimit(current, pgdat->node_id, |
1614 | else | 1603 | nr_pages); |
1615 | pgdat->numabalancing_migrate_nr_pages += nr_pages; | 1604 | return true; |
1616 | spin_unlock(&pgdat->numabalancing_migrate_lock); | 1605 | } |
1617 | 1606 | ||
1618 | return rate_limited; | 1607 | /* |
1608 | * This is an unlocked non-atomic update so errors are possible. | ||
1609 | * The consequences are failing to migrate when we potentiall should | ||
1610 | * have which is not severe enough to warrant locking. If it is ever | ||
1611 | * a problem, it can be converted to a per-cpu counter. | ||
1612 | */ | ||
1613 | pgdat->numabalancing_migrate_nr_pages += nr_pages; | ||
1614 | return false; | ||
1619 | } | 1615 | } |
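The window logic above is easy to model outside the kernel. The sketch below simulates it in plain C: jiffies becomes a caller-supplied now value, the wrap-safe time_after() is simplified to a plain comparison, and the spinlock around the window reset is omitted, mirroring the deliberately unlocked counter update in the hunk.

    #include <stdbool.h>

    static unsigned long next_window;     /* models numabalancing_migrate_next_window */
    static unsigned long window_pages;    /* models numabalancing_migrate_nr_pages */

    static bool ratelimit(unsigned long now, unsigned long window_len,
                          unsigned long limit, unsigned long nr_pages)
    {
            if (now >= next_window) {             /* window expired: reset the budget */
                    window_pages = 0;
                    next_window = now + window_len;
            }
            if (window_pages > limit)             /* budget exceeded: rate-limited */
                    return true;
            window_pages += nr_pages;             /* tolerated racy update */
            return false;
    }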
1620 | 1616 | ||
1621 | int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) | 1617 | static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) |
1622 | { | 1618 | { |
1623 | int page_lru; | 1619 | int page_lru; |
1624 | 1620 | ||
@@ -1705,7 +1701,12 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, | |||
1705 | nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page, | 1701 | nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page, |
1706 | node, MIGRATE_ASYNC, MR_NUMA_MISPLACED); | 1702 | node, MIGRATE_ASYNC, MR_NUMA_MISPLACED); |
1707 | if (nr_remaining) { | 1703 | if (nr_remaining) { |
1708 | putback_lru_pages(&migratepages); | 1704 | if (!list_empty(&migratepages)) { |
1705 | list_del(&page->lru); | ||
1706 | dec_zone_page_state(page, NR_ISOLATED_ANON + | ||
1707 | page_is_file_cache(page)); | ||
1708 | putback_lru_page(page); | ||
1709 | } | ||
1709 | isolated = 0; | 1710 | isolated = 0; |
1710 | } else | 1711 | } else |
1711 | count_vm_numa_event(NUMA_PAGE_MIGRATE); | 1712 | count_vm_numa_event(NUMA_PAGE_MIGRATE); |
diff --git a/mm/mlock.c b/mm/mlock.c index 192e6eebe4f2..10819ed4df3e 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
@@ -709,19 +709,21 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) | |||
709 | 709 | ||
710 | lru_add_drain_all(); /* flush pagevec */ | 710 | lru_add_drain_all(); /* flush pagevec */ |
711 | 711 | ||
712 | down_write(¤t->mm->mmap_sem); | ||
713 | len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); | 712 | len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); |
714 | start &= PAGE_MASK; | 713 | start &= PAGE_MASK; |
715 | 714 | ||
716 | locked = len >> PAGE_SHIFT; | ||
717 | locked += current->mm->locked_vm; | ||
718 | |||
719 | lock_limit = rlimit(RLIMIT_MEMLOCK); | 715 | lock_limit = rlimit(RLIMIT_MEMLOCK); |
720 | lock_limit >>= PAGE_SHIFT; | 716 | lock_limit >>= PAGE_SHIFT; |
717 | locked = len >> PAGE_SHIFT; | ||
718 | |||
719 | down_write(¤t->mm->mmap_sem); | ||
720 | |||
721 | locked += current->mm->locked_vm; | ||
721 | 722 | ||
722 | /* check against resource limits */ | 723 | /* check against resource limits */ |
723 | if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) | 724 | if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) |
724 | error = do_mlock(start, len, 1); | 725 | error = do_mlock(start, len, 1); |
726 | |||
725 | up_write(¤t->mm->mmap_sem); | 727 | up_write(¤t->mm->mmap_sem); |
726 | if (!error) | 728 | if (!error) |
727 | error = __mm_populate(start, len, 0); | 729 | error = __mm_populate(start, len, 0); |
@@ -732,11 +734,13 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) | |||
732 | { | 734 | { |
733 | int ret; | 735 | int ret; |
734 | 736 | ||
735 | down_write(¤t->mm->mmap_sem); | ||
736 | len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); | 737 | len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); |
737 | start &= PAGE_MASK; | 738 | start &= PAGE_MASK; |
739 | |||
740 | down_write(¤t->mm->mmap_sem); | ||
738 | ret = do_mlock(start, len, 0); | 741 | ret = do_mlock(start, len, 0); |
739 | up_write(¤t->mm->mmap_sem); | 742 | up_write(¤t->mm->mmap_sem); |
743 | |||
740 | return ret; | 744 | return ret; |
741 | } | 745 | } |
742 | 746 | ||
@@ -781,12 +785,12 @@ SYSCALL_DEFINE1(mlockall, int, flags) | |||
781 | if (flags & MCL_CURRENT) | 785 | if (flags & MCL_CURRENT) |
782 | lru_add_drain_all(); /* flush pagevec */ | 786 | lru_add_drain_all(); /* flush pagevec */ |
783 | 787 | ||
784 | down_write(¤t->mm->mmap_sem); | ||
785 | |||
786 | lock_limit = rlimit(RLIMIT_MEMLOCK); | 788 | lock_limit = rlimit(RLIMIT_MEMLOCK); |
787 | lock_limit >>= PAGE_SHIFT; | 789 | lock_limit >>= PAGE_SHIFT; |
788 | 790 | ||
789 | ret = -ENOMEM; | 791 | ret = -ENOMEM; |
792 | down_write(¤t->mm->mmap_sem); | ||
793 | |||
790 | if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) || | 794 | if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) || |
791 | capable(CAP_IPC_LOCK)) | 795 | capable(CAP_IPC_LOCK)) |
792 | ret = do_mlockall(flags); | 796 | ret = do_mlockall(flags); |
@@ -86,6 +86,7 @@ EXPORT_SYMBOL(vm_get_page_prot); | |||
86 | 86 | ||
87 | int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */ | 87 | int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */ |
88 | int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */ | 88 | int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */ |
89 | unsigned long sysctl_overcommit_kbytes __read_mostly; | ||
89 | int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; | 90 | int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; |
90 | unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ | 91 | unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ |
91 | unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ | 92 | unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ |
@@ -1190,6 +1191,24 @@ static inline unsigned long round_hint_to_min(unsigned long hint) | |||
1190 | return hint; | 1191 | return hint; |
1191 | } | 1192 | } |
1192 | 1193 | ||
1194 | static inline int mlock_future_check(struct mm_struct *mm, | ||
1195 | unsigned long flags, | ||
1196 | unsigned long len) | ||
1197 | { | ||
1198 | unsigned long locked, lock_limit; | ||
1199 | |||
1200 | /* mlock MCL_FUTURE? */ | ||
1201 | if (flags & VM_LOCKED) { | ||
1202 | locked = len >> PAGE_SHIFT; | ||
1203 | locked += mm->locked_vm; | ||
1204 | lock_limit = rlimit(RLIMIT_MEMLOCK); | ||
1205 | lock_limit >>= PAGE_SHIFT; | ||
1206 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) | ||
1207 | return -EAGAIN; | ||
1208 | } | ||
1209 | return 0; | ||
1210 | } | ||
1211 | |||
1193 | /* | 1212 | /* |
1194 | * The caller must hold down_write(¤t->mm->mmap_sem). | 1213 | * The caller must hold down_write(¤t->mm->mmap_sem). |
1195 | */ | 1214 | */ |
@@ -1251,16 +1270,8 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, | |||
1251 | if (!can_do_mlock()) | 1270 | if (!can_do_mlock()) |
1252 | return -EPERM; | 1271 | return -EPERM; |
1253 | 1272 | ||
1254 | /* mlock MCL_FUTURE? */ | 1273 | if (mlock_future_check(mm, vm_flags, len)) |
1255 | if (vm_flags & VM_LOCKED) { | 1274 | return -EAGAIN; |
1256 | unsigned long locked, lock_limit; | ||
1257 | locked = len >> PAGE_SHIFT; | ||
1258 | locked += mm->locked_vm; | ||
1259 | lock_limit = rlimit(RLIMIT_MEMLOCK); | ||
1260 | lock_limit >>= PAGE_SHIFT; | ||
1261 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) | ||
1262 | return -EAGAIN; | ||
1263 | } | ||
1264 | 1275 | ||
1265 | if (file) { | 1276 | if (file) { |
1266 | struct inode *inode = file_inode(file); | 1277 | struct inode *inode = file_inode(file); |
@@ -2591,18 +2602,9 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) | |||
2591 | if (error & ~PAGE_MASK) | 2602 | if (error & ~PAGE_MASK) |
2592 | return error; | 2603 | return error; |
2593 | 2604 | ||
2594 | /* | 2605 | error = mlock_future_check(mm, mm->def_flags, len); |
2595 | * mlock MCL_FUTURE? | 2606 | if (error) |
2596 | */ | 2607 | return error; |
2597 | if (mm->def_flags & VM_LOCKED) { | ||
2598 | unsigned long locked, lock_limit; | ||
2599 | locked = len >> PAGE_SHIFT; | ||
2600 | locked += mm->locked_vm; | ||
2601 | lock_limit = rlimit(RLIMIT_MEMLOCK); | ||
2602 | lock_limit >>= PAGE_SHIFT; | ||
2603 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) | ||
2604 | return -EAGAIN; | ||
2605 | } | ||
2606 | 2608 | ||
2607 | /* | 2609 | /* |
2608 | * mm->mmap_sem is required to protect against another thread | 2610 | * mm->mmap_sem is required to protect against another thread |
diff --git a/mm/mprotect.c b/mm/mprotect.c index bb53a6591aea..7332c1785744 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
@@ -23,6 +23,7 @@ | |||
23 | #include <linux/mmu_notifier.h> | 23 | #include <linux/mmu_notifier.h> |
24 | #include <linux/migrate.h> | 24 | #include <linux/migrate.h> |
25 | #include <linux/perf_event.h> | 25 | #include <linux/perf_event.h> |
26 | #include <linux/ksm.h> | ||
26 | #include <asm/uaccess.h> | 27 | #include <asm/uaccess.h> |
27 | #include <asm/pgtable.h> | 28 | #include <asm/pgtable.h> |
28 | #include <asm/cacheflush.h> | 29 | #include <asm/cacheflush.h> |
@@ -63,7 +64,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
63 | 64 | ||
64 | ptent = *pte; | 65 | ptent = *pte; |
65 | page = vm_normal_page(vma, addr, oldpte); | 66 | page = vm_normal_page(vma, addr, oldpte); |
66 | if (page) { | 67 | if (page && !PageKsm(page)) { |
67 | if (!pte_numa(oldpte)) { | 68 | if (!pte_numa(oldpte)) { |
68 | ptent = pte_mknuma(ptent); | 69 | ptent = pte_mknuma(ptent); |
69 | set_pte_at(mm, addr, pte, ptent); | 70 | set_pte_at(mm, addr, pte, ptent); |
diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 2c254d374655..19121ceb8874 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c | |||
@@ -41,7 +41,7 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, | |||
41 | if (limit > memblock.current_limit) | 41 | if (limit > memblock.current_limit) |
42 | limit = memblock.current_limit; | 42 | limit = memblock.current_limit; |
43 | 43 | ||
44 | addr = memblock_find_in_range_node(goal, limit, size, align, nid); | 44 | addr = memblock_find_in_range_node(size, align, goal, limit, nid); |
45 | if (!addr) | 45 | if (!addr) |
46 | return NULL; | 46 | return NULL; |
47 | 47 | ||
@@ -117,7 +117,7 @@ static unsigned long __init free_low_memory_core_early(void) | |||
117 | phys_addr_t start, end, size; | 117 | phys_addr_t start, end, size; |
118 | u64 i; | 118 | u64 i; |
119 | 119 | ||
120 | for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) | 120 | for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL) |
121 | count += __free_memory_core(start, end); | 121 | count += __free_memory_core(start, end); |
122 | 122 | ||
123 | /* free range that is used for reserved array if we allocate it */ | 123 | /* free range that is used for reserved array if we allocate it */ |
@@ -161,7 +161,7 @@ unsigned long __init free_all_bootmem(void) | |||
161 | reset_all_zones_managed_pages(); | 161 | reset_all_zones_managed_pages(); |
162 | 162 | ||
163 | /* | 163 | /* |
164 | * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id | 164 | * We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id |
165 | * because in some case like Node0 doesn't have RAM installed | 165 | * because in some case like Node0 doesn't have RAM installed |
166 | * low ram will be on Node1 | 166 | * low ram will be on Node1 |
167 | */ | 167 | */ |
@@ -215,7 +215,7 @@ static void * __init ___alloc_bootmem_nopanic(unsigned long size, | |||
215 | 215 | ||
216 | restart: | 216 | restart: |
217 | 217 | ||
218 | ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit); | 218 | ptr = __alloc_memory_core_early(NUMA_NO_NODE, size, align, goal, limit); |
219 | 219 | ||
220 | if (ptr) | 220 | if (ptr) |
221 | return ptr; | 221 | return ptr; |
@@ -299,7 +299,7 @@ again: | |||
299 | if (ptr) | 299 | if (ptr) |
300 | return ptr; | 300 | return ptr; |
301 | 301 | ||
302 | ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, | 302 | ptr = __alloc_memory_core_early(NUMA_NO_NODE, size, align, |
303 | goal, limit); | 303 | goal, limit); |
304 | if (ptr) | 304 | if (ptr) |
305 | return ptr; | 305 | return ptr; |
diff --git a/mm/nommu.c b/mm/nommu.c index fec093adad9c..8740213b1647 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -60,6 +60,7 @@ unsigned long highest_memmap_pfn; | |||
60 | struct percpu_counter vm_committed_as; | 60 | struct percpu_counter vm_committed_as; |
61 | int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ | 61 | int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ |
62 | int sysctl_overcommit_ratio = 50; /* default is 50% */ | 62 | int sysctl_overcommit_ratio = 50; /* default is 50% */ |
63 | unsigned long sysctl_overcommit_kbytes __read_mostly; | ||
63 | int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; | 64 | int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; |
64 | int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; | 65 | int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; |
65 | unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ | 66 | unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 1e4a600a6163..054ff47c4478 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -47,19 +47,21 @@ static DEFINE_SPINLOCK(zone_scan_lock); | |||
47 | #ifdef CONFIG_NUMA | 47 | #ifdef CONFIG_NUMA |
48 | /** | 48 | /** |
49 | * has_intersects_mems_allowed() - check task eligibility for kill | 49 | * has_intersects_mems_allowed() - check task eligibility for kill |
50 | * @tsk: task struct of which task to consider | 50 | * @start: task struct of which task to consider |
51 | * @mask: nodemask passed to page allocator for mempolicy ooms | 51 | * @mask: nodemask passed to page allocator for mempolicy ooms |
52 | * | 52 | * |
53 | * Task eligibility is determined by whether or not a candidate task, @tsk, | 53 | * Task eligibility is determined by whether or not a candidate task, @tsk, |
54 | * shares the same mempolicy nodes as current if it is bound by such a policy | 54 | * shares the same mempolicy nodes as current if it is bound by such a policy |
55 | * and whether or not it has the same set of allowed cpuset nodes. | 55 | * and whether or not it has the same set of allowed cpuset nodes. |
56 | */ | 56 | */ |
57 | static bool has_intersects_mems_allowed(struct task_struct *tsk, | 57 | static bool has_intersects_mems_allowed(struct task_struct *start, |
58 | const nodemask_t *mask) | 58 | const nodemask_t *mask) |
59 | { | 59 | { |
60 | struct task_struct *start = tsk; | 60 | struct task_struct *tsk; |
61 | bool ret = false; | ||
61 | 62 | ||
62 | do { | 63 | rcu_read_lock(); |
64 | for_each_thread(start, tsk) { | ||
63 | if (mask) { | 65 | if (mask) { |
64 | /* | 66 | /* |
65 | * If this is a mempolicy constrained oom, tsk's | 67 | * If this is a mempolicy constrained oom, tsk's |
@@ -67,19 +69,20 @@ static bool has_intersects_mems_allowed(struct task_struct *tsk, | |||
67 | * mempolicy intersects current, otherwise it may be | 69 | * mempolicy intersects current, otherwise it may be |
68 | * needlessly killed. | 70 | * needlessly killed. |
69 | */ | 71 | */ |
70 | if (mempolicy_nodemask_intersects(tsk, mask)) | 72 | ret = mempolicy_nodemask_intersects(tsk, mask); |
71 | return true; | ||
72 | } else { | 73 | } else { |
73 | /* | 74 | /* |
74 | * This is not a mempolicy constrained oom, so only | 75 | * This is not a mempolicy constrained oom, so only |
75 | * check the mems of tsk's cpuset. | 76 | * check the mems of tsk's cpuset. |
76 | */ | 77 | */ |
77 | if (cpuset_mems_allowed_intersects(current, tsk)) | 78 | ret = cpuset_mems_allowed_intersects(current, tsk); |
78 | return true; | ||
79 | } | 79 | } |
80 | } while_each_thread(start, tsk); | 80 | if (ret) |
81 | break; | ||
82 | } | ||
83 | rcu_read_unlock(); | ||
81 | 84 | ||
82 | return false; | 85 | return ret; |
83 | } | 86 | } |
84 | #else | 87 | #else |
85 | static bool has_intersects_mems_allowed(struct task_struct *tsk, | 88 | static bool has_intersects_mems_allowed(struct task_struct *tsk, |
@@ -97,16 +100,21 @@ static bool has_intersects_mems_allowed(struct task_struct *tsk, | |||
97 | */ | 100 | */ |
98 | struct task_struct *find_lock_task_mm(struct task_struct *p) | 101 | struct task_struct *find_lock_task_mm(struct task_struct *p) |
99 | { | 102 | { |
100 | struct task_struct *t = p; | 103 | struct task_struct *t; |
101 | 104 | ||
102 | do { | 105 | rcu_read_lock(); |
106 | |||
107 | for_each_thread(p, t) { | ||
103 | task_lock(t); | 108 | task_lock(t); |
104 | if (likely(t->mm)) | 109 | if (likely(t->mm)) |
105 | return t; | 110 | goto found; |
106 | task_unlock(t); | 111 | task_unlock(t); |
107 | } while_each_thread(p, t); | 112 | } |
113 | t = NULL; | ||
114 | found: | ||
115 | rcu_read_unlock(); | ||
108 | 116 | ||
109 | return NULL; | 117 | return t; |
110 | } | 118 | } |
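find_lock_task_mm() above shows the conversion pattern used across this file: the old do { } while_each_thread() loops become for_each_thread() walks under rcu_read_lock(). A sketch of the idiom with a hypothetical predicate; note that anything returned out of the RCU section needs its own reference.

    static struct task_struct *example_find_thread(struct task_struct *group)
    {
            struct task_struct *t, *found = NULL;

            rcu_read_lock();
            for_each_thread(group, t) {
                    if (example_predicate(t)) {     /* hypothetical test */
                            get_task_struct(t);     /* pin before leaving RCU */
                            found = t;
                            break;
                    }
            }
            rcu_read_unlock();

            return found;                           /* caller does put_task_struct() */
    }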
111 | 119 | ||
112 | /* return true if the task is not adequate as candidate victim task. */ | 120 | /* return true if the task is not adequate as candidate victim task. */ |
@@ -301,7 +309,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, | |||
301 | unsigned long chosen_points = 0; | 309 | unsigned long chosen_points = 0; |
302 | 310 | ||
303 | rcu_read_lock(); | 311 | rcu_read_lock(); |
304 | do_each_thread(g, p) { | 312 | for_each_process_thread(g, p) { |
305 | unsigned int points; | 313 | unsigned int points; |
306 | 314 | ||
307 | switch (oom_scan_process_thread(p, totalpages, nodemask, | 315 | switch (oom_scan_process_thread(p, totalpages, nodemask, |
@@ -323,7 +331,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, | |||
323 | chosen = p; | 331 | chosen = p; |
324 | chosen_points = points; | 332 | chosen_points = points; |
325 | } | 333 | } |
326 | } while_each_thread(g, p); | 334 | } |
327 | if (chosen) | 335 | if (chosen) |
328 | get_task_struct(chosen); | 336 | get_task_struct(chosen); |
329 | rcu_read_unlock(); | 337 | rcu_read_unlock(); |
@@ -406,7 +414,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
406 | { | 414 | { |
407 | struct task_struct *victim = p; | 415 | struct task_struct *victim = p; |
408 | struct task_struct *child; | 416 | struct task_struct *child; |
409 | struct task_struct *t = p; | 417 | struct task_struct *t; |
410 | struct mm_struct *mm; | 418 | struct mm_struct *mm; |
411 | unsigned int victim_points = 0; | 419 | unsigned int victim_points = 0; |
412 | static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, | 420 | static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, |
@@ -437,7 +445,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
437 | * still freeing memory. | 445 | * still freeing memory. |
438 | */ | 446 | */ |
439 | read_lock(&tasklist_lock); | 447 | read_lock(&tasklist_lock); |
440 | do { | 448 | for_each_thread(p, t) { |
441 | list_for_each_entry(child, &t->children, sibling) { | 449 | list_for_each_entry(child, &t->children, sibling) { |
442 | unsigned int child_points; | 450 | unsigned int child_points; |
443 | 451 | ||
@@ -455,13 +463,11 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
455 | get_task_struct(victim); | 463 | get_task_struct(victim); |
456 | } | 464 | } |
457 | } | 465 | } |
458 | } while_each_thread(p, t); | 466 | } |
459 | read_unlock(&tasklist_lock); | 467 | read_unlock(&tasklist_lock); |
460 | 468 | ||
461 | rcu_read_lock(); | ||
462 | p = find_lock_task_mm(victim); | 469 | p = find_lock_task_mm(victim); |
463 | if (!p) { | 470 | if (!p) { |
464 | rcu_read_unlock(); | ||
465 | put_task_struct(victim); | 471 | put_task_struct(victim); |
466 | return; | 472 | return; |
467 | } else if (victim != p) { | 473 | } else if (victim != p) { |
@@ -487,6 +493,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
487 | * That thread will now get access to memory reserves since it has a | 493 | * That thread will now get access to memory reserves since it has a |
488 | * pending fatal signal. | 494 | * pending fatal signal. |
489 | */ | 495 | */ |
496 | rcu_read_lock(); | ||
490 | for_each_process(p) | 497 | for_each_process(p) |
491 | if (p->mm == mm && !same_thread_group(p, victim) && | 498 | if (p->mm == mm && !same_thread_group(p, victim) && |
492 | !(p->flags & PF_KTHREAD)) { | 499 | !(p->flags & PF_KTHREAD)) { |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5248fe070aa4..533e2147d14f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -2072,13 +2072,6 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) | |||
2072 | return; | 2072 | return; |
2073 | 2073 | ||
2074 | /* | 2074 | /* |
2075 | * Walking all memory to count page types is very expensive and should | ||
2076 | * be inhibited in non-blockable contexts. | ||
2077 | */ | ||
2078 | if (!(gfp_mask & __GFP_WAIT)) | ||
2079 | filter |= SHOW_MEM_FILTER_PAGE_COUNT; | ||
2080 | |||
2081 | /* | ||
2082 | * This documents exceptions given to allocations in certain | 2075 | * This documents exceptions given to allocations in certain |
2083 | * contexts that are allowed to allocate outside current's set | 2076 | * contexts that are allowed to allocate outside current's set |
2084 | * of allowed nodes. | 2077 | * of allowed nodes. |
@@ -2242,10 +2235,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2242 | preferred_zone, migratetype); | 2235 | preferred_zone, migratetype); |
2243 | if (page) { | 2236 | if (page) { |
2244 | preferred_zone->compact_blockskip_flush = false; | 2237 | preferred_zone->compact_blockskip_flush = false; |
2245 | preferred_zone->compact_considered = 0; | 2238 | compaction_defer_reset(preferred_zone, order, true); |
2246 | preferred_zone->compact_defer_shift = 0; | ||
2247 | if (order >= preferred_zone->compact_order_failed) | ||
2248 | preferred_zone->compact_order_failed = order + 1; | ||
2249 | count_vm_event(COMPACTSUCCESS); | 2239 | count_vm_event(COMPACTSUCCESS); |
2250 | return page; | 2240 | return page; |
2251 | } | 2241 | } |
@@ -2535,8 +2525,15 @@ rebalance: | |||
2535 | } | 2525 | } |
2536 | 2526 | ||
2537 | /* Atomic allocations - we can't balance anything */ | 2527 | /* Atomic allocations - we can't balance anything */ |
2538 | if (!wait) | 2528 | if (!wait) { |
2529 | /* | ||
2530 | * All existing users of the deprecated __GFP_NOFAIL are | ||
2531 | * blockable, so warn of any new users that actually allow this | ||
2532 | * type of allocation to fail. | ||
2533 | */ | ||
2534 | WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL); | ||
2539 | goto nopage; | 2535 | goto nopage; |
2536 | } | ||
2540 | 2537 | ||
2541 | /* Avoid recursion of direct reclaim */ | 2538 | /* Avoid recursion of direct reclaim */ |
2542 | if (current->flags & PF_MEMALLOC) | 2539 | if (current->flags & PF_MEMALLOC) |
@@ -3901,6 +3898,7 @@ static void setup_zone_migrate_reserve(struct zone *zone) | |||
3901 | struct page *page; | 3898 | struct page *page; |
3902 | unsigned long block_migratetype; | 3899 | unsigned long block_migratetype; |
3903 | int reserve; | 3900 | int reserve; |
3901 | int old_reserve; | ||
3904 | 3902 | ||
3905 | /* | 3903 | /* |
3906 | * Get the start pfn, end pfn and the number of blocks to reserve | 3904 | * Get the start pfn, end pfn and the number of blocks to reserve |
@@ -3922,6 +3920,12 @@ static void setup_zone_migrate_reserve(struct zone *zone) | |||
3922 | * future allocation of hugepages at runtime. | 3920 | * future allocation of hugepages at runtime. |
3923 | */ | 3921 | */ |
3924 | reserve = min(2, reserve); | 3922 | reserve = min(2, reserve); |
3923 | old_reserve = zone->nr_migrate_reserve_block; | ||
3924 | |||
3925 | /* On memory hot-add, we almost always need to do nothing */ | ||
3926 | if (reserve == old_reserve) | ||
3927 | return; | ||
3928 | zone->nr_migrate_reserve_block = reserve; | ||
3925 | 3929 | ||
3926 | for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { | 3930 | for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { |
3927 | if (!pfn_valid(pfn)) | 3931 | if (!pfn_valid(pfn)) |
@@ -3959,6 +3963,12 @@ static void setup_zone_migrate_reserve(struct zone *zone) | |||
3959 | reserve--; | 3963 | reserve--; |
3960 | continue; | 3964 | continue; |
3961 | } | 3965 | } |
3966 | } else if (!old_reserve) { | ||
3967 | /* | ||
3968 | * At boot time we don't need to scan the whole zone | ||
3969 | * for turning off MIGRATE_RESERVE. | ||
3970 | */ | ||
3971 | break; | ||
3962 | } | 3972 | } |
3963 | 3973 | ||
3964 | /* | 3974 | /* |
@@ -4209,7 +4219,6 @@ static noinline __init_refok | |||
4209 | int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) | 4219 | int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) |
4210 | { | 4220 | { |
4211 | int i; | 4221 | int i; |
4212 | struct pglist_data *pgdat = zone->zone_pgdat; | ||
4213 | size_t alloc_size; | 4222 | size_t alloc_size; |
4214 | 4223 | ||
4215 | /* | 4224 | /* |
@@ -4225,7 +4234,8 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) | |||
4225 | 4234 | ||
4226 | if (!slab_is_available()) { | 4235 | if (!slab_is_available()) { |
4227 | zone->wait_table = (wait_queue_head_t *) | 4236 | zone->wait_table = (wait_queue_head_t *) |
4228 | alloc_bootmem_node_nopanic(pgdat, alloc_size); | 4237 | memblock_virt_alloc_node_nopanic( |
4238 | alloc_size, zone->zone_pgdat->node_id); | ||
4229 | } else { | 4239 | } else { |
4230 | /* | 4240 | /* |
4231 | * This case means that a zone whose size was 0 gets new memory | 4241 | * This case means that a zone whose size was 0 gets new memory |
@@ -4345,13 +4355,14 @@ bool __meminit early_pfn_in_nid(unsigned long pfn, int node) | |||
4345 | #endif | 4355 | #endif |
4346 | 4356 | ||
4347 | /** | 4357 | /** |
4348 | * free_bootmem_with_active_regions - Call free_bootmem_node for each active range | 4358 | * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range |
4349 | * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. | 4359 | * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. |
4350 | * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node | 4360 | * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid |
4351 | * | 4361 | * |
4352 | * If an architecture guarantees that all ranges registered with | 4362 | * If an architecture guarantees that all ranges registered with |
4353 | * add_active_ranges() contain no holes and may be freed, this | 4363 | * add_active_ranges() contain no holes and may be freed, this |
4354 | * this function may be used instead of calling free_bootmem() manually. | 4364 | * this function may be used instead of calling memblock_free_early_nid() |
4365 | * manually. | ||
4355 | */ | 4366 | */ |
4356 | void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) | 4367 | void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) |
4357 | { | 4368 | { |
@@ -4363,9 +4374,9 @@ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) | |||
4363 | end_pfn = min(end_pfn, max_low_pfn); | 4374 | end_pfn = min(end_pfn, max_low_pfn); |
4364 | 4375 | ||
4365 | if (start_pfn < end_pfn) | 4376 | if (start_pfn < end_pfn) |
4366 | free_bootmem_node(NODE_DATA(this_nid), | 4377 | memblock_free_early_nid(PFN_PHYS(start_pfn), |
4367 | PFN_PHYS(start_pfn), | 4378 | (end_pfn - start_pfn) << PAGE_SHIFT, |
4368 | (end_pfn - start_pfn) << PAGE_SHIFT); | 4379 | this_nid); |
4369 | } | 4380 | } |
4370 | } | 4381 | } |
4371 | 4382 | ||
@@ -4636,8 +4647,9 @@ static void __init setup_usemap(struct pglist_data *pgdat, | |||
4636 | unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize); | 4647 | unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize); |
4637 | zone->pageblock_flags = NULL; | 4648 | zone->pageblock_flags = NULL; |
4638 | if (usemapsize) | 4649 | if (usemapsize) |
4639 | zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat, | 4650 | zone->pageblock_flags = |
4640 | usemapsize); | 4651 | memblock_virt_alloc_node_nopanic(usemapsize, |
4652 | pgdat->node_id); | ||
4641 | } | 4653 | } |
4642 | #else | 4654 | #else |
4643 | static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, | 4655 | static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, |
@@ -4831,7 +4843,8 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) | |||
4831 | size = (end - start) * sizeof(struct page); | 4843 | size = (end - start) * sizeof(struct page); |
4832 | map = alloc_remap(pgdat->node_id, size); | 4844 | map = alloc_remap(pgdat->node_id, size); |
4833 | if (!map) | 4845 | if (!map) |
4834 | map = alloc_bootmem_node_nopanic(pgdat, size); | 4846 | map = memblock_virt_alloc_node_nopanic(size, |
4847 | pgdat->node_id); | ||
4835 | pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); | 4848 | pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); |
4836 | } | 4849 | } |
4837 | #ifndef CONFIG_NEED_MULTIPLE_NODES | 4850 | #ifndef CONFIG_NEED_MULTIPLE_NODES |
@@ -5012,9 +5025,33 @@ static void __init find_zone_movable_pfns_for_nodes(void) | |||
5012 | nodemask_t saved_node_state = node_states[N_MEMORY]; | 5025 | nodemask_t saved_node_state = node_states[N_MEMORY]; |
5013 | unsigned long totalpages = early_calculate_totalpages(); | 5026 | unsigned long totalpages = early_calculate_totalpages(); |
5014 | int usable_nodes = nodes_weight(node_states[N_MEMORY]); | 5027 | int usable_nodes = nodes_weight(node_states[N_MEMORY]); |
5028 | struct memblock_type *type = &memblock.memory; | ||
5029 | |||
5030 | /* Need to find movable_zone earlier when movable_node is specified. */ | ||
5031 | find_usable_zone_for_movable(); | ||
5032 | |||
5033 | /* | ||
5034 | * If movable_node is specified, ignore kernelcore and movablecore | ||
5035 | * options. | ||
5036 | */ | ||
5037 | if (movable_node_is_enabled()) { | ||
5038 | for (i = 0; i < type->cnt; i++) { | ||
5039 | if (!memblock_is_hotpluggable(&type->regions[i])) | ||
5040 | continue; | ||
5041 | |||
5042 | nid = type->regions[i].nid; | ||
5043 | |||
5044 | usable_startpfn = PFN_DOWN(type->regions[i].base); | ||
5045 | zone_movable_pfn[nid] = zone_movable_pfn[nid] ? | ||
5046 | min(usable_startpfn, zone_movable_pfn[nid]) : | ||
5047 | usable_startpfn; | ||
5048 | } | ||
5049 | |||
5050 | goto out2; | ||
5051 | } | ||
5015 | 5052 | ||
5016 | /* | 5053 | /* |
5017 | * If movablecore was specified, calculate what size of | 5054 | * If movablecore=nn[KMG] was specified, calculate what size of |
5018 | * kernelcore that corresponds so that memory usable for | 5055 | * kernelcore that corresponds so that memory usable for |
5019 | * any allocation type is evenly spread. If both kernelcore | 5056 | * any allocation type is evenly spread. If both kernelcore |
5020 | * and movablecore are specified, then the value of kernelcore | 5057 | * and movablecore are specified, then the value of kernelcore |
@@ -5040,7 +5077,6 @@ static void __init find_zone_movable_pfns_for_nodes(void) | |||
5040 | goto out; | 5077 | goto out; |
5041 | 5078 | ||
5042 | /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ | 5079 | /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ |
5043 | find_usable_zone_for_movable(); | ||
5044 | usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; | 5080 | usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; |
5045 | 5081 | ||
5046 | restart: | 5082 | restart: |
@@ -5131,6 +5167,7 @@ restart: | |||
5131 | if (usable_nodes && required_kernelcore > usable_nodes) | 5167 | if (usable_nodes && required_kernelcore > usable_nodes) |
5132 | goto restart; | 5168 | goto restart; |
5133 | 5169 | ||
5170 | out2: | ||
5134 | /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ | 5171 | /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ |
5135 | for (nid = 0; nid < MAX_NUMNODES; nid++) | 5172 | for (nid = 0; nid < MAX_NUMNODES; nid++) |
5136 | zone_movable_pfn[nid] = | 5173 | zone_movable_pfn[nid] = |
@@ -5857,7 +5894,7 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
5857 | do { | 5894 | do { |
5858 | size = bucketsize << log2qty; | 5895 | size = bucketsize << log2qty; |
5859 | if (flags & HASH_EARLY) | 5896 | if (flags & HASH_EARLY) |
5860 | table = alloc_bootmem_nopanic(size); | 5897 | table = memblock_virt_alloc_nopanic(size, 0); |
5861 | else if (hashdist) | 5898 | else if (hashdist) |
5862 | table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); | 5899 | table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); |
5863 | else { | 5900 | else { |
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 3bd0b8e6ab12..cfd162882c00 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c | |||
@@ -54,8 +54,9 @@ static int __init alloc_node_page_cgroup(int nid) | |||
54 | 54 | ||
55 | table_size = sizeof(struct page_cgroup) * nr_pages; | 55 | table_size = sizeof(struct page_cgroup) * nr_pages; |
56 | 56 | ||
57 | base = __alloc_bootmem_node_nopanic(NODE_DATA(nid), | 57 | base = memblock_virt_alloc_try_nid_nopanic( |
58 | table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); | 58 | table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), |
59 | BOOTMEM_ALLOC_ACCESSIBLE, nid); | ||
59 | if (!base) | 60 | if (!base) |
60 | return -ENOMEM; | 61 | return -ENOMEM; |
61 | NODE_DATA(nid)->node_page_cgroup = base; | 62 | NODE_DATA(nid)->node_page_cgroup = base; |
diff --git a/mm/percpu.c b/mm/percpu.c index afbf352ae580..036cfe07050f 100644 --- a/mm/percpu.c +++ b/mm/percpu.c | |||
@@ -1063,7 +1063,7 @@ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, | |||
1063 | __alignof__(ai->groups[0].cpu_map[0])); | 1063 | __alignof__(ai->groups[0].cpu_map[0])); |
1064 | ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]); | 1064 | ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]); |
1065 | 1065 | ||
1066 | ptr = alloc_bootmem_nopanic(PFN_ALIGN(ai_size)); | 1066 | ptr = memblock_virt_alloc_nopanic(PFN_ALIGN(ai_size), 0); |
1067 | if (!ptr) | 1067 | if (!ptr) |
1068 | return NULL; | 1068 | return NULL; |
1069 | ai = ptr; | 1069 | ai = ptr; |
@@ -1088,7 +1088,7 @@ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, | |||
1088 | */ | 1088 | */ |
1089 | void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai) | 1089 | void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai) |
1090 | { | 1090 | { |
1091 | free_bootmem(__pa(ai), ai->__ai_size); | 1091 | memblock_free_early(__pa(ai), ai->__ai_size); |
1092 | } | 1092 | } |
1093 | 1093 | ||
1094 | /** | 1094 | /** |
@@ -1246,10 +1246,12 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, | |||
1246 | PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0); | 1246 | PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0); |
1247 | 1247 | ||
1248 | /* process group information and build config tables accordingly */ | 1248 | /* process group information and build config tables accordingly */ |
1249 | group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0])); | 1249 | group_offsets = memblock_virt_alloc(ai->nr_groups * |
1250 | group_sizes = alloc_bootmem(ai->nr_groups * sizeof(group_sizes[0])); | 1250 | sizeof(group_offsets[0]), 0); |
1251 | unit_map = alloc_bootmem(nr_cpu_ids * sizeof(unit_map[0])); | 1251 | group_sizes = memblock_virt_alloc(ai->nr_groups * |
1252 | unit_off = alloc_bootmem(nr_cpu_ids * sizeof(unit_off[0])); | 1252 | sizeof(group_sizes[0]), 0); |
1253 | unit_map = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_map[0]), 0); | ||
1254 | unit_off = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_off[0]), 0); | ||
1253 | 1255 | ||
1254 | for (cpu = 0; cpu < nr_cpu_ids; cpu++) | 1256 | for (cpu = 0; cpu < nr_cpu_ids; cpu++) |
1255 | unit_map[cpu] = UINT_MAX; | 1257 | unit_map[cpu] = UINT_MAX; |
@@ -1311,7 +1313,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, | |||
1311 | * empty chunks. | 1313 | * empty chunks. |
1312 | */ | 1314 | */ |
1313 | pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2; | 1315 | pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2; |
1314 | pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0])); | 1316 | pcpu_slot = memblock_virt_alloc( |
1317 | pcpu_nr_slots * sizeof(pcpu_slot[0]), 0); | ||
1315 | for (i = 0; i < pcpu_nr_slots; i++) | 1318 | for (i = 0; i < pcpu_nr_slots; i++) |
1316 | INIT_LIST_HEAD(&pcpu_slot[i]); | 1319 | INIT_LIST_HEAD(&pcpu_slot[i]); |
1317 | 1320 | ||
@@ -1322,7 +1325,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, | |||
1322 | * covers static area + reserved area (mostly used for module | 1325 | * covers static area + reserved area (mostly used for module |
1323 | * static percpu allocation). | 1326 | * static percpu allocation). |
1324 | */ | 1327 | */ |
1325 | schunk = alloc_bootmem(pcpu_chunk_struct_size); | 1328 | schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); |
1326 | INIT_LIST_HEAD(&schunk->list); | 1329 | INIT_LIST_HEAD(&schunk->list); |
1327 | schunk->base_addr = base_addr; | 1330 | schunk->base_addr = base_addr; |
1328 | schunk->map = smap; | 1331 | schunk->map = smap; |
@@ -1346,7 +1349,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, | |||
1346 | 1349 | ||
1347 | /* init dynamic chunk if necessary */ | 1350 | /* init dynamic chunk if necessary */ |
1348 | if (dyn_size) { | 1351 | if (dyn_size) { |
1349 | dchunk = alloc_bootmem(pcpu_chunk_struct_size); | 1352 | dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); |
1350 | INIT_LIST_HEAD(&dchunk->list); | 1353 | INIT_LIST_HEAD(&dchunk->list); |
1351 | dchunk->base_addr = base_addr; | 1354 | dchunk->base_addr = base_addr; |
1352 | dchunk->map = dmap; | 1355 | dchunk->map = dmap; |
@@ -1626,7 +1629,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, | |||
1626 | size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; | 1629 | size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; |
1627 | areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *)); | 1630 | areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *)); |
1628 | 1631 | ||
1629 | areas = alloc_bootmem_nopanic(areas_size); | 1632 | areas = memblock_virt_alloc_nopanic(areas_size, 0); |
1630 | if (!areas) { | 1633 | if (!areas) { |
1631 | rc = -ENOMEM; | 1634 | rc = -ENOMEM; |
1632 | goto out_free; | 1635 | goto out_free; |
@@ -1712,7 +1715,7 @@ out_free_areas: | |||
1712 | out_free: | 1715 | out_free: |
1713 | pcpu_free_alloc_info(ai); | 1716 | pcpu_free_alloc_info(ai); |
1714 | if (areas) | 1717 | if (areas) |
1715 | free_bootmem(__pa(areas), areas_size); | 1718 | memblock_free_early(__pa(areas), areas_size); |
1716 | return rc; | 1719 | return rc; |
1717 | } | 1720 | } |
1718 | #endif /* BUILD_EMBED_FIRST_CHUNK */ | 1721 | #endif /* BUILD_EMBED_FIRST_CHUNK */ |
@@ -1760,7 +1763,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, | |||
1760 | /* unaligned allocations can't be freed, round up to page size */ | 1763 | /* unaligned allocations can't be freed, round up to page size */ |
1761 | pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() * | 1764 | pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() * |
1762 | sizeof(pages[0])); | 1765 | sizeof(pages[0])); |
1763 | pages = alloc_bootmem(pages_size); | 1766 | pages = memblock_virt_alloc(pages_size, 0); |
1764 | 1767 | ||
1765 | /* allocate pages */ | 1768 | /* allocate pages */ |
1766 | j = 0; | 1769 | j = 0; |
@@ -1823,7 +1826,7 @@ enomem: | |||
1823 | free_fn(page_address(pages[j]), PAGE_SIZE); | 1826 | free_fn(page_address(pages[j]), PAGE_SIZE); |
1824 | rc = -ENOMEM; | 1827 | rc = -ENOMEM; |
1825 | out_free_ar: | 1828 | out_free_ar: |
1826 | free_bootmem(__pa(pages), pages_size); | 1829 | memblock_free_early(__pa(pages), pages_size); |
1827 | pcpu_free_alloc_info(ai); | 1830 | pcpu_free_alloc_info(ai); |
1828 | return rc; | 1831 | return rc; |
1829 | } | 1832 | } |
@@ -1848,12 +1851,13 @@ EXPORT_SYMBOL(__per_cpu_offset); | |||
1848 | static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size, | 1851 | static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size, |
1849 | size_t align) | 1852 | size_t align) |
1850 | { | 1853 | { |
1851 | return __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS)); | 1854 | return memblock_virt_alloc_from_nopanic( |
1855 | size, align, __pa(MAX_DMA_ADDRESS)); | ||
1852 | } | 1856 | } |
1853 | 1857 | ||
1854 | static void __init pcpu_dfl_fc_free(void *ptr, size_t size) | 1858 | static void __init pcpu_dfl_fc_free(void *ptr, size_t size) |
1855 | { | 1859 | { |
1856 | free_bootmem(__pa(ptr), size); | 1860 | memblock_free_early(__pa(ptr), size); |
1857 | } | 1861 | } |
1858 | 1862 | ||
1859 | void __init setup_per_cpu_areas(void) | 1863 | void __init setup_per_cpu_areas(void) |
@@ -1896,7 +1900,9 @@ void __init setup_per_cpu_areas(void) | |||
1896 | void *fc; | 1900 | void *fc; |
1897 | 1901 | ||
1898 | ai = pcpu_alloc_alloc_info(1, 1); | 1902 | ai = pcpu_alloc_alloc_info(1, 1); |
1899 | fc = __alloc_bootmem(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); | 1903 | fc = memblock_virt_alloc_from_nopanic(unit_size, |
1904 | PAGE_SIZE, | ||
1905 | __pa(MAX_DMA_ADDRESS)); | ||
1900 | if (!ai || !fc) | 1906 | if (!ai || !fc) |
1901 | panic("Failed to allocate memory for percpu areas."); | 1907 | panic("Failed to allocate memory for percpu areas."); |
1902 | /* kmemleak tracks the percpu allocations separately */ | 1908 | /* kmemleak tracks the percpu allocations separately */ |
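For quick orientation, the conversions in the percpu.c hunks above (and in the page_cgroup.c hunk before them) follow the substitutions summarised below; this records only what these hunks themselves show and is not a complete description of the memblock interface.

/*
 * Substitutions visible in the hunks above (old bootmem call on the left,
 * memblock replacement on the right):
 *
 *   alloc_bootmem(size)                        -> memblock_virt_alloc(size, 0)
 *   alloc_bootmem_nopanic(size)                -> memblock_virt_alloc_nopanic(size, 0)
 *   __alloc_bootmem_nopanic(size, align, goal) -> memblock_virt_alloc_from_nopanic(size, align, goal)
 *   __alloc_bootmem_node_nopanic(NODE_DATA(nid), size, align, goal)
 *                                              -> memblock_virt_alloc_try_nid_nopanic(size, align, goal,
 *                                                         BOOTMEM_ALLOC_ACCESSIBLE, nid)
 *   free_bootmem(__pa(ptr), size)              -> memblock_free_early(__pa(ptr), size)
 */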
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
@@ -660,17 +660,22 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) | |||
660 | return 1; | 660 | return 1; |
661 | } | 661 | } |
662 | 662 | ||
663 | struct page_referenced_arg { | ||
664 | int mapcount; | ||
665 | int referenced; | ||
666 | unsigned long vm_flags; | ||
667 | struct mem_cgroup *memcg; | ||
668 | }; | ||
663 | /* | 669 | /* |
664 | * Subfunctions of page_referenced: page_referenced_one called | 670 | * arg: page_referenced_arg will be passed |
665 | * repeatedly from either page_referenced_anon or page_referenced_file. | ||
666 | */ | 671 | */ |
667 | int page_referenced_one(struct page *page, struct vm_area_struct *vma, | 672 | int page_referenced_one(struct page *page, struct vm_area_struct *vma, |
668 | unsigned long address, unsigned int *mapcount, | 673 | unsigned long address, void *arg) |
669 | unsigned long *vm_flags) | ||
670 | { | 674 | { |
671 | struct mm_struct *mm = vma->vm_mm; | 675 | struct mm_struct *mm = vma->vm_mm; |
672 | spinlock_t *ptl; | 676 | spinlock_t *ptl; |
673 | int referenced = 0; | 677 | int referenced = 0; |
678 | struct page_referenced_arg *pra = arg; | ||
674 | 679 | ||
675 | if (unlikely(PageTransHuge(page))) { | 680 | if (unlikely(PageTransHuge(page))) { |
676 | pmd_t *pmd; | 681 | pmd_t *pmd; |
@@ -682,13 +687,12 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, | |||
682 | pmd = page_check_address_pmd(page, mm, address, | 687 | pmd = page_check_address_pmd(page, mm, address, |
683 | PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl); | 688 | PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl); |
684 | if (!pmd) | 689 | if (!pmd) |
685 | goto out; | 690 | return SWAP_AGAIN; |
686 | 691 | ||
687 | if (vma->vm_flags & VM_LOCKED) { | 692 | if (vma->vm_flags & VM_LOCKED) { |
688 | spin_unlock(ptl); | 693 | spin_unlock(ptl); |
689 | *mapcount = 0; /* break early from loop */ | 694 | pra->vm_flags |= VM_LOCKED; |
690 | *vm_flags |= VM_LOCKED; | 695 | return SWAP_FAIL; /* To break the loop */ |
691 | goto out; | ||
692 | } | 696 | } |
693 | 697 | ||
694 | /* go ahead even if the pmd is pmd_trans_splitting() */ | 698 | /* go ahead even if the pmd is pmd_trans_splitting() */ |
@@ -704,13 +708,12 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, | |||
704 | */ | 708 | */ |
705 | pte = page_check_address(page, mm, address, &ptl, 0); | 709 | pte = page_check_address(page, mm, address, &ptl, 0); |
706 | if (!pte) | 710 | if (!pte) |
707 | goto out; | 711 | return SWAP_AGAIN; |
708 | 712 | ||
709 | if (vma->vm_flags & VM_LOCKED) { | 713 | if (vma->vm_flags & VM_LOCKED) { |
710 | pte_unmap_unlock(pte, ptl); | 714 | pte_unmap_unlock(pte, ptl); |
711 | *mapcount = 0; /* break early from loop */ | 715 | pra->vm_flags |= VM_LOCKED; |
712 | *vm_flags |= VM_LOCKED; | 716 | return SWAP_FAIL; /* To break the loop */ |
713 | goto out; | ||
714 | } | 717 | } |
715 | 718 | ||
716 | if (ptep_clear_flush_young_notify(vma, address, pte)) { | 719 | if (ptep_clear_flush_young_notify(vma, address, pte)) { |
@@ -727,113 +730,27 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, | |||
727 | pte_unmap_unlock(pte, ptl); | 730 | pte_unmap_unlock(pte, ptl); |
728 | } | 731 | } |
729 | 732 | ||
730 | (*mapcount)--; | 733 | if (referenced) { |
731 | 734 | pra->referenced++; | |
732 | if (referenced) | 735 | pra->vm_flags |= vma->vm_flags; |
733 | *vm_flags |= vma->vm_flags; | ||
734 | out: | ||
735 | return referenced; | ||
736 | } | ||
737 | |||
738 | static int page_referenced_anon(struct page *page, | ||
739 | struct mem_cgroup *memcg, | ||
740 | unsigned long *vm_flags) | ||
741 | { | ||
742 | unsigned int mapcount; | ||
743 | struct anon_vma *anon_vma; | ||
744 | pgoff_t pgoff; | ||
745 | struct anon_vma_chain *avc; | ||
746 | int referenced = 0; | ||
747 | |||
748 | anon_vma = page_lock_anon_vma_read(page); | ||
749 | if (!anon_vma) | ||
750 | return referenced; | ||
751 | |||
752 | mapcount = page_mapcount(page); | ||
753 | pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
754 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { | ||
755 | struct vm_area_struct *vma = avc->vma; | ||
756 | unsigned long address = vma_address(page, vma); | ||
757 | /* | ||
758 | * If we are reclaiming on behalf of a cgroup, skip | ||
759 | * counting on behalf of references from different | ||
760 | * cgroups | ||
761 | */ | ||
762 | if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) | ||
763 | continue; | ||
764 | referenced += page_referenced_one(page, vma, address, | ||
765 | &mapcount, vm_flags); | ||
766 | if (!mapcount) | ||
767 | break; | ||
768 | } | 736 | } |
769 | 737 | ||
770 | page_unlock_anon_vma_read(anon_vma); | 738 | pra->mapcount--; |
771 | return referenced; | 739 | if (!pra->mapcount) |
740 | return SWAP_SUCCESS; /* To break the loop */ | ||
741 | |||
742 | return SWAP_AGAIN; | ||
772 | } | 743 | } |
773 | 744 | ||
774 | /** | 745 | static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg) |
775 | * page_referenced_file - referenced check for object-based rmap | ||
776 | * @page: the page we're checking references on. | ||
777 | * @memcg: target memory control group | ||
778 | * @vm_flags: collect encountered vma->vm_flags who actually referenced the page | ||
779 | * | ||
780 | * For an object-based mapped page, find all the places it is mapped and | ||
781 | * check/clear the referenced flag. This is done by following the page->mapping | ||
782 | * pointer, then walking the chain of vmas it holds. It returns the number | ||
783 | * of references it found. | ||
784 | * | ||
785 | * This function is only called from page_referenced for object-based pages. | ||
786 | */ | ||
787 | static int page_referenced_file(struct page *page, | ||
788 | struct mem_cgroup *memcg, | ||
789 | unsigned long *vm_flags) | ||
790 | { | 746 | { |
791 | unsigned int mapcount; | 747 | struct page_referenced_arg *pra = arg; |
792 | struct address_space *mapping = page->mapping; | 748 | struct mem_cgroup *memcg = pra->memcg; |
793 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
794 | struct vm_area_struct *vma; | ||
795 | int referenced = 0; | ||
796 | |||
797 | /* | ||
798 | * The caller's checks on page->mapping and !PageAnon have made | ||
799 | * sure that this is a file page: the check for page->mapping | ||
800 | * excludes the case just before it gets set on an anon page. | ||
801 | */ | ||
802 | BUG_ON(PageAnon(page)); | ||
803 | |||
804 | /* | ||
805 | * The page lock not only makes sure that page->mapping cannot | ||
806 | * suddenly be NULLified by truncation, it makes sure that the | ||
807 | * structure at mapping cannot be freed and reused yet, | ||
808 | * so we can safely take mapping->i_mmap_mutex. | ||
809 | */ | ||
810 | BUG_ON(!PageLocked(page)); | ||
811 | |||
812 | mutex_lock(&mapping->i_mmap_mutex); | ||
813 | 749 | ||
814 | /* | 750 | if (!mm_match_cgroup(vma->vm_mm, memcg)) |
815 | * i_mmap_mutex does not stabilize mapcount at all, but mapcount | 751 | return true; |
816 | * is more likely to be accurate if we note it after spinning. | ||
817 | */ | ||
818 | mapcount = page_mapcount(page); | ||
819 | |||
820 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { | ||
821 | unsigned long address = vma_address(page, vma); | ||
822 | /* | ||
823 | * If we are reclaiming on behalf of a cgroup, skip | ||
824 | * counting on behalf of references from different | ||
825 | * cgroups | ||
826 | */ | ||
827 | if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) | ||
828 | continue; | ||
829 | referenced += page_referenced_one(page, vma, address, | ||
830 | &mapcount, vm_flags); | ||
831 | if (!mapcount) | ||
832 | break; | ||
833 | } | ||
834 | 752 | ||
835 | mutex_unlock(&mapping->i_mmap_mutex); | 753 | return false; |
836 | return referenced; | ||
837 | } | 754 | } |
838 | 755 | ||
839 | /** | 756 | /** |
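The new page_referenced_arg carries the walk state through a single void *arg instead of separate mapcount/vm_flags output parameters, and the SWAP_* return value tells the walker whether to continue. A standalone model of that control flow follows; the MODEL_* names are local stand-ins for the kernel's SWAP_* codes.

#include <stdio.h>

enum { MODEL_SWAP_AGAIN, MODEL_SWAP_SUCCESS, MODEL_SWAP_FAIL };

struct model_pra {
        int mapcount;           /* mappings still to visit */
        int referenced;         /* accumulated result */
};

/* Per-"vma" callback: accumulate into the arg and decide whether to go on. */
static int model_referenced_one(int vma_referenced, void *arg)
{
        struct model_pra *pra = arg;

        if (vma_referenced)
                pra->referenced++;

        if (!--pra->mapcount)
                return MODEL_SWAP_SUCCESS;      /* nothing left to visit */
        return MODEL_SWAP_AGAIN;
}

static int model_walk(const int *vmas, int nr, void *arg)
{
        int i, ret = MODEL_SWAP_AGAIN;

        for (i = 0; i < nr; i++) {
                ret = model_referenced_one(vmas[i], arg);
                if (ret != MODEL_SWAP_AGAIN)
                        break;                  /* callback asked to stop */
        }
        return ret;
}

int main(void)
{
        int vmas[] = { 1, 0, 1 };               /* which "vmas" referenced the page */
        struct model_pra pra = { .mapcount = 3 };

        model_walk(vmas, 3, &pra);
        printf("referenced by %d of 3 mappings\n", pra.referenced);
        return 0;
}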
@@ -851,41 +768,57 @@ int page_referenced(struct page *page, | |||
851 | struct mem_cgroup *memcg, | 768 | struct mem_cgroup *memcg, |
852 | unsigned long *vm_flags) | 769 | unsigned long *vm_flags) |
853 | { | 770 | { |
854 | int referenced = 0; | 771 | int ret; |
855 | int we_locked = 0; | 772 | int we_locked = 0; |
773 | struct page_referenced_arg pra = { | ||
774 | .mapcount = page_mapcount(page), | ||
775 | .memcg = memcg, | ||
776 | }; | ||
777 | struct rmap_walk_control rwc = { | ||
778 | .rmap_one = page_referenced_one, | ||
779 | .arg = (void *)&pra, | ||
780 | .anon_lock = page_lock_anon_vma_read, | ||
781 | }; | ||
856 | 782 | ||
857 | *vm_flags = 0; | 783 | *vm_flags = 0; |
858 | if (page_mapped(page) && page_rmapping(page)) { | 784 | if (!page_mapped(page)) |
859 | if (!is_locked && (!PageAnon(page) || PageKsm(page))) { | 785 | return 0; |
860 | we_locked = trylock_page(page); | 786 | |
861 | if (!we_locked) { | 787 | if (!page_rmapping(page)) |
862 | referenced++; | 788 | return 0; |
863 | goto out; | 789 | |
864 | } | 790 | if (!is_locked && (!PageAnon(page) || PageKsm(page))) { |
865 | } | 791 | we_locked = trylock_page(page); |
866 | if (unlikely(PageKsm(page))) | 792 | if (!we_locked) |
867 | referenced += page_referenced_ksm(page, memcg, | 793 | return 1; |
868 | vm_flags); | ||
869 | else if (PageAnon(page)) | ||
870 | referenced += page_referenced_anon(page, memcg, | ||
871 | vm_flags); | ||
872 | else if (page->mapping) | ||
873 | referenced += page_referenced_file(page, memcg, | ||
874 | vm_flags); | ||
875 | if (we_locked) | ||
876 | unlock_page(page); | ||
877 | } | 794 | } |
878 | out: | 795 | |
879 | return referenced; | 796 | /* |
797 | * If we are reclaiming on behalf of a cgroup, skip | ||
798 | * counting on behalf of references from different | ||
799 | * cgroups | ||
800 | */ | ||
801 | if (memcg) { | ||
802 | rwc.invalid_vma = invalid_page_referenced_vma; | ||
803 | } | ||
804 | |||
805 | ret = rmap_walk(page, &rwc); | ||
806 | *vm_flags = pra.vm_flags; | ||
807 | |||
808 | if (we_locked) | ||
809 | unlock_page(page); | ||
810 | |||
811 | return pra.referenced; | ||
880 | } | 812 | } |
881 | 813 | ||
882 | static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, | 814 | static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, |
883 | unsigned long address) | 815 | unsigned long address, void *arg) |
884 | { | 816 | { |
885 | struct mm_struct *mm = vma->vm_mm; | 817 | struct mm_struct *mm = vma->vm_mm; |
886 | pte_t *pte; | 818 | pte_t *pte; |
887 | spinlock_t *ptl; | 819 | spinlock_t *ptl; |
888 | int ret = 0; | 820 | int ret = 0; |
821 | int *cleaned = arg; | ||
889 | 822 | ||
890 | pte = page_check_address(page, mm, address, &ptl, 1); | 823 | pte = page_check_address(page, mm, address, &ptl, 1); |
891 | if (!pte) | 824 | if (!pte) |
@@ -904,44 +837,44 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, | |||
904 | 837 | ||
905 | pte_unmap_unlock(pte, ptl); | 838 | pte_unmap_unlock(pte, ptl); |
906 | 839 | ||
907 | if (ret) | 840 | if (ret) { |
908 | mmu_notifier_invalidate_page(mm, address); | 841 | mmu_notifier_invalidate_page(mm, address); |
842 | (*cleaned)++; | ||
843 | } | ||
909 | out: | 844 | out: |
910 | return ret; | 845 | return SWAP_AGAIN; |
911 | } | 846 | } |
912 | 847 | ||
913 | static int page_mkclean_file(struct address_space *mapping, struct page *page) | 848 | static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg) |
914 | { | 849 | { |
915 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 850 | if (vma->vm_flags & VM_SHARED) |
916 | struct vm_area_struct *vma; | 851 | return 0; |
917 | int ret = 0; | ||
918 | |||
919 | BUG_ON(PageAnon(page)); | ||
920 | 852 | ||
921 | mutex_lock(&mapping->i_mmap_mutex); | 853 | return 1; |
922 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { | ||
923 | if (vma->vm_flags & VM_SHARED) { | ||
924 | unsigned long address = vma_address(page, vma); | ||
925 | ret += page_mkclean_one(page, vma, address); | ||
926 | } | ||
927 | } | ||
928 | mutex_unlock(&mapping->i_mmap_mutex); | ||
929 | return ret; | ||
930 | } | 854 | } |
931 | 855 | ||
932 | int page_mkclean(struct page *page) | 856 | int page_mkclean(struct page *page) |
933 | { | 857 | { |
934 | int ret = 0; | 858 | int cleaned = 0; |
859 | struct address_space *mapping; | ||
860 | struct rmap_walk_control rwc = { | ||
861 | .arg = (void *)&cleaned, | ||
862 | .rmap_one = page_mkclean_one, | ||
863 | .invalid_vma = invalid_mkclean_vma, | ||
864 | }; | ||
935 | 865 | ||
936 | BUG_ON(!PageLocked(page)); | 866 | BUG_ON(!PageLocked(page)); |
937 | 867 | ||
938 | if (page_mapped(page)) { | 868 | if (!page_mapped(page)) |
939 | struct address_space *mapping = page_mapping(page); | 869 | return 0; |
940 | if (mapping) | ||
941 | ret = page_mkclean_file(mapping, page); | ||
942 | } | ||
943 | 870 | ||
944 | return ret; | 871 | mapping = page_mapping(page); |
872 | if (!mapping) | ||
873 | return 0; | ||
874 | |||
875 | rmap_walk(page, &rwc); | ||
876 | |||
877 | return cleaned; | ||
945 | } | 878 | } |
946 | EXPORT_SYMBOL_GPL(page_mkclean); | 879 | EXPORT_SYMBOL_GPL(page_mkclean); |
947 | 880 | ||
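invalid_mkclean_vma() above lets the shared walker skip non-VM_SHARED mappings before the callback runs, which is what the removed page_mkclean_file() loop used to do inline. A standalone sketch of that filter step follows; the flag value is a made-up stand-in, not the kernel's VM_SHARED.

#include <stdio.h>

#define MODEL_VM_SHARED 0x1     /* stand-in flag, not the kernel's value */

struct model_vma {
        unsigned long vm_flags;
        int pte_dirty;
};

/* In the style of invalid_mkclean_vma(): true means "skip this vma". */
static int model_invalid_mkclean_vma(const struct model_vma *vma)
{
        return !(vma->vm_flags & MODEL_VM_SHARED);
}

int main(void)
{
        struct model_vma vmas[] = {
                { MODEL_VM_SHARED, 1 },
                { 0,               1 },         /* private mapping: filtered out */
                { MODEL_VM_SHARED, 0 },
        };
        int i, cleaned = 0;

        for (i = 0; i < 3; i++) {
                if (model_invalid_mkclean_vma(&vmas[i]))
                        continue;               /* the walker does this via rwc->invalid_vma */
                if (vmas[i].pte_dirty)
                        cleaned++;
        }
        printf("cleaned %d shared mappings\n", cleaned);
        return 0;
}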
@@ -1177,17 +1110,17 @@ out: | |||
1177 | } | 1110 | } |
1178 | 1111 | ||
1179 | /* | 1112 | /* |
1180 | * Subfunctions of try_to_unmap: try_to_unmap_one called | 1113 | * @arg: enum ttu_flags will be passed to this argument |
1181 | * repeatedly from try_to_unmap_ksm, try_to_unmap_anon or try_to_unmap_file. | ||
1182 | */ | 1114 | */ |
1183 | int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | 1115 | int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, |
1184 | unsigned long address, enum ttu_flags flags) | 1116 | unsigned long address, void *arg) |
1185 | { | 1117 | { |
1186 | struct mm_struct *mm = vma->vm_mm; | 1118 | struct mm_struct *mm = vma->vm_mm; |
1187 | pte_t *pte; | 1119 | pte_t *pte; |
1188 | pte_t pteval; | 1120 | pte_t pteval; |
1189 | spinlock_t *ptl; | 1121 | spinlock_t *ptl; |
1190 | int ret = SWAP_AGAIN; | 1122 | int ret = SWAP_AGAIN; |
1123 | enum ttu_flags flags = (enum ttu_flags)arg; | ||
1191 | 1124 | ||
1192 | pte = page_check_address(page, mm, address, &ptl, 0); | 1125 | pte = page_check_address(page, mm, address, &ptl, 0); |
1193 | if (!pte) | 1126 | if (!pte) |
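try_to_unmap_one() now recovers its ttu_flags from the opaque arg slot with a cast. The standalone model below shows the same round trip; portable userspace C goes through uintptr_t, while the hunk above casts the pointer directly.

#include <stdint.h>
#include <stdio.h>

enum model_ttu_flags {
        MODEL_TTU_UNMAP   = 1,
        MODEL_TTU_MUNLOCK = 2,
};

/* Callback in the style of try_to_unmap_one(): recover flags from arg. */
static void model_unmap_one(void *arg)
{
        enum model_ttu_flags flags = (enum model_ttu_flags)(uintptr_t)arg;

        printf("walking with flags 0x%x\n", (unsigned int)flags);
}

int main(void)
{
        /* Mirrors ".arg = (void *)TTU_MUNLOCK" in the hunks further below. */
        model_unmap_one((void *)(uintptr_t)MODEL_TTU_MUNLOCK);
        return 0;
}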
@@ -1426,124 +1359,18 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | |||
1426 | return ret; | 1359 | return ret; |
1427 | } | 1360 | } |
1428 | 1361 | ||
1429 | bool is_vma_temporary_stack(struct vm_area_struct *vma) | 1362 | static int try_to_unmap_nonlinear(struct page *page, |
1430 | { | 1363 | struct address_space *mapping, struct vm_area_struct *vma) |
1431 | int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); | ||
1432 | |||
1433 | if (!maybe_stack) | ||
1434 | return false; | ||
1435 | |||
1436 | if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) == | ||
1437 | VM_STACK_INCOMPLETE_SETUP) | ||
1438 | return true; | ||
1439 | |||
1440 | return false; | ||
1441 | } | ||
1442 | |||
1443 | /** | ||
1444 | * try_to_unmap_anon - unmap or unlock anonymous page using the object-based | ||
1445 | * rmap method | ||
1446 | * @page: the page to unmap/unlock | ||
1447 | * @flags: action and flags | ||
1448 | * | ||
1449 | * Find all the mappings of a page using the mapping pointer and the vma chains | ||
1450 | * contained in the anon_vma struct it points to. | ||
1451 | * | ||
1452 | * This function is only called from try_to_unmap/try_to_munlock for | ||
1453 | * anonymous pages. | ||
1454 | * When called from try_to_munlock(), the mmap_sem of the mm containing the vma | ||
1455 | * where the page was found will be held for write. So, we won't recheck | ||
1456 | * vm_flags for that VMA. That should be OK, because that vma shouldn't be | ||
1457 | * 'LOCKED. | ||
1458 | */ | ||
1459 | static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) | ||
1460 | { | ||
1461 | struct anon_vma *anon_vma; | ||
1462 | pgoff_t pgoff; | ||
1463 | struct anon_vma_chain *avc; | ||
1464 | int ret = SWAP_AGAIN; | ||
1465 | |||
1466 | anon_vma = page_lock_anon_vma_read(page); | ||
1467 | if (!anon_vma) | ||
1468 | return ret; | ||
1469 | |||
1470 | pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
1471 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { | ||
1472 | struct vm_area_struct *vma = avc->vma; | ||
1473 | unsigned long address; | ||
1474 | |||
1475 | /* | ||
1476 | * During exec, a temporary VMA is setup and later moved. | ||
1477 | * The VMA is moved under the anon_vma lock but not the | ||
1478 | * page tables leading to a race where migration cannot | ||
1479 | * find the migration ptes. Rather than increasing the | ||
1480 | * locking requirements of exec(), migration skips | ||
1481 | * temporary VMAs until after exec() completes. | ||
1482 | */ | ||
1483 | if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) && | ||
1484 | is_vma_temporary_stack(vma)) | ||
1485 | continue; | ||
1486 | |||
1487 | address = vma_address(page, vma); | ||
1488 | ret = try_to_unmap_one(page, vma, address, flags); | ||
1489 | if (ret != SWAP_AGAIN || !page_mapped(page)) | ||
1490 | break; | ||
1491 | } | ||
1492 | |||
1493 | page_unlock_anon_vma_read(anon_vma); | ||
1494 | return ret; | ||
1495 | } | ||
1496 | |||
1497 | /** | ||
1498 | * try_to_unmap_file - unmap/unlock file page using the object-based rmap method | ||
1499 | * @page: the page to unmap/unlock | ||
1500 | * @flags: action and flags | ||
1501 | * | ||
1502 | * Find all the mappings of a page using the mapping pointer and the vma chains | ||
1503 | * contained in the address_space struct it points to. | ||
1504 | * | ||
1505 | * This function is only called from try_to_unmap/try_to_munlock for | ||
1506 | * object-based pages. | ||
1507 | * When called from try_to_munlock(), the mmap_sem of the mm containing the vma | ||
1508 | * where the page was found will be held for write. So, we won't recheck | ||
1509 | * vm_flags for that VMA. That should be OK, because that vma shouldn't be | ||
1510 | * 'LOCKED. | ||
1511 | */ | ||
1512 | static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | ||
1513 | { | 1364 | { |
1514 | struct address_space *mapping = page->mapping; | ||
1515 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
1516 | struct vm_area_struct *vma; | ||
1517 | int ret = SWAP_AGAIN; | 1365 | int ret = SWAP_AGAIN; |
1518 | unsigned long cursor; | 1366 | unsigned long cursor; |
1519 | unsigned long max_nl_cursor = 0; | 1367 | unsigned long max_nl_cursor = 0; |
1520 | unsigned long max_nl_size = 0; | 1368 | unsigned long max_nl_size = 0; |
1521 | unsigned int mapcount; | 1369 | unsigned int mapcount; |
1522 | 1370 | ||
1523 | if (PageHuge(page)) | 1371 | list_for_each_entry(vma, |
1524 | pgoff = page->index << compound_order(page); | 1372 | &mapping->i_mmap_nonlinear, shared.nonlinear) { |
1525 | 1373 | ||
1526 | mutex_lock(&mapping->i_mmap_mutex); | ||
1527 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { | ||
1528 | unsigned long address = vma_address(page, vma); | ||
1529 | ret = try_to_unmap_one(page, vma, address, flags); | ||
1530 | if (ret != SWAP_AGAIN || !page_mapped(page)) | ||
1531 | goto out; | ||
1532 | } | ||
1533 | |||
1534 | if (list_empty(&mapping->i_mmap_nonlinear)) | ||
1535 | goto out; | ||
1536 | |||
1537 | /* | ||
1538 | * We don't bother to try to find the munlocked page in nonlinears. | ||
1539 | * It's costly. Instead, later, page reclaim logic may call | ||
1540 | * try_to_unmap(TTU_MUNLOCK) and recover PG_mlocked lazily. | ||
1541 | */ | ||
1542 | if (TTU_ACTION(flags) == TTU_MUNLOCK) | ||
1543 | goto out; | ||
1544 | |||
1545 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, | ||
1546 | shared.nonlinear) { | ||
1547 | cursor = (unsigned long) vma->vm_private_data; | 1374 | cursor = (unsigned long) vma->vm_private_data; |
1548 | if (cursor > max_nl_cursor) | 1375 | if (cursor > max_nl_cursor) |
1549 | max_nl_cursor = cursor; | 1376 | max_nl_cursor = cursor; |
@@ -1553,8 +1380,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
1553 | } | 1380 | } |
1554 | 1381 | ||
1555 | if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */ | 1382 | if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */ |
1556 | ret = SWAP_FAIL; | 1383 | return SWAP_FAIL; |
1557 | goto out; | ||
1558 | } | 1384 | } |
1559 | 1385 | ||
1560 | /* | 1386 | /* |
@@ -1566,7 +1392,8 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
1566 | */ | 1392 | */ |
1567 | mapcount = page_mapcount(page); | 1393 | mapcount = page_mapcount(page); |
1568 | if (!mapcount) | 1394 | if (!mapcount) |
1569 | goto out; | 1395 | return ret; |
1396 | |||
1570 | cond_resched(); | 1397 | cond_resched(); |
1571 | 1398 | ||
1572 | max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; | 1399 | max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; |
@@ -1574,10 +1401,11 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
1574 | max_nl_cursor = CLUSTER_SIZE; | 1401 | max_nl_cursor = CLUSTER_SIZE; |
1575 | 1402 | ||
1576 | do { | 1403 | do { |
1577 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, | 1404 | list_for_each_entry(vma, |
1578 | shared.nonlinear) { | 1405 | &mapping->i_mmap_nonlinear, shared.nonlinear) { |
1406 | |||
1579 | cursor = (unsigned long) vma->vm_private_data; | 1407 | cursor = (unsigned long) vma->vm_private_data; |
1580 | while ( cursor < max_nl_cursor && | 1408 | while (cursor < max_nl_cursor && |
1581 | cursor < vma->vm_end - vma->vm_start) { | 1409 | cursor < vma->vm_end - vma->vm_start) { |
1582 | if (try_to_unmap_cluster(cursor, &mapcount, | 1410 | if (try_to_unmap_cluster(cursor, &mapcount, |
1583 | vma, page) == SWAP_MLOCK) | 1411 | vma, page) == SWAP_MLOCK) |
@@ -1585,7 +1413,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
1585 | cursor += CLUSTER_SIZE; | 1413 | cursor += CLUSTER_SIZE; |
1586 | vma->vm_private_data = (void *) cursor; | 1414 | vma->vm_private_data = (void *) cursor; |
1587 | if ((int)mapcount <= 0) | 1415 | if ((int)mapcount <= 0) |
1588 | goto out; | 1416 | return ret; |
1589 | } | 1417 | } |
1590 | vma->vm_private_data = (void *) max_nl_cursor; | 1418 | vma->vm_private_data = (void *) max_nl_cursor; |
1591 | } | 1419 | } |
@@ -1600,11 +1428,34 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
1600 | */ | 1428 | */ |
1601 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear) | 1429 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear) |
1602 | vma->vm_private_data = NULL; | 1430 | vma->vm_private_data = NULL; |
1603 | out: | 1431 | |
1604 | mutex_unlock(&mapping->i_mmap_mutex); | ||
1605 | return ret; | 1432 | return ret; |
1606 | } | 1433 | } |
1607 | 1434 | ||
1435 | bool is_vma_temporary_stack(struct vm_area_struct *vma) | ||
1436 | { | ||
1437 | int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); | ||
1438 | |||
1439 | if (!maybe_stack) | ||
1440 | return false; | ||
1441 | |||
1442 | if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) == | ||
1443 | VM_STACK_INCOMPLETE_SETUP) | ||
1444 | return true; | ||
1445 | |||
1446 | return false; | ||
1447 | } | ||
1448 | |||
1449 | static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg) | ||
1450 | { | ||
1451 | return is_vma_temporary_stack(vma); | ||
1452 | } | ||
1453 | |||
1454 | static int page_not_mapped(struct page *page) | ||
1455 | { | ||
1456 | return !page_mapped(page); | ||
1457 | }; | ||
1458 | |||
1608 | /** | 1459 | /** |
1609 | * try_to_unmap - try to remove all page table mappings to a page | 1460 | * try_to_unmap - try to remove all page table mappings to a page |
1610 | * @page: the page to get unmapped | 1461 | * @page: the page to get unmapped |
@@ -1622,16 +1473,29 @@ out: | |||
1622 | int try_to_unmap(struct page *page, enum ttu_flags flags) | 1473 | int try_to_unmap(struct page *page, enum ttu_flags flags) |
1623 | { | 1474 | { |
1624 | int ret; | 1475 | int ret; |
1476 | struct rmap_walk_control rwc = { | ||
1477 | .rmap_one = try_to_unmap_one, | ||
1478 | .arg = (void *)flags, | ||
1479 | .done = page_not_mapped, | ||
1480 | .file_nonlinear = try_to_unmap_nonlinear, | ||
1481 | .anon_lock = page_lock_anon_vma_read, | ||
1482 | }; | ||
1625 | 1483 | ||
1626 | BUG_ON(!PageLocked(page)); | ||
1627 | VM_BUG_ON(!PageHuge(page) && PageTransHuge(page)); | 1484 | VM_BUG_ON(!PageHuge(page) && PageTransHuge(page)); |
1628 | 1485 | ||
1629 | if (unlikely(PageKsm(page))) | 1486 | /* |
1630 | ret = try_to_unmap_ksm(page, flags); | 1487 | * During exec, a temporary VMA is setup and later moved. |
1631 | else if (PageAnon(page)) | 1488 | * The VMA is moved under the anon_vma lock but not the |
1632 | ret = try_to_unmap_anon(page, flags); | 1489 | * page tables leading to a race where migration cannot |
1633 | else | 1490 | * find the migration ptes. Rather than increasing the |
1634 | ret = try_to_unmap_file(page, flags); | 1491 | * locking requirements of exec(), migration skips |
1492 | * temporary VMAs until after exec() completes. | ||
1493 | */ | ||
1494 | if (flags & TTU_MIGRATION && !PageKsm(page) && PageAnon(page)) | ||
1495 | rwc.invalid_vma = invalid_migration_vma; | ||
1496 | |||
1497 | ret = rmap_walk(page, &rwc); | ||
1498 | |||
1635 | if (ret != SWAP_MLOCK && !page_mapped(page)) | 1499 | if (ret != SWAP_MLOCK && !page_mapped(page)) |
1636 | ret = SWAP_SUCCESS; | 1500 | ret = SWAP_SUCCESS; |
1637 | return ret; | 1501 | return ret; |
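The .done = page_not_mapped hook set up above lets the walker stop as soon as the page has no mappings left, rather than each try_to_unmap_* variant re-checking page_mapped() in its own loop. A standalone sketch of that early exit follows; every name here is a local stand-in.

#include <stdio.h>

struct model_page {
        int mapcount;
};

static int model_page_not_mapped(const struct model_page *page)
{
        return page->mapcount == 0;
}

/* One rmap_one-style step: drop a mapping. */
static void model_unmap_one(struct model_page *page)
{
        page->mapcount--;
}

int main(void)
{
        struct model_page page = { .mapcount = 2 };
        int visited = 0, i;

        for (i = 0; i < 5; i++) {               /* five candidate "vmas" */
                model_unmap_one(&page);
                visited++;
                if (model_page_not_mapped(&page))
                        break;                  /* rwc->done() short-circuits the walk */
        }
        printf("visited %d vmas, mapcount now %d\n", visited, page.mapcount);
        return 0;
}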
@@ -1654,14 +1518,25 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) | |||
1654 | */ | 1518 | */ |
1655 | int try_to_munlock(struct page *page) | 1519 | int try_to_munlock(struct page *page) |
1656 | { | 1520 | { |
1521 | int ret; | ||
1522 | struct rmap_walk_control rwc = { | ||
1523 | .rmap_one = try_to_unmap_one, | ||
1524 | .arg = (void *)TTU_MUNLOCK, | ||
1525 | .done = page_not_mapped, | ||
1526 | /* | ||
1527 | * We don't bother to try to find the munlocked page in | ||
1528 | * nonlinears. It's costly. Instead, later, page reclaim logic | ||
1529 | * may call try_to_unmap() and recover PG_mlocked lazily. | ||
1530 | */ | ||
1531 | .file_nonlinear = NULL, | ||
1532 | .anon_lock = page_lock_anon_vma_read, | ||
1533 | |||
1534 | }; | ||
1535 | |||
1657 | VM_BUG_ON(!PageLocked(page) || PageLRU(page)); | 1536 | VM_BUG_ON(!PageLocked(page) || PageLRU(page)); |
1658 | 1537 | ||
1659 | if (unlikely(PageKsm(page))) | 1538 | ret = rmap_walk(page, &rwc); |
1660 | return try_to_unmap_ksm(page, TTU_MUNLOCK); | 1539 | return ret; |
1661 | else if (PageAnon(page)) | ||
1662 | return try_to_unmap_anon(page, TTU_MUNLOCK); | ||
1663 | else | ||
1664 | return try_to_unmap_file(page, TTU_MUNLOCK); | ||
1665 | } | 1540 | } |
1666 | 1541 | ||
1667 | void __put_anon_vma(struct anon_vma *anon_vma) | 1542 | void __put_anon_vma(struct anon_vma *anon_vma) |
@@ -1674,18 +1549,13 @@ void __put_anon_vma(struct anon_vma *anon_vma) | |||
1674 | anon_vma_free(anon_vma); | 1549 | anon_vma_free(anon_vma); |
1675 | } | 1550 | } |
1676 | 1551 | ||
1677 | #ifdef CONFIG_MIGRATION | 1552 | static struct anon_vma *rmap_walk_anon_lock(struct page *page, |
1678 | /* | 1553 | struct rmap_walk_control *rwc) |
1679 | * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file(): | ||
1680 | * Called by migrate.c to remove migration ptes, but might be used more later. | ||
1681 | */ | ||
1682 | static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, | ||
1683 | struct vm_area_struct *, unsigned long, void *), void *arg) | ||
1684 | { | 1554 | { |
1685 | struct anon_vma *anon_vma; | 1555 | struct anon_vma *anon_vma; |
1686 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 1556 | |
1687 | struct anon_vma_chain *avc; | 1557 | if (rwc->anon_lock) |
1688 | int ret = SWAP_AGAIN; | 1558 | return rwc->anon_lock(page); |
1689 | 1559 | ||
1690 | /* | 1560 | /* |
1691 | * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read() | 1561 | * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read() |
@@ -1695,58 +1565,120 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, | |||
1695 | */ | 1565 | */ |
1696 | anon_vma = page_anon_vma(page); | 1566 | anon_vma = page_anon_vma(page); |
1697 | if (!anon_vma) | 1567 | if (!anon_vma) |
1698 | return ret; | 1568 | return NULL; |
1569 | |||
1699 | anon_vma_lock_read(anon_vma); | 1570 | anon_vma_lock_read(anon_vma); |
1571 | return anon_vma; | ||
1572 | } | ||
1573 | |||
1574 | /* | ||
1575 | * rmap_walk_anon - do something to anonymous page using the object-based | ||
1576 | * rmap method | ||
1577 | * @page: the page to be handled | ||
1578 | * @rwc: control variable according to each walk type | ||
1579 | * | ||
1580 | * Find all the mappings of a page using the mapping pointer and the vma chains | ||
1581 | * contained in the anon_vma struct it points to. | ||
1582 | * | ||
1583 | * When called from try_to_munlock(), the mmap_sem of the mm containing the vma | ||
1584 | * where the page was found will be held for write. So, we won't recheck | ||
1585 | * vm_flags for that VMA. That should be OK, because that vma shouldn't be | ||
1586 | * LOCKED. | ||
1587 | */ | ||
1588 | static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) | ||
1589 | { | ||
1590 | struct anon_vma *anon_vma; | ||
1591 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
1592 | struct anon_vma_chain *avc; | ||
1593 | int ret = SWAP_AGAIN; | ||
1594 | |||
1595 | anon_vma = rmap_walk_anon_lock(page, rwc); | ||
1596 | if (!anon_vma) | ||
1597 | return ret; | ||
1598 | |||
1700 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { | 1599 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { |
1701 | struct vm_area_struct *vma = avc->vma; | 1600 | struct vm_area_struct *vma = avc->vma; |
1702 | unsigned long address = vma_address(page, vma); | 1601 | unsigned long address = vma_address(page, vma); |
1703 | ret = rmap_one(page, vma, address, arg); | 1602 | |
1603 | if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) | ||
1604 | continue; | ||
1605 | |||
1606 | ret = rwc->rmap_one(page, vma, address, rwc->arg); | ||
1704 | if (ret != SWAP_AGAIN) | 1607 | if (ret != SWAP_AGAIN) |
1705 | break; | 1608 | break; |
1609 | if (rwc->done && rwc->done(page)) | ||
1610 | break; | ||
1706 | } | 1611 | } |
1707 | anon_vma_unlock_read(anon_vma); | 1612 | anon_vma_unlock_read(anon_vma); |
1708 | return ret; | 1613 | return ret; |
1709 | } | 1614 | } |
1710 | 1615 | ||
1711 | static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, | 1616 | /* |
1712 | struct vm_area_struct *, unsigned long, void *), void *arg) | 1617 | * rmap_walk_file - do something to file page using the object-based rmap method |
1618 | * @page: the page to be handled | ||
1619 | * @rwc: control variable according to each walk type | ||
1620 | * | ||
1621 | * Find all the mappings of a page using the mapping pointer and the vma chains | ||
1622 | * contained in the address_space struct it points to. | ||
1623 | * | ||
1624 | * When called from try_to_munlock(), the mmap_sem of the mm containing the vma | ||
1625 | * where the page was found will be held for write. So, we won't recheck | ||
1626 | * vm_flags for that VMA. That should be OK, because that vma shouldn't be | ||
1627 | * LOCKED. | ||
1628 | */ | ||
1629 | static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) | ||
1713 | { | 1630 | { |
1714 | struct address_space *mapping = page->mapping; | 1631 | struct address_space *mapping = page->mapping; |
1715 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 1632 | pgoff_t pgoff = page->index << compound_order(page); |
1716 | struct vm_area_struct *vma; | 1633 | struct vm_area_struct *vma; |
1717 | int ret = SWAP_AGAIN; | 1634 | int ret = SWAP_AGAIN; |
1718 | 1635 | ||
1636 | /* | ||
1637 | * The page lock not only makes sure that page->mapping cannot | ||
1638 | * suddenly be NULLified by truncation, it makes sure that the | ||
1639 | * structure at mapping cannot be freed and reused yet, | ||
1640 | * so we can safely take mapping->i_mmap_mutex. | ||
1641 | */ | ||
1642 | VM_BUG_ON(!PageLocked(page)); | ||
1643 | |||
1719 | if (!mapping) | 1644 | if (!mapping) |
1720 | return ret; | 1645 | return ret; |
1721 | mutex_lock(&mapping->i_mmap_mutex); | 1646 | mutex_lock(&mapping->i_mmap_mutex); |
1722 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { | 1647 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { |
1723 | unsigned long address = vma_address(page, vma); | 1648 | unsigned long address = vma_address(page, vma); |
1724 | ret = rmap_one(page, vma, address, arg); | 1649 | |
1650 | if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) | ||
1651 | continue; | ||
1652 | |||
1653 | ret = rwc->rmap_one(page, vma, address, rwc->arg); | ||
1725 | if (ret != SWAP_AGAIN) | 1654 | if (ret != SWAP_AGAIN) |
1726 | break; | 1655 | goto done; |
1656 | if (rwc->done && rwc->done(page)) | ||
1657 | goto done; | ||
1727 | } | 1658 | } |
1728 | /* | 1659 | |
1729 | * No nonlinear handling: being always shared, nonlinear vmas | 1660 | if (!rwc->file_nonlinear) |
1730 | * never contain migration ptes. Decide what to do about this | 1661 | goto done; |
1731 | * limitation to linear when we need rmap_walk() on nonlinear. | 1662 | |
1732 | */ | 1663 | if (list_empty(&mapping->i_mmap_nonlinear)) |
1664 | goto done; | ||
1665 | |||
1666 | ret = rwc->file_nonlinear(page, mapping, vma); | ||
1667 | |||
1668 | done: | ||
1733 | mutex_unlock(&mapping->i_mmap_mutex); | 1669 | mutex_unlock(&mapping->i_mmap_mutex); |
1734 | return ret; | 1670 | return ret; |
1735 | } | 1671 | } |
1736 | 1672 | ||
1737 | int rmap_walk(struct page *page, int (*rmap_one)(struct page *, | 1673 | int rmap_walk(struct page *page, struct rmap_walk_control *rwc) |
1738 | struct vm_area_struct *, unsigned long, void *), void *arg) | ||
1739 | { | 1674 | { |
1740 | VM_BUG_ON(!PageLocked(page)); | ||
1741 | |||
1742 | if (unlikely(PageKsm(page))) | 1675 | if (unlikely(PageKsm(page))) |
1743 | return rmap_walk_ksm(page, rmap_one, arg); | 1676 | return rmap_walk_ksm(page, rwc); |
1744 | else if (PageAnon(page)) | 1677 | else if (PageAnon(page)) |
1745 | return rmap_walk_anon(page, rmap_one, arg); | 1678 | return rmap_walk_anon(page, rwc); |
1746 | else | 1679 | else |
1747 | return rmap_walk_file(page, rmap_one, arg); | 1680 | return rmap_walk_file(page, rwc); |
1748 | } | 1681 | } |
1749 | #endif /* CONFIG_MIGRATION */ | ||
1750 | 1682 | ||
1751 | #ifdef CONFIG_HUGETLB_PAGE | 1683 | #ifdef CONFIG_HUGETLB_PAGE |
1752 | /* | 1684 | /* |
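After these hunks, every rmap user drives the same walker through a small control structure. The standalone model below mirrors the shape of that interface, using only the fields that appear in this diff (rmap_one, invalid_vma, done, arg); it is a sketch, not kernel code.

#include <stdio.h>

struct model_vma { unsigned long vm_flags; };
struct model_page { int mapcount; };

enum { MODEL_SWAP_AGAIN, MODEL_SWAP_FAIL };

struct model_walk_control {
        int (*rmap_one)(struct model_page *, struct model_vma *, void *);
        int (*invalid_vma)(struct model_vma *, void *);  /* optional filter */
        int (*done)(struct model_page *);                /* optional early exit */
        void *arg;
};

static int model_walk(struct model_page *page, struct model_vma *vmas,
                      int nr, struct model_walk_control *rwc)
{
        int i, ret = MODEL_SWAP_AGAIN;

        for (i = 0; i < nr; i++) {
                if (rwc->invalid_vma && rwc->invalid_vma(&vmas[i], rwc->arg))
                        continue;
                ret = rwc->rmap_one(page, &vmas[i], rwc->arg);
                if (ret != MODEL_SWAP_AGAIN)
                        break;
                if (rwc->done && rwc->done(page))
                        break;
        }
        return ret;
}

static int count_one(struct model_page *page, struct model_vma *vma, void *arg)
{
        (void)page; (void)vma;
        (*(int *)arg)++;
        return MODEL_SWAP_AGAIN;
}

int main(void)
{
        struct model_vma vmas[3] = { {0}, {0}, {0} };
        struct model_page page = { .mapcount = 3 };
        int visited = 0;
        struct model_walk_control rwc = {
                .rmap_one = count_one,
                .arg = &visited,
        };

        model_walk(&page, vmas, 3, &rwc);
        printf("callback ran for %d vmas\n", visited);
        return 0;
}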
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 27eeab3be757..4cba9c2783a1 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c | |||
@@ -40,7 +40,8 @@ static void * __init_refok __earlyonly_bootmem_alloc(int node, | |||
40 | unsigned long align, | 40 | unsigned long align, |
41 | unsigned long goal) | 41 | unsigned long goal) |
42 | { | 42 | { |
43 | return __alloc_bootmem_node_high(NODE_DATA(node), size, align, goal); | 43 | return memblock_virt_alloc_try_nid(size, align, goal, |
44 | BOOTMEM_ALLOC_ACCESSIBLE, node); | ||
44 | } | 45 | } |
45 | 46 | ||
46 | static void *vmemmap_buf; | 47 | static void *vmemmap_buf; |
@@ -226,7 +227,8 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, | |||
226 | 227 | ||
227 | if (vmemmap_buf_start) { | 228 | if (vmemmap_buf_start) { |
228 | /* need to free left buf */ | 229 | /* need to free left buf */ |
229 | free_bootmem(__pa(vmemmap_buf), vmemmap_buf_end - vmemmap_buf); | 230 | memblock_free_early(__pa(vmemmap_buf), |
231 | vmemmap_buf_end - vmemmap_buf); | ||
230 | vmemmap_buf = NULL; | 232 | vmemmap_buf = NULL; |
231 | vmemmap_buf_end = NULL; | 233 | vmemmap_buf_end = NULL; |
232 | } | 234 | } |
diff --git a/mm/sparse.c b/mm/sparse.c index 8cc7be0e9590..63c3ea5c119c 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -69,7 +69,7 @@ static struct mem_section noinline __init_refok *sparse_index_alloc(int nid) | |||
69 | else | 69 | else |
70 | section = kzalloc(array_size, GFP_KERNEL); | 70 | section = kzalloc(array_size, GFP_KERNEL); |
71 | } else { | 71 | } else { |
72 | section = alloc_bootmem_node(NODE_DATA(nid), array_size); | 72 | section = memblock_virt_alloc_node(array_size, nid); |
73 | } | 73 | } |
74 | 74 | ||
75 | return section; | 75 | return section; |
@@ -279,8 +279,9 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, | |||
279 | limit = goal + (1UL << PA_SECTION_SHIFT); | 279 | limit = goal + (1UL << PA_SECTION_SHIFT); |
280 | nid = early_pfn_to_nid(goal >> PAGE_SHIFT); | 280 | nid = early_pfn_to_nid(goal >> PAGE_SHIFT); |
281 | again: | 281 | again: |
282 | p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size, | 282 | p = memblock_virt_alloc_try_nid_nopanic(size, |
283 | SMP_CACHE_BYTES, goal, limit); | 283 | SMP_CACHE_BYTES, goal, limit, |
284 | nid); | ||
284 | if (!p && limit) { | 285 | if (!p && limit) { |
285 | limit = 0; | 286 | limit = 0; |
286 | goto again; | 287 | goto again; |
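The usemap allocation above keeps its existing fallback: try near the goal within a limit first, then retry with the limit dropped. A standalone model of that retry shape follows; the allocator stub is hypothetical and simply fails while a limit is set so the fallback path is exercised.

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-in: pretend allocations under a non-zero limit fail. */
static void *model_alloc_below(size_t size, unsigned long limit)
{
        if (limit)
                return NULL;            /* force the fallback path for the demo */
        return malloc(size);
}

int main(void)
{
        unsigned long limit = 1UL << 28;        /* some section-sized window */
        void *p;

again:
        p = model_alloc_below(4096, limit);
        if (!p && limit) {
                limit = 0;              /* mirrors "limit = 0; goto again;" above */
                goto again;
        }
        if (!p)
                return 1;
        printf("allocated %s the limit\n", limit ? "under" : "without");
        free(p);
        return 0;
}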
@@ -331,7 +332,7 @@ static unsigned long * __init | |||
331 | sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, | 332 | sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, |
332 | unsigned long size) | 333 | unsigned long size) |
333 | { | 334 | { |
334 | return alloc_bootmem_node_nopanic(pgdat, size); | 335 | return memblock_virt_alloc_node_nopanic(size, pgdat->node_id); |
335 | } | 336 | } |
336 | 337 | ||
337 | static void __init check_usemap_section_nr(int nid, unsigned long *usemap) | 338 | static void __init check_usemap_section_nr(int nid, unsigned long *usemap) |
@@ -376,8 +377,9 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid) | |||
376 | return map; | 377 | return map; |
377 | 378 | ||
378 | size = PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION); | 379 | size = PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION); |
379 | map = __alloc_bootmem_node_high(NODE_DATA(nid), size, | 380 | map = memblock_virt_alloc_try_nid(size, |
380 | PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); | 381 | PAGE_SIZE, __pa(MAX_DMA_ADDRESS), |
382 | BOOTMEM_ALLOC_ACCESSIBLE, nid); | ||
381 | return map; | 383 | return map; |
382 | } | 384 | } |
383 | void __init sparse_mem_maps_populate_node(struct page **map_map, | 385 | void __init sparse_mem_maps_populate_node(struct page **map_map, |
@@ -401,8 +403,9 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, | |||
401 | } | 403 | } |
402 | 404 | ||
403 | size = PAGE_ALIGN(size); | 405 | size = PAGE_ALIGN(size); |
404 | map = __alloc_bootmem_node_high(NODE_DATA(nodeid), size * map_count, | 406 | map = memblock_virt_alloc_try_nid(size * map_count, |
405 | PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); | 407 | PAGE_SIZE, __pa(MAX_DMA_ADDRESS), |
408 | BOOTMEM_ALLOC_ACCESSIBLE, nodeid); | ||
406 | if (map) { | 409 | if (map) { |
407 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { | 410 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { |
408 | if (!present_section_nr(pnum)) | 411 | if (!present_section_nr(pnum)) |
@@ -545,7 +548,7 @@ void __init sparse_init(void) | |||
545 | * sparse_early_mem_map_alloc, so allocate usemap_map at first. | 548 | * sparse_early_mem_map_alloc, so allocate usemap_map at first. |
546 | */ | 549 | */ |
547 | size = sizeof(unsigned long *) * NR_MEM_SECTIONS; | 550 | size = sizeof(unsigned long *) * NR_MEM_SECTIONS; |
548 | usemap_map = alloc_bootmem(size); | 551 | usemap_map = memblock_virt_alloc(size, 0); |
549 | if (!usemap_map) | 552 | if (!usemap_map) |
550 | panic("can not allocate usemap_map\n"); | 553 | panic("can not allocate usemap_map\n"); |
551 | alloc_usemap_and_memmap(sparse_early_usemaps_alloc_node, | 554 | alloc_usemap_and_memmap(sparse_early_usemaps_alloc_node, |
@@ -553,7 +556,7 @@ void __init sparse_init(void) | |||
553 | 556 | ||
554 | #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER | 557 | #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER |
555 | size2 = sizeof(struct page *) * NR_MEM_SECTIONS; | 558 | size2 = sizeof(struct page *) * NR_MEM_SECTIONS; |
556 | map_map = alloc_bootmem(size2); | 559 | map_map = memblock_virt_alloc(size2, 0); |
557 | if (!map_map) | 560 | if (!map_map) |
558 | panic("can not allocate map_map\n"); | 561 | panic("can not allocate map_map\n"); |
559 | alloc_usemap_and_memmap(sparse_early_mem_maps_alloc_node, | 562 | alloc_usemap_and_memmap(sparse_early_mem_maps_alloc_node, |
@@ -583,9 +586,9 @@ void __init sparse_init(void) | |||
583 | vmemmap_populate_print_last(); | 586 | vmemmap_populate_print_last(); |
584 | 587 | ||
585 | #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER | 588 | #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER |
586 | free_bootmem(__pa(map_map), size2); | 589 | memblock_free_early(__pa(map_map), size2); |
587 | #endif | 590 | #endif |
588 | free_bootmem(__pa(usemap_map), size); | 591 | memblock_free_early(__pa(usemap_map), size); |
589 | } | 592 | } |
590 | 593 | ||
591 | #ifdef CONFIG_MEMORY_HOTPLUG | 594 | #ifdef CONFIG_MEMORY_HOTPLUG |
diff --git a/mm/swap.c b/mm/swap.c --- a/mm/swap.c +++ b/mm/swap.c | |||
@@ -31,7 +31,6 @@ | |||
31 | #include <linux/memcontrol.h> | 31 | #include <linux/memcontrol.h> |
32 | #include <linux/gfp.h> | 32 | #include <linux/gfp.h> |
33 | #include <linux/uio.h> | 33 | #include <linux/uio.h> |
34 | #include <linux/hugetlb.h> | ||
35 | 34 | ||
36 | #include "internal.h" | 35 | #include "internal.h" |
37 | 36 | ||
@@ -82,118 +81,150 @@ static void __put_compound_page(struct page *page) | |||
82 | 81 | ||
83 | static void put_compound_page(struct page *page) | 82 | static void put_compound_page(struct page *page) |
84 | { | 83 | { |
85 | if (unlikely(PageTail(page))) { | 84 | struct page *page_head; |
86 | /* __split_huge_page_refcount can run under us */ | ||
87 | struct page *page_head = compound_trans_head(page); | ||
88 | |||
89 | if (likely(page != page_head && | ||
90 | get_page_unless_zero(page_head))) { | ||
91 | unsigned long flags; | ||
92 | 85 | ||
86 | if (likely(!PageTail(page))) { | ||
87 | if (put_page_testzero(page)) { | ||
93 | /* | 88 | /* |
94 | * THP can not break up slab pages so avoid taking | 89 | * By the time all refcounts have been released |
95 | * compound_lock(). Slab performs non-atomic bit ops | 90 | * split_huge_page cannot run anymore from under us. |
96 | * on page->flags for better performance. In particular | ||
97 | * slab_unlock() in slub used to be a hot path. It is | ||
98 | * still hot on arches that do not support | ||
99 | * this_cpu_cmpxchg_double(). | ||
100 | */ | 91 | */ |
101 | if (PageSlab(page_head) || PageHeadHuge(page_head)) { | 92 | if (PageHead(page)) |
102 | if (likely(PageTail(page))) { | 93 | __put_compound_page(page); |
103 | /* | 94 | else |
104 | * __split_huge_page_refcount | 95 | __put_single_page(page); |
105 | * cannot race here. | 96 | } |
106 | */ | 97 | return; |
107 | VM_BUG_ON(!PageHead(page_head)); | 98 | } |
108 | atomic_dec(&page->_mapcount); | 99 | |
109 | if (put_page_testzero(page_head)) | 100 | /* __split_huge_page_refcount can run under us */ |
110 | VM_BUG_ON(1); | 101 | page_head = compound_trans_head(page); |
111 | if (put_page_testzero(page_head)) | 102 | |
112 | __put_compound_page(page_head); | 103 | /* |
113 | return; | 104 | * THP can not break up slab pages so avoid taking |
114 | } else | 105 | * compound_lock() and skip the tail page refcounting (in |
115 | /* | 106 | * _mapcount) too. Slab performs non-atomic bit ops on |
116 | * __split_huge_page_refcount | 107 | * page->flags for better performance. In particular |
117 | * run before us, "page" was a | 108 | * slab_unlock() in slub used to be a hot path. It is still |
118 | * THP tail. The split | 109 | * hot on arches that do not support |
119 | * page_head has been freed | 110 | * this_cpu_cmpxchg_double(). |
120 | * and reallocated as slab or | 111 | * |
121 | * hugetlbfs page of smaller | 112 | * If "page" is part of a slab or hugetlbfs page it cannot be |
122 | * order (only possible if | 113 | * splitted and the head page cannot change from under us. And |
123 | * reallocated as slab on | 114 | * if "page" is part of a THP page under splitting, if the |
124 | * x86). | 115 | * head page pointed by the THP tail isn't a THP head anymore, |
125 | */ | 116 | * we'll find PageTail clear after smp_rmb() and we'll treat |
126 | goto skip_lock; | 117 | * it as a single page. |
127 | } | 118 | */ |
119 | if (!__compound_tail_refcounted(page_head)) { | ||
120 | /* | ||
121 | * If "page" is a THP tail, we must read the tail page | ||
122 | * flags after the head page flags. The | ||
123 | * split_huge_page side enforces write memory barriers | ||
124 | * between clearing PageTail and before the head page | ||
125 | * can be freed and reallocated. | ||
126 | */ | ||
127 | smp_rmb(); | ||
128 | if (likely(PageTail(page))) { | ||
128 | /* | 129 | /* |
129 | * page_head wasn't a dangling pointer but it | 130 | * __split_huge_page_refcount cannot race |
130 | * may not be a head page anymore by the time | 131 | * here. |
131 | * we obtain the lock. That is ok as long as it | ||
132 | * can't be freed from under us. | ||
133 | */ | 132 | */ |
134 | flags = compound_lock_irqsave(page_head); | 133 | VM_BUG_ON(!PageHead(page_head)); |
135 | if (unlikely(!PageTail(page))) { | 134 | VM_BUG_ON(page_mapcount(page) != 0); |
136 | /* __split_huge_page_refcount run before us */ | 135 | if (put_page_testzero(page_head)) { |
137 | compound_unlock_irqrestore(page_head, flags); | 136 | /* |
138 | skip_lock: | 137 | * If this is the tail of a slab |
139 | if (put_page_testzero(page_head)) { | 138 | * compound page, the tail pin must |
140 | /* | 139 | * not be the last reference held on |
141 | * The head page may have been | 140 | * the page, because the PG_slab |
142 | * freed and reallocated as a | 141 | * cannot be cleared before all tail |
143 | * compound page of smaller | 142 | * pins (which skips the _mapcount |
144 | * order and then freed again. | 143 | * tail refcounting) have been |
145 | * All we know is that it | 144 | * released. For hugetlbfs the tail |
146 | * cannot have become: a THP | 145 | * pin may be the last reference on |
147 | * page, a compound page of | 146 | * the page instead, because |
148 | * higher order, a tail page. | 147 | * PageHeadHuge will not go away until |
149 | * That is because we still | 148 | * the compound page enters the buddy |
150 | * hold the refcount of the | 149 | * allocator. |
151 | * split THP tail and | 150 | */ |
152 | * page_head was the THP head | 151 | VM_BUG_ON(PageSlab(page_head)); |
153 | * before the split. | 152 | __put_compound_page(page_head); |
154 | */ | ||
155 | if (PageHead(page_head)) | ||
156 | __put_compound_page(page_head); | ||
157 | else | ||
158 | __put_single_page(page_head); | ||
159 | } | ||
160 | out_put_single: | ||
161 | if (put_page_testzero(page)) | ||
162 | __put_single_page(page); | ||
163 | return; | ||
164 | } | 153 | } |
165 | VM_BUG_ON(page_head != page->first_page); | 154 | return; |
155 | } else | ||
166 | /* | 156 | /* |
167 | * We can release the refcount taken by | 157 | * __split_huge_page_refcount run before us, |
168 | * get_page_unless_zero() now that | 158 | * "page" was a THP tail. The split page_head |
169 | * __split_huge_page_refcount() is blocked on | 159 | * has been freed and reallocated as slab or |
170 | * the compound_lock. | 160 | * hugetlbfs page of smaller order (only |
161 | * possible if reallocated as slab on x86). | ||
171 | */ | 162 | */ |
172 | if (put_page_testzero(page_head)) | 163 | goto out_put_single; |
173 | VM_BUG_ON(1); | 164 | } |
174 | /* __split_huge_page_refcount will wait now */ | ||
175 | VM_BUG_ON(page_mapcount(page) <= 0); | ||
176 | atomic_dec(&page->_mapcount); | ||
177 | VM_BUG_ON(atomic_read(&page_head->_count) <= 0); | ||
178 | VM_BUG_ON(atomic_read(&page->_count) != 0); | ||
179 | compound_unlock_irqrestore(page_head, flags); | ||
180 | 165 | ||
166 | if (likely(page != page_head && get_page_unless_zero(page_head))) { | ||
167 | unsigned long flags; | ||
168 | |||
169 | /* | ||
170 | * page_head wasn't a dangling pointer but it may not | ||
171 | * be a head page anymore by the time we obtain the | ||
172 | * lock. That is ok as long as it can't be freed from | ||
173 | * under us. | ||
174 | */ | ||
175 | flags = compound_lock_irqsave(page_head); | ||
176 | if (unlikely(!PageTail(page))) { | ||
177 | /* __split_huge_page_refcount run before us */ | ||
178 | compound_unlock_irqrestore(page_head, flags); | ||
181 | if (put_page_testzero(page_head)) { | 179 | if (put_page_testzero(page_head)) { |
180 | /* | ||
181 | * The head page may have been freed | ||
182 | * and reallocated as a compound page | ||
183 | * of smaller order and then freed | ||
184 | * again. All we know is that it | ||
185 | * cannot have become: a THP page, a | ||
186 | * compound page of higher order, a | ||
187 | * tail page. That is because we | ||
188 | * still hold the refcount of the | ||
189 | * split THP tail and page_head was | ||
190 | * the THP head before the split. | ||
191 | */ | ||
182 | if (PageHead(page_head)) | 192 | if (PageHead(page_head)) |
183 | __put_compound_page(page_head); | 193 | __put_compound_page(page_head); |
184 | else | 194 | else |
185 | __put_single_page(page_head); | 195 | __put_single_page(page_head); |
186 | } | 196 | } |
187 | } else { | 197 | out_put_single: |
188 | /* page_head is a dangling pointer */ | 198 | if (put_page_testzero(page)) |
189 | VM_BUG_ON(PageTail(page)); | 199 | __put_single_page(page); |
190 | goto out_put_single; | 200 | return; |
191 | } | 201 | } |
192 | } else if (put_page_testzero(page)) { | 202 | VM_BUG_ON(page_head != page->first_page); |
193 | if (PageHead(page)) | 203 | /* |
194 | __put_compound_page(page); | 204 | * We can release the refcount taken by |
195 | else | 205 | * get_page_unless_zero() now that |
196 | __put_single_page(page); | 206 | * __split_huge_page_refcount() is blocked on the |
207 | * compound_lock. | ||
208 | */ | ||
209 | if (put_page_testzero(page_head)) | ||
210 | VM_BUG_ON(1); | ||
211 | /* __split_huge_page_refcount will wait now */ | ||
212 | VM_BUG_ON(page_mapcount(page) <= 0); | ||
213 | atomic_dec(&page->_mapcount); | ||
214 | VM_BUG_ON(atomic_read(&page_head->_count) <= 0); | ||
215 | VM_BUG_ON(atomic_read(&page->_count) != 0); | ||
216 | compound_unlock_irqrestore(page_head, flags); | ||
217 | |||
218 | if (put_page_testzero(page_head)) { | ||
219 | if (PageHead(page_head)) | ||
220 | __put_compound_page(page_head); | ||
221 | else | ||
222 | __put_single_page(page_head); | ||
223 | } | ||
224 | } else { | ||
225 | /* page_head is a dangling pointer */ | ||
226 | VM_BUG_ON(PageTail(page)); | ||
227 | goto out_put_single; | ||
197 | } | 228 | } |
198 | } | 229 | } |
199 | 230 | ||
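Most branches of the rewritten put_compound_page() end in the same primitive: drop one reference and free only if it was the last. A standalone model of that primitive using C11 atomics follows; it deliberately ignores the head/tail ordering issues the comments above deal with.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct model_page {
        atomic_int _count;
};

/* In the style of put_page_testzero(): true when we dropped the last ref. */
static int model_put_page_testzero(struct model_page *page)
{
        return atomic_fetch_sub(&page->_count, 1) == 1;
}

int main(void)
{
        struct model_page *page = malloc(sizeof(*page));

        if (!page)
                return 1;
        atomic_init(&page->_count, 2);

        if (model_put_page_testzero(page))
                free(page);             /* not reached: one reference remains */
        if (model_put_page_testzero(page))
                free(page);             /* last reference: the "__put_*_page" step */

        printf("page released on the second put\n");
        return 0;
}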
@@ -221,36 +252,37 @@ bool __get_page_tail(struct page *page) | |||
221 | * split_huge_page(). | 252 | * split_huge_page(). |
222 | */ | 253 | */ |
223 | unsigned long flags; | 254 | unsigned long flags; |
224 | bool got = false; | 255 | bool got; |
225 | struct page *page_head = compound_trans_head(page); | 256 | struct page *page_head = compound_trans_head(page); |
226 | 257 | ||
227 | if (likely(page != page_head && get_page_unless_zero(page_head))) { | 258 | /* Ref to put_compound_page() comment. */ |
228 | /* Ref to put_compound_page() comment. */ | 259 | if (!__compound_tail_refcounted(page_head)) { |
229 | if (PageSlab(page_head) || PageHeadHuge(page_head)) { | 260 | smp_rmb(); |
230 | if (likely(PageTail(page))) { | 261 | if (likely(PageTail(page))) { |
231 | /* | 262 | /* |
232 | * This is a hugetlbfs page or a slab | 263 | * This is a hugetlbfs page or a slab |
233 | * page. __split_huge_page_refcount | 264 | * page. __split_huge_page_refcount |
234 | * cannot race here. | 265 | * cannot race here. |
235 | */ | 266 | */ |
236 | VM_BUG_ON(!PageHead(page_head)); | 267 | VM_BUG_ON(!PageHead(page_head)); |
237 | __get_page_tail_foll(page, false); | 268 | __get_page_tail_foll(page, true); |
238 | return true; | 269 | return true; |
239 | } else { | 270 | } else { |
240 | /* | 271 | /* |
241 | * __split_huge_page_refcount run | 272 | * __split_huge_page_refcount run |
242 | * before us, "page" was a THP | 273 | * before us, "page" was a THP |
243 | * tail. The split page_head has been | 274 | * tail. The split page_head has been |
244 | * freed and reallocated as slab or | 275 | * freed and reallocated as slab or |
245 | * hugetlbfs page of smaller order | 276 | * hugetlbfs page of smaller order |
246 | * (only possible if reallocated as | 277 | * (only possible if reallocated as |
247 | * slab on x86). | 278 | * slab on x86). |
248 | */ | 279 | */ |
249 | put_page(page_head); | 280 | return false; |
250 | return false; | ||
251 | } | ||
252 | } | 281 | } |
282 | } | ||
253 | 283 | ||
284 | got = false; | ||
285 | if (likely(page != page_head && get_page_unless_zero(page_head))) { | ||
254 | /* | 286 | /* |
255 | * page_head wasn't a dangling pointer but it | 287 | * page_head wasn't a dangling pointer but it |
256 | * may not be a head page anymore by the time | 288 | * may not be a head page anymore by the time |
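This hunk relies on the same speculative-pin pattern: compound_trans_head() may hand back a page_head that is already on its way to being freed, so the code only trusts it once get_page_unless_zero() has succeeded, and even then, as the comment notes, page_head "may not be a head page anymore" by the time the reference is taken. A minimal user-space model of that conditional pin (illustration only; the name is made up, and the kernel version works on page->_count):

#include <stdatomic.h>
#include <stdbool.h>

/*
 * User-space model of get_page_unless_zero(): take a reference only
 * if the count is still non-zero, i.e. the object is not already
 * being freed.  Illustration only.
 */
static bool get_unless_zero_model(atomic_int *count)
{
	int old = atomic_load_explicit(count, memory_order_relaxed);

	while (old != 0) {
		/* On failure the CAS refreshes 'old' with the current value. */
		if (atomic_compare_exchange_weak_explicit(count, &old, old + 1,
							  memory_order_acquire,
							  memory_order_relaxed))
			return true;
	}
	return false;	/* count was (or became) zero: the pointer may be stale */
}

A successful pin only guarantees that page_head cannot be freed from under us; whether page is still one of its tails is exactly what the rest of the function has to re-verify.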
diff --git a/mm/util.c b/mm/util.c | |||
@@ -404,13 +404,45 @@ struct address_space *page_mapping(struct page *page) | |||
404 | return mapping; | 404 | return mapping; |
405 | } | 405 | } |
406 | 406 | ||
407 | int overcommit_ratio_handler(struct ctl_table *table, int write, | ||
408 | void __user *buffer, size_t *lenp, | ||
409 | loff_t *ppos) | ||
410 | { | ||
411 | int ret; | ||
412 | |||
413 | ret = proc_dointvec(table, write, buffer, lenp, ppos); | ||
414 | if (ret == 0 && write) | ||
415 | sysctl_overcommit_kbytes = 0; | ||
416 | return ret; | ||
417 | } | ||
418 | |||
419 | int overcommit_kbytes_handler(struct ctl_table *table, int write, | ||
420 | void __user *buffer, size_t *lenp, | ||
421 | loff_t *ppos) | ||
422 | { | ||
423 | int ret; | ||
424 | |||
425 | ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); | ||
426 | if (ret == 0 && write) | ||
427 | sysctl_overcommit_ratio = 0; | ||
428 | return ret; | ||
429 | } | ||
430 | |||
407 | /* | 431 | /* |
408 | * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used | 432 | * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used |
409 | */ | 433 | */ |
410 | unsigned long vm_commit_limit(void) | 434 | unsigned long vm_commit_limit(void) |
411 | { | 435 | { |
412 | return ((totalram_pages - hugetlb_total_pages()) | 436 | unsigned long allowed; |
413 | * sysctl_overcommit_ratio / 100) + total_swap_pages; | 437 | |
438 | if (sysctl_overcommit_kbytes) | ||
439 | allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10); | ||
440 | else | ||
441 | allowed = ((totalram_pages - hugetlb_total_pages()) | ||
442 | * sysctl_overcommit_ratio / 100); | ||
443 | allowed += total_swap_pages; | ||
444 | |||
445 | return allowed; | ||
414 | } | 446 | } |
415 | 447 | ||
416 | 448 | ||
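The two sysctl handlers above make vm.overcommit_ratio and the new vm.overcommit_kbytes mutually exclusive (writing either one resets the other to zero), and vm_commit_limit() then prefers the absolute kbytes value whenever it is set. A stand-alone illustration of the resulting arithmetic (not kernel code; the helper name, the 4 KiB page size and the sample values are assumptions for the example only):

#include <stdio.h>

#define PAGE_SHIFT 12	/* assume 4 KiB pages */

/* Mirrors the arithmetic of the new vm_commit_limit(), in pages. */
static unsigned long commit_limit(unsigned long totalram_pages,
				  unsigned long hugetlb_pages,
				  unsigned long total_swap_pages,
				  unsigned long overcommit_ratio,
				  unsigned long overcommit_kbytes)
{
	unsigned long allowed;

	if (overcommit_kbytes)
		allowed = overcommit_kbytes >> (PAGE_SHIFT - 10);
	else
		allowed = (totalram_pages - hugetlb_pages)
			  * overcommit_ratio / 100;

	return allowed + total_swap_pages;
}

int main(void)
{
	/* 4 GiB of RAM, no hugetlb pages, 2 GiB of swap, ratio 50 */
	printf("ratio-based limit:  %lu pages\n",
	       commit_limit(1048576, 0, 524288, 50, 0));
	/* same machine with overcommit_kbytes set to 1 GiB (1048576 KiB) */
	printf("kbytes-based limit: %lu pages\n",
	       commit_limit(1048576, 0, 524288, 50, 1048576));
	return 0;
}

With 4 KiB pages the kbytes value is converted by shifting right by PAGE_SHIFT - 10, i.e. dividing the KiB figure by 4, before swap pages are added on top; the first call prints 1048576 pages, the second 786432.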
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 0fdf96803c5b..e4f0db2a3eae 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -220,12 +220,12 @@ int is_vmalloc_or_module_addr(const void *x) | |||
220 | } | 220 | } |
221 | 221 | ||
222 | /* | 222 | /* |
223 | * Walk a vmap address to the struct page it maps. | 223 | * Walk a vmap address to the physical pfn it maps to. |
224 | */ | 224 | */ |
225 | struct page *vmalloc_to_page(const void *vmalloc_addr) | 225 | unsigned long vmalloc_to_pfn(const void *vmalloc_addr) |
226 | { | 226 | { |
227 | unsigned long addr = (unsigned long) vmalloc_addr; | 227 | unsigned long addr = (unsigned long) vmalloc_addr; |
228 | struct page *page = NULL; | 228 | unsigned long pfn = 0; |
229 | pgd_t *pgd = pgd_offset_k(addr); | 229 | pgd_t *pgd = pgd_offset_k(addr); |
230 | 230 | ||
231 | /* | 231 | /* |
@@ -244,23 +244,23 @@ struct page *vmalloc_to_page(const void *vmalloc_addr) | |||
244 | ptep = pte_offset_map(pmd, addr); | 244 | ptep = pte_offset_map(pmd, addr); |
245 | pte = *ptep; | 245 | pte = *ptep; |
246 | if (pte_present(pte)) | 246 | if (pte_present(pte)) |
247 | page = pte_page(pte); | 247 | pfn = pte_pfn(pte); |
248 | pte_unmap(ptep); | 248 | pte_unmap(ptep); |
249 | } | 249 | } |
250 | } | 250 | } |
251 | } | 251 | } |
252 | return page; | 252 | return pfn; |
253 | } | 253 | } |
254 | EXPORT_SYMBOL(vmalloc_to_page); | 254 | EXPORT_SYMBOL(vmalloc_to_pfn); |
255 | 255 | ||
256 | /* | 256 | /* |
257 | * Map a vmalloc()-space virtual address to the physical page frame number. | 257 | * Map a vmalloc()-space virtual address to the struct page. |
258 | */ | 258 | */ |
259 | unsigned long vmalloc_to_pfn(const void *vmalloc_addr) | 259 | struct page *vmalloc_to_page(const void *vmalloc_addr) |
260 | { | 260 | { |
261 | return page_to_pfn(vmalloc_to_page(vmalloc_addr)); | 261 | return pfn_to_page(vmalloc_to_pfn(vmalloc_addr)); |
262 | } | 262 | } |
263 | EXPORT_SYMBOL(vmalloc_to_pfn); | 263 | EXPORT_SYMBOL(vmalloc_to_page); |
264 | 264 | ||
265 | 265 | ||
266 | /*** Global kva allocator ***/ | 266 | /*** Global kva allocator ***/ |
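After the interchange, vmalloc_to_pfn() performs the page-table walk and vmalloc_to_page() becomes a pfn_to_page() wrapper around it, with the externally visible behaviour of both exports unchanged. A usage sketch of the two helpers (not part of the patch; the demo function and module are made up for illustration), walking a vmalloc()ed buffer page by page, since each page of a vmalloc area may sit in a different physical frame:

#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>

/* Hypothetical test-module init: check that the two helpers agree. */
static int __init vmalloc_pfn_demo_init(void)
{
	unsigned int i;
	void *buf = vmalloc(4 * PAGE_SIZE);

	if (!buf)
		return -ENOMEM;

	for (i = 0; i < 4; i++) {
		void *addr = buf + i * PAGE_SIZE;
		struct page *page = vmalloc_to_page(addr);
		unsigned long pfn = vmalloc_to_pfn(addr);

		/* Both helpers must name the same backing frame. */
		WARN_ON(page_to_pfn(page) != pfn);
		pr_info("vmalloc addr %p -> pfn %lu\n", addr, pfn);
	}

	vfree(buf);
	return 0;
}
module_init(vmalloc_pfn_demo_init);
MODULE_LICENSE("GPL");

Callers that only need the frame number now get it straight from the pte via pte_pfn(), without going through struct page first.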