Diffstat (limited to 'mm/huge_memory.c')
 -rw-r--r--  mm/huge_memory.c | 94
 1 file changed, 45 insertions(+), 49 deletions(-)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 55478ab3c83b..5da55b38b1b7 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -629,40 +629,30 @@ release:
  *	    available
  * never: never stall for any thp allocation
  */
-static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma, unsigned long addr)
+static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
 {
 	const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
-	gfp_t this_node = 0;
-
-#ifdef CONFIG_NUMA
-	struct mempolicy *pol;
-	/*
-	 * __GFP_THISNODE is used only when __GFP_DIRECT_RECLAIM is not
-	 * specified, to express a general desire to stay on the current
-	 * node for optimistic allocation attempts. If the defrag mode
-	 * and/or madvise hint requires the direct reclaim then we prefer
-	 * to fallback to other node rather than node reclaim because that
-	 * can lead to excessive reclaim even though there is free memory
-	 * on other nodes. We expect that NUMA preferences are specified
-	 * by memory policies.
-	 */
-	pol = get_vma_policy(vma, addr);
-	if (pol->mode != MPOL_BIND)
-		this_node = __GFP_THISNODE;
-	mpol_cond_put(pol);
-#endif
 
+	/* Always do synchronous compaction */
 	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
 		return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
+
+	/* Kick kcompactd and fail quickly */
 	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
-		return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM | this_node;
+		return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
+
+	/* Synchronous compaction if madvised, otherwise kick kcompactd */
 	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
-		return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
-							     __GFP_KSWAPD_RECLAIM | this_node);
+		return GFP_TRANSHUGE_LIGHT |
+			(vma_madvised ? __GFP_DIRECT_RECLAIM :
+					__GFP_KSWAPD_RECLAIM);
+
+	/* Only do synchronous compaction if madvised */
 	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
-		return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
-							     this_node);
-	return GFP_TRANSHUGE_LIGHT | this_node;
+		return GFP_TRANSHUGE_LIGHT |
+			(vma_madvised ? __GFP_DIRECT_RECLAIM : 0);
+
+	return GFP_TRANSHUGE_LIGHT;
 }
 
 /* Caller must hold page table lock. */
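For reference, the decision table implemented by the four test_bit() checks in the hunk above can be summarised in a minimal user-space sketch. The enum values and the thp_reclaim() helper below are illustrative stand-ins, not kernel identifiers; only the mapping from sysfs defrag mode plus MADV_HUGEPAGE to reclaim behaviour is taken from the hunk.

/* Illustrative user-space sketch of the defrag decision table above.
 * The enum and helper are hypothetical; they only mirror the behaviour
 * selected by alloc_hugepage_direct_gfpmask() for each sysfs mode.
 */
#include <stdbool.h>
#include <stdio.h>

enum defrag_mode {
	DEFRAG_ALWAYS,		/* TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG */
	DEFRAG_DEFER,		/* ..._DEFRAG_KSWAPD_FLAG */
	DEFRAG_DEFER_MADVISE,	/* ..._DEFRAG_KSWAPD_OR_MADV_FLAG */
	DEFRAG_MADVISE,		/* ..._DEFRAG_REQ_MADV_FLAG */
	DEFRAG_NEVER,
};

static const char *thp_reclaim(enum defrag_mode mode, bool vma_madvised)
{
	switch (mode) {
	case DEFRAG_ALWAYS:	/* GFP_TRANSHUGE, __GFP_NORETRY unless madvised */
		return "synchronous compaction";
	case DEFRAG_DEFER:	/* GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM */
		return "kick kcompactd, fail quickly";
	case DEFRAG_DEFER_MADVISE:
		return vma_madvised ? "synchronous compaction"
				    : "kick kcompactd, fail quickly";
	case DEFRAG_MADVISE:
		return vma_madvised ? "synchronous compaction"
				    : "no reclaim, fail quickly";
	default:		/* never: GFP_TRANSHUGE_LIGHT only */
		return "no reclaim, fail quickly";
	}
}

int main(void)
{
	printf("defer+madvise, MADV_HUGEPAGE vma: %s\n",
	       thp_reclaim(DEFRAG_DEFER_MADVISE, true));
	printf("defer+madvise, plain vma:         %s\n",
	       thp_reclaim(DEFRAG_DEFER_MADVISE, false));
	return 0;
}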
@@ -734,8 +724,8 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
 		pte_free(vma->vm_mm, pgtable);
 		return ret;
 	}
-	gfp = alloc_hugepage_direct_gfpmask(vma, haddr);
-	page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, vma, haddr, numa_node_id());
+	gfp = alloc_hugepage_direct_gfpmask(vma);
+	page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
 	if (unlikely(!page)) {
 		count_vm_event(THP_FAULT_FALLBACK);
 		return VM_FAULT_FALLBACK;
@@ -1305,9 +1295,8 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
 alloc:
 	if (transparent_hugepage_enabled(vma) &&
 	    !transparent_hugepage_debug_cow()) {
-		huge_gfp = alloc_hugepage_direct_gfpmask(vma, haddr);
-		new_page = alloc_pages_vma(huge_gfp, HPAGE_PMD_ORDER, vma,
-				haddr, numa_node_id());
+		huge_gfp = alloc_hugepage_direct_gfpmask(vma);
+		new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
 	} else
 		new_page = NULL;
 
@@ -2350,7 +2339,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
 	}
 }
 
-static void freeze_page(struct page *page)
+static void unmap_page(struct page *page)
 {
 	enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS |
 		TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD;
@@ -2365,7 +2354,7 @@ static void freeze_page(struct page *page)
 	VM_BUG_ON_PAGE(!unmap_success, page);
 }
 
-static void unfreeze_page(struct page *page)
+static void remap_page(struct page *page)
 {
 	int i;
 	if (PageTransHuge(page)) {
@@ -2402,6 +2391,12 @@ static void __split_huge_page_tail(struct page *head, int tail,
 			 (1L << PG_unevictable) |
 			 (1L << PG_dirty)));
 
+	/* ->mapping in first tail page is compound_mapcount */
+	VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
+			page_tail);
+	page_tail->mapping = head->mapping;
+	page_tail->index = head->index + tail;
+
 	/* Page flags must be visible before we make the page non-compound. */
 	smp_wmb();
 
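The point of this hunk (and of the removal in the next one) is ordering: a tail page's ->mapping and ->index must already be valid by the time the barrier lets other CPUs treat it as an independent, non-compound page. A user-space analogy of that publish pattern, using a C11 release/acquire pair instead of the kernel's smp_wmb(), might look like the sketch below; the struct and field names are invented for illustration.

/* User-space analogy only (not kernel code): initialise an object's
 * fields before the release store that makes it visible to readers,
 * mirroring how ->mapping and ->index are now set before the barrier
 * that exposes the tail page. Names here are invented for illustration.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct tail { long mapping; long index; };

static struct tail tail_page;
static atomic_int published;	/* stands in for "tail page is visible" */

static void *split_thread(void *arg)
{
	(void)arg;
	tail_page.mapping = 42;		/* set the fields first ...     */
	tail_page.index = 7;
	atomic_store_explicit(&published, 1, memory_order_release);
	return NULL;			/* ... then publish the object  */
}

int main(void)
{
	pthread_t th;

	pthread_create(&th, NULL, split_thread, NULL);
	while (!atomic_load_explicit(&published, memory_order_acquire))
		;			/* readers wait for the publish */
	printf("mapping=%ld index=%ld\n", tail_page.mapping, tail_page.index);
	pthread_join(th, NULL);
	return 0;
}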
@@ -2422,12 +2417,6 @@ static void __split_huge_page_tail(struct page *head, int tail,
 	if (page_is_idle(head))
 		set_page_idle(page_tail);
 
-	/* ->mapping in first tail page is compound_mapcount */
-	VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
-			page_tail);
-	page_tail->mapping = head->mapping;
-
-	page_tail->index = head->index + tail;
 	page_cpupid_xchg_last(page_tail, page_cpupid_last(head));
 
 	/*
@@ -2439,12 +2428,11 @@ static void __split_huge_page_tail(struct page *head, int tail,
 }
 
 static void __split_huge_page(struct page *page, struct list_head *list,
-		unsigned long flags)
+		pgoff_t end, unsigned long flags)
 {
 	struct page *head = compound_head(page);
 	struct zone *zone = page_zone(head);
 	struct lruvec *lruvec;
-	pgoff_t end = -1;
 	int i;
 
 	lruvec = mem_cgroup_page_lruvec(head, zone->zone_pgdat);
@@ -2452,9 +2440,6 @@ static void __split_huge_page(struct page *page, struct list_head *list,
 	/* complete memcg works before add pages to LRU */
 	mem_cgroup_split_huge_fixup(head);
 
-	if (!PageAnon(page))
-		end = DIV_ROUND_UP(i_size_read(head->mapping->host), PAGE_SIZE);
-
 	for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
 		__split_huge_page_tail(head, i, lruvec, list);
 		/* Some pages can be beyond i_size: drop them from page cache */
@@ -2483,7 +2468,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
 
 	spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);
 
-	unfreeze_page(head);
+	remap_page(head);
 
 	for (i = 0; i < HPAGE_PMD_NR; i++) {
 		struct page *subpage = head + i;
@@ -2626,6 +2611,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 	int count, mapcount, extra_pins, ret;
 	bool mlocked;
 	unsigned long flags;
+	pgoff_t end;
 
 	VM_BUG_ON_PAGE(is_huge_zero_page(page), page);
 	VM_BUG_ON_PAGE(!PageLocked(page), page);
@@ -2648,6 +2634,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 			ret = -EBUSY;
 			goto out;
 		}
+		end = -1;
 		mapping = NULL;
 		anon_vma_lock_write(anon_vma);
 	} else {
@@ -2661,10 +2648,19 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 
 		anon_vma = NULL;
 		i_mmap_lock_read(mapping);
+
+		/*
+		 *__split_huge_page() may need to trim off pages beyond EOF:
+		 * but on 32-bit, i_size_read() takes an irq-unsafe seqlock,
+		 * which cannot be nested inside the page tree lock. So note
+		 * end now: i_size itself may be changed at any moment, but
+		 * head page lock is good enough to serialize the trimming.
+		 */
+		end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
 	}
 
 	/*
-	 * Racy check if we can split the page, before freeze_page() will
+	 * Racy check if we can split the page, before unmap_page() will
 	 * split PMDs
 	 */
 	if (!can_split_huge_page(head, &extra_pins)) {
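The end value noted in the hunk above is simply i_size rounded up to whole pages; pages whose index is at or beyond it lie entirely past EOF and are the ones __split_huge_page() drops from the page cache during the split. A stand-alone check of that arithmetic, assuming a 4 KiB PAGE_SIZE and an arbitrary example file size, could look like this:

/* Stand-alone check of the EOF rounding used for 'end' above; assumes a
 * 4 KiB PAGE_SIZE and an arbitrary example i_size. DIV_ROUND_UP mirrors
 * the kernel macro of the same name.
 */
#include <stdio.h>

#define PAGE_SIZE	4096UL
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	unsigned long i_size = 5 * 1024 * 1024 + 1;	/* 5 MiB + 1 byte */
	unsigned long end = DIV_ROUND_UP(i_size, PAGE_SIZE);

	/* Page index 1280 still holds the final byte; index 1281 and up
	 * are entirely beyond EOF and get dropped during the split. */
	printf("end = %lu\n", end);			/* prints 1281 */
	return 0;
}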
@@ -2673,7 +2669,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 	}
 
 	mlocked = PageMlocked(page);
-	freeze_page(head);
+	unmap_page(head);
 	VM_BUG_ON_PAGE(compound_mapcount(head), head);
 
 	/* Make sure the page is not on per-CPU pagevec as it takes pin */
@@ -2707,7 +2703,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 		if (mapping)
 			__dec_node_page_state(page, NR_SHMEM_THPS);
 		spin_unlock(&pgdata->split_queue_lock);
-		__split_huge_page(page, list, flags);
+		__split_huge_page(page, list, end, flags);
 		if (PageSwapCache(head)) {
 			swp_entry_t entry = { .val = page_private(head) };
 
@@ -2727,7 +2723,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 fail:		if (mapping)
 			xa_unlock(&mapping->i_pages);
 		spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);
-		unfreeze_page(head);
+		remap_page(head);
 		ret = -EBUSY;
 	}
 