Diffstat (limited to 'mm/huge_memory.c')

 -rw-r--r--  mm/huge_memory.c | 94
 1 file changed, 45 insertions, 49 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 55478ab3c83b..5da55b38b1b7 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -629,40 +629,30 @@ release:
  *         available
  * never: never stall for any thp allocation
  */
-static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma, unsigned long addr)
+static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
 {
         const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
-        gfp_t this_node = 0;
-
-#ifdef CONFIG_NUMA
-        struct mempolicy *pol;
-        /*
-         * __GFP_THISNODE is used only when __GFP_DIRECT_RECLAIM is not
-         * specified, to express a general desire to stay on the current
-         * node for optimistic allocation attempts. If the defrag mode
-         * and/or madvise hint requires the direct reclaim then we prefer
-         * to fallback to other node rather than node reclaim because that
-         * can lead to excessive reclaim even though there is free memory
-         * on other nodes. We expect that NUMA preferences are specified
-         * by memory policies.
-         */
-        pol = get_vma_policy(vma, addr);
-        if (pol->mode != MPOL_BIND)
-                this_node = __GFP_THISNODE;
-        mpol_cond_put(pol);
-#endif
 
+        /* Always do synchronous compaction */
         if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
                 return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
+
+        /* Kick kcompactd and fail quickly */
         if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
-                return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM | this_node;
+                return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
+
+        /* Synchronous compaction if madvised, otherwise kick kcompactd */
         if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
-                return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
-                                                             __GFP_KSWAPD_RECLAIM | this_node);
+                return GFP_TRANSHUGE_LIGHT |
+                        (vma_madvised ? __GFP_DIRECT_RECLAIM :
+                                        __GFP_KSWAPD_RECLAIM);
+
+        /* Only do synchronous compaction if madvised */
         if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
-                return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
-                                                             this_node);
-        return GFP_TRANSHUGE_LIGHT | this_node;
+                return GFP_TRANSHUGE_LIGHT |
+                        (vma_madvised ? __GFP_DIRECT_RECLAIM : 0);
+
+        return GFP_TRANSHUGE_LIGHT;
 }
 
 /* Caller must hold page table lock. */
@@ -734,8 +724,8 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
                         pte_free(vma->vm_mm, pgtable);
                 return ret;
         }
-        gfp = alloc_hugepage_direct_gfpmask(vma, haddr);
-        page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, vma, haddr, numa_node_id());
+        gfp = alloc_hugepage_direct_gfpmask(vma);
+        page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
         if (unlikely(!page)) {
                 count_vm_event(THP_FAULT_FALLBACK);
                 return VM_FAULT_FALLBACK;
@@ -1305,9 +1295,8 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
 alloc:
         if (transparent_hugepage_enabled(vma) &&
             !transparent_hugepage_debug_cow()) {
-                huge_gfp = alloc_hugepage_direct_gfpmask(vma, haddr);
-                new_page = alloc_pages_vma(huge_gfp, HPAGE_PMD_ORDER, vma,
-                                haddr, numa_node_id());
+                huge_gfp = alloc_hugepage_direct_gfpmask(vma);
+                new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
         } else
                 new_page = NULL;
 
@@ -2350,7 +2339,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
         }
 }
 
-static void freeze_page(struct page *page)
+static void unmap_page(struct page *page)
 {
         enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS |
                 TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD;
@@ -2365,7 +2354,7 @@ static void freeze_page(struct page *page)
         VM_BUG_ON_PAGE(!unmap_success, page);
 }
 
-static void unfreeze_page(struct page *page)
+static void remap_page(struct page *page)
 {
         int i;
         if (PageTransHuge(page)) {
@@ -2402,6 +2391,12 @@ static void __split_huge_page_tail(struct page *head, int tail,
                          (1L << PG_unevictable) |
                          (1L << PG_dirty)));
 
+        /* ->mapping in first tail page is compound_mapcount */
+        VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
+                        page_tail);
+        page_tail->mapping = head->mapping;
+        page_tail->index = head->index + tail;
+
         /* Page flags must be visible before we make the page non-compound. */
         smp_wmb();
 
@@ -2422,12 +2417,6 @@ static void __split_huge_page_tail(struct page *head, int tail,
         if (page_is_idle(head))
                 set_page_idle(page_tail);
 
-        /* ->mapping in first tail page is compound_mapcount */
-        VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
-                        page_tail);
-        page_tail->mapping = head->mapping;
-
-        page_tail->index = head->index + tail;
         page_cpupid_xchg_last(page_tail, page_cpupid_last(head));
 
         /*
@@ -2439,12 +2428,11 @@ static void __split_huge_page_tail(struct page *head, int tail,
 }
 
 static void __split_huge_page(struct page *page, struct list_head *list,
-                unsigned long flags)
+                pgoff_t end, unsigned long flags)
 {
         struct page *head = compound_head(page);
         struct zone *zone = page_zone(head);
         struct lruvec *lruvec;
-        pgoff_t end = -1;
         int i;
 
         lruvec = mem_cgroup_page_lruvec(head, zone->zone_pgdat);
@@ -2452,9 +2440,6 @@ static void __split_huge_page(struct page *page, struct list_head *list,
         /* complete memcg works before add pages to LRU */
         mem_cgroup_split_huge_fixup(head);
 
-        if (!PageAnon(page))
-                end = DIV_ROUND_UP(i_size_read(head->mapping->host), PAGE_SIZE);
-
         for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
                 __split_huge_page_tail(head, i, lruvec, list);
                 /* Some pages can be beyond i_size: drop them from page cache */
@@ -2483,7 +2468,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
 
         spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);
 
-        unfreeze_page(head);
+        remap_page(head);
 
         for (i = 0; i < HPAGE_PMD_NR; i++) {
                 struct page *subpage = head + i;
@@ -2626,6 +2611,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
         int count, mapcount, extra_pins, ret;
         bool mlocked;
         unsigned long flags;
+        pgoff_t end;
 
         VM_BUG_ON_PAGE(is_huge_zero_page(page), page);
         VM_BUG_ON_PAGE(!PageLocked(page), page);
@@ -2648,6 +2634,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
                         ret = -EBUSY;
                         goto out;
                 }
+                end = -1;
                 mapping = NULL;
                 anon_vma_lock_write(anon_vma);
         } else {
@@ -2661,10 +2648,19 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 
                 anon_vma = NULL;
                 i_mmap_lock_read(mapping);
+
+                /*
+                 *__split_huge_page() may need to trim off pages beyond EOF:
+                 * but on 32-bit, i_size_read() takes an irq-unsafe seqlock,
+                 * which cannot be nested inside the page tree lock. So note
+                 * end now: i_size itself may be changed at any moment, but
+                 * head page lock is good enough to serialize the trimming.
+                 */
+                end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
         }
 
         /*
-         * Racy check if we can split the page, before freeze_page() will
+         * Racy check if we can split the page, before unmap_page() will
          * split PMDs
          */
         if (!can_split_huge_page(head, &extra_pins)) {
@@ -2673,7 +2669,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
         }
 
         mlocked = PageMlocked(page);
-        freeze_page(head);
+        unmap_page(head);
         VM_BUG_ON_PAGE(compound_mapcount(head), head);
 
         /* Make sure the page is not on per-CPU pagevec as it takes pin */
@@ -2707,7 +2703,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
                 if (mapping)
                         __dec_node_page_state(page, NR_SHMEM_THPS);
                 spin_unlock(&pgdata->split_queue_lock);
-                __split_huge_page(page, list, flags);
+                __split_huge_page(page, list, end, flags);
                 if (PageSwapCache(head)) {
                         swp_entry_t entry = { .val = page_private(head) };
 
@@ -2727,7 +2723,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 fail:   if (mapping)
                 xa_unlock(&mapping->i_pages);
         spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);
-        unfreeze_page(head);
+        remap_page(head);
         ret = -EBUSY;
         }
 
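The first three hunks drop the mempolicy/__GFP_THISNODE logic, so alloc_hugepage_direct_gfpmask() now derives the gfp mask purely from the global defrag setting plus whether the VMA was madvised, and the fault paths go back to alloc_hugepage_vma(). For context, a minimal userspace sketch of how a mapping ends up with VM_HUGEPAGE set, which is what the vma_madvised test keys on; the 16 MB size and the smaps hint are illustrative choices, not part of the patch:

/* Illustrative only: madvise an anonymous mapping so its VMA gets VM_HUGEPAGE. */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
        size_t len = 16UL << 20;                /* 16 MB: room for several PMD-sized pages */
        void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED) {
                perror("mmap");
                return 1;
        }
        if (madvise(p, len, MADV_HUGEPAGE))     /* sets VM_HUGEPAGE on the VMA */
                perror("madvise(MADV_HUGEPAGE)");

        memset(p, 1, len);                      /* fault the range; THP faults may then use the madvised gfp mask */
        puts("see AnonHugePages in /proc/self/smaps for the result");
        munmap(p, len);
        return 0;
}

With defrag set to madvise or defer+madvise, only ranges madvised like this take the __GFP_DIRECT_RECLAIM path; everything else gets at most kswapd/kcompactd assistance.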
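The two __split_huge_page_tail() hunks move the ->mapping/->index setup from after to before the smp_wmb() that orders the tail page's fields ahead of making it non-compound, so by the time the tail page becomes individually visible its mapping and index are already valid. A rough userspace analogue of that initialize-then-publish pattern, with invented names (fake_tail, publish_tail, try_read_tail) standing in for the page fields:

/* Sketch of initialize-then-publish ordering; not kernel code. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_tail {
        void *mapping;                  /* stands in for page->mapping */
        unsigned long index;            /* stands in for page->index */
        atomic_bool published;          /* stands in for "no longer compound/frozen" */
};

static void publish_tail(struct fake_tail *t, void *mapping, unsigned long index)
{
        t->mapping = mapping;           /* fill in everything a reader may inspect ... */
        t->index = index;
        atomic_thread_fence(memory_order_release);      /* ... before publishing (smp_wmb() in the kernel) */
        atomic_store_explicit(&t->published, true, memory_order_relaxed);
}

static int try_read_tail(struct fake_tail *t, void **mapping, unsigned long *index)
{
        if (!atomic_load_explicit(&t->published, memory_order_relaxed))
                return 0;
        atomic_thread_fence(memory_order_acquire);      /* pairs with the release fence */
        *mapping = t->mapping;
        *index = t->index;
        return 1;
}

int main(void)
{
        struct fake_tail t = { .published = false };
        void *m;
        unsigned long idx;

        publish_tail(&t, (void *)0x1234, 7);
        if (try_read_tail(&t, &m, &idx))
                printf("mapping=%p index=%lu\n", m, idx);
        return 0;
}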
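The hunks that add pgoff_t end thread the EOF cut-off from split_huge_page_to_list() into __split_huge_page() instead of reading i_size under the page-tree lock: per the new comment, i_size_read() takes an irq-unsafe seqlock on 32-bit, so the value is sampled earlier, before xa_lock_irq(&mapping->i_pages) is taken, and tail pages with index >= end are then dropped from the page cache during the split. A standalone sketch of just that arithmetic, with invented sample sizes:

/* Sketch of the EOF cut-off computed by split_huge_page_to_list(); sizes are made up. */
#include <stdio.h>

#define PAGE_SIZE       4096UL
#define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))

int main(void)
{
        unsigned long i_sizes[] = { 0, 1, PAGE_SIZE, PAGE_SIZE + 1, 10 * PAGE_SIZE };

        for (unsigned int i = 0; i < sizeof(i_sizes) / sizeof(i_sizes[0]); i++) {
                unsigned long end = DIV_ROUND_UP(i_sizes[i], PAGE_SIZE);

                /* Subpages with page->index >= end lie beyond EOF and are not kept. */
                printf("i_size=%5lu  ->  end=%2lu (first index beyond EOF)\n",
                       i_sizes[i], end);
        }
        return 0;
}

For anonymous THPs the new end = -1 preserves the old behaviour: as a pgoff_t it is the maximum value, so no subpage is ever treated as beyond EOF.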
