Diffstat (limited to 'mm/mempolicy.c')
-rw-r--r--	mm/mempolicy.c	277
1 file changed, 128 insertions(+), 149 deletions(-)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 0e0961b8c39c..f1bd23803576 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -471,24 +471,34 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 				unsigned long flags);
 
+struct queue_pages {
+	struct list_head *pagelist;
+	unsigned long flags;
+	nodemask_t *nmask;
+	struct vm_area_struct *prev;
+};
+
 /*
  * Scan through pages checking if pages follow certain conditions,
  * and move them to the pagelist if they do.
  */
-static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
-		unsigned long addr, unsigned long end,
-		const nodemask_t *nodes, unsigned long flags,
-		void *private)
+static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
+			unsigned long end, struct mm_walk *walk)
 {
-	pte_t *orig_pte;
+	struct vm_area_struct *vma = walk->vma;
+	struct page *page;
+	struct queue_pages *qp = walk->private;
+	unsigned long flags = qp->flags;
+	int nid;
 	pte_t *pte;
 	spinlock_t *ptl;
 
-	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
-	do {
-		struct page *page;
-		int nid;
-
+	split_huge_page_pmd(vma, addr, pmd);
+	if (pmd_trans_unstable(pmd))
+		return 0;
+
+	pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+	for (; addr != end; pte++, addr += PAGE_SIZE) {
 		if (!pte_present(*pte))
 			continue;
 		page = vm_normal_page(vma, addr, *pte);
@@ -501,114 +511,46 @@ static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 		if (PageReserved(page))
 			continue;
 		nid = page_to_nid(page);
-		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
+		if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
 			continue;
 
 		if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
-			migrate_page_add(page, private, flags);
-		else
-			break;
-	} while (pte++, addr += PAGE_SIZE, addr != end);
-	pte_unmap_unlock(orig_pte, ptl);
-	return addr != end;
+			migrate_page_add(page, qp->pagelist, flags);
+	}
+	pte_unmap_unlock(pte - 1, ptl);
+	cond_resched();
+	return 0;
 }
 
-static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma,
-		pmd_t *pmd, const nodemask_t *nodes, unsigned long flags,
-				    void *private)
+static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
+			       unsigned long addr, unsigned long end,
+			       struct mm_walk *walk)
 {
 #ifdef CONFIG_HUGETLB_PAGE
+	struct queue_pages *qp = walk->private;
+	unsigned long flags = qp->flags;
 	int nid;
 	struct page *page;
 	spinlock_t *ptl;
 	pte_t entry;
 
-	ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd);
-	entry = huge_ptep_get((pte_t *)pmd);
+	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
+	entry = huge_ptep_get(pte);
 	if (!pte_present(entry))
 		goto unlock;
 	page = pte_page(entry);
 	nid = page_to_nid(page);
-	if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
+	if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
 		goto unlock;
 	/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
 	if (flags & (MPOL_MF_MOVE_ALL) ||
 	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
-		isolate_huge_page(page, private);
+		isolate_huge_page(page, qp->pagelist);
 unlock:
 	spin_unlock(ptl);
 #else
 	BUG();
 #endif
-}
-
-static inline int queue_pages_pmd_range(struct vm_area_struct *vma, pud_t *pud,
-		unsigned long addr, unsigned long end,
-		const nodemask_t *nodes, unsigned long flags,
-		void *private)
-{
-	pmd_t *pmd;
-	unsigned long next;
-
-	pmd = pmd_offset(pud, addr);
-	do {
-		next = pmd_addr_end(addr, end);
-		if (!pmd_present(*pmd))
-			continue;
-		if (pmd_huge(*pmd) && is_vm_hugetlb_page(vma)) {
-			queue_pages_hugetlb_pmd_range(vma, pmd, nodes,
-						flags, private);
-			continue;
-		}
-		split_huge_page_pmd(vma, addr, pmd);
-		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
-			continue;
-		if (queue_pages_pte_range(vma, pmd, addr, next, nodes,
-				    flags, private))
-			return -EIO;
-	} while (pmd++, addr = next, addr != end);
-	return 0;
-}
-
-static inline int queue_pages_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
-		unsigned long addr, unsigned long end,
-		const nodemask_t *nodes, unsigned long flags,
-		void *private)
-{
-	pud_t *pud;
-	unsigned long next;
-
-	pud = pud_offset(pgd, addr);
-	do {
-		next = pud_addr_end(addr, end);
-		if (pud_huge(*pud) && is_vm_hugetlb_page(vma))
-			continue;
-		if (pud_none_or_clear_bad(pud))
-			continue;
-		if (queue_pages_pmd_range(vma, pud, addr, next, nodes,
-				    flags, private))
-			return -EIO;
-	} while (pud++, addr = next, addr != end);
-	return 0;
-}
-
-static inline int queue_pages_pgd_range(struct vm_area_struct *vma,
-		unsigned long addr, unsigned long end,
-		const nodemask_t *nodes, unsigned long flags,
-		void *private)
-{
-	pgd_t *pgd;
-	unsigned long next;
-
-	pgd = pgd_offset(vma->vm_mm, addr);
-	do {
-		next = pgd_addr_end(addr, end);
-		if (pgd_none_or_clear_bad(pgd))
-			continue;
-		if (queue_pages_pud_range(vma, pgd, addr, next, nodes,
-				    flags, private))
-			return -EIO;
-	} while (pgd++, addr = next, addr != end);
 	return 0;
 }
 
@@ -641,6 +583,49 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma,
 }
 #endif /* CONFIG_NUMA_BALANCING */
 
+static int queue_pages_test_walk(unsigned long start, unsigned long end,
+				struct mm_walk *walk)
+{
+	struct vm_area_struct *vma = walk->vma;
+	struct queue_pages *qp = walk->private;
+	unsigned long endvma = vma->vm_end;
+	unsigned long flags = qp->flags;
+
+	if (vma->vm_flags & VM_PFNMAP)
+		return 1;
+
+	if (endvma > end)
+		endvma = end;
+	if (vma->vm_start > start)
+		start = vma->vm_start;
+
+	if (!(flags & MPOL_MF_DISCONTIG_OK)) {
+		if (!vma->vm_next && vma->vm_end < end)
+			return -EFAULT;
+		if (qp->prev && qp->prev->vm_end < vma->vm_start)
+			return -EFAULT;
+	}
+
+	qp->prev = vma;
+
+	if (vma->vm_flags & VM_PFNMAP)
+		return 1;
+
+	if (flags & MPOL_MF_LAZY) {
+		/* Similar to task_numa_work, skip inaccessible VMAs */
+		if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
+			change_prot_numa(vma, start, endvma);
+		return 1;
+	}
+
+	if ((flags & MPOL_MF_STRICT) ||
+	    ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
+	     vma_migratable(vma)))
+		/* queue pages from current vma */
+		return 0;
+	return 1;
+}
+
 /*
  * Walk through page tables and collect pages to be migrated.
  *
@@ -650,50 +635,24 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma,
  */
 static int
 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
-		const nodemask_t *nodes, unsigned long flags, void *private)
-{
-	int err = 0;
-	struct vm_area_struct *vma, *prev;
-
-	vma = find_vma(mm, start);
-	if (!vma)
-		return -EFAULT;
-	prev = NULL;
-	for (; vma && vma->vm_start < end; vma = vma->vm_next) {
-		unsigned long endvma = vma->vm_end;
-
-		if (endvma > end)
-			endvma = end;
-		if (vma->vm_start > start)
-			start = vma->vm_start;
-
-		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
-			if (!vma->vm_next && vma->vm_end < end)
-				return -EFAULT;
-			if (prev && prev->vm_end < vma->vm_start)
-				return -EFAULT;
-		}
-
-		if (flags & MPOL_MF_LAZY) {
-			/* Similar to task_numa_work, skip inaccessible VMAs */
-			if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
-				change_prot_numa(vma, start, endvma);
-			goto next;
-		}
-
-		if ((flags & MPOL_MF_STRICT) ||
-		    ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
-		      vma_migratable(vma))) {
-
-			err = queue_pages_pgd_range(vma, start, endvma, nodes,
-						flags, private);
-			if (err)
-				break;
-		}
-next:
-		prev = vma;
-	}
-	return err;
+		nodemask_t *nodes, unsigned long flags,
+		struct list_head *pagelist)
+{
+	struct queue_pages qp = {
+		.pagelist = pagelist,
+		.flags = flags,
+		.nmask = nodes,
+		.prev = NULL,
+	};
+	struct mm_walk queue_pages_walk = {
+		.hugetlb_entry = queue_pages_hugetlb,
+		.pmd_entry = queue_pages_pte_range,
+		.test_walk = queue_pages_test_walk,
+		.mm = mm,
+		.private = &qp,
+	};
+
+	return walk_page_range(start, end, &queue_pages_walk);
 }
 
 /*
@@ -1988,43 +1947,63 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
  * @order:Order of the GFP allocation.
  * @vma:  Pointer to VMA or NULL if not available.
  * @addr: Virtual Address of the allocation. Must be inside the VMA.
+ * @node: Which node to prefer for allocation (modulo policy).
+ * @hugepage: for hugepages try only the preferred node if possible
  *
  * This function allocates a page from the kernel page pool and applies
  * a NUMA policy associated with the VMA or the current process.
  * When VMA is not NULL caller must hold down_read on the mmap_sem of the
  * mm_struct of the VMA to prevent it from going away. Should be used for
- * all allocations for pages that will be mapped into
- * user space. Returns NULL when no page can be allocated.
- *
- * Should be called with the mm_sem of the vma hold.
+ * all allocations for pages that will be mapped into user space. Returns
+ * NULL when no page can be allocated.
  */
 struct page *
 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
-		unsigned long addr, int node)
+		unsigned long addr, int node, bool hugepage)
 {
 	struct mempolicy *pol;
 	struct page *page;
 	unsigned int cpuset_mems_cookie;
+	struct zonelist *zl;
+	nodemask_t *nmask;
 
 retry_cpuset:
 	pol = get_vma_policy(vma, addr);
 	cpuset_mems_cookie = read_mems_allowed_begin();
 
-	if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
+	if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage &&
+					pol->mode != MPOL_INTERLEAVE)) {
+		/*
+		 * For hugepage allocation and non-interleave policy which
+		 * allows the current node, we only try to allocate from the
+		 * current node and don't fall back to other nodes, as the
+		 * cost of remote accesses would likely offset THP benefits.
+		 *
+		 * If the policy is interleave, or does not allow the current
+		 * node in its nodemask, we allocate the standard way.
+		 */
+		nmask = policy_nodemask(gfp, pol);
+		if (!nmask || node_isset(node, *nmask)) {
+			mpol_cond_put(pol);
+			page = alloc_pages_exact_node(node, gfp, order);
+			goto out;
+		}
+	}
+
+	if (pol->mode == MPOL_INTERLEAVE) {
 		unsigned nid;
 
 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
 		mpol_cond_put(pol);
 		page = alloc_page_interleave(gfp, order, nid);
-		if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
-			goto retry_cpuset;
-
-		return page;
+		goto out;
 	}
-	page = __alloc_pages_nodemask(gfp, order,
-				      policy_zonelist(gfp, pol, node),
-				      policy_nodemask(gfp, pol));
+
+	nmask = policy_nodemask(gfp, pol);
+	zl = policy_zonelist(gfp, pol, node);
 	mpol_cond_put(pol);
+	page = __alloc_pages_nodemask(gfp, order, zl, nmask);
+out:
 	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
 		goto retry_cpuset;
 	return page;
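
Note: after this change, callers no longer walk page tables themselves; they hand a nodemask and a page list to queue_pages_range() and let the mm_walk callbacks do the filtering. A minimal sketch of that calling pattern, modeled on the do_mbind() caller in this file; the helper name collect_misplaced_pages() is illustrative only and does not exist in the tree:

/* Illustrative helper, not part of the patch. */
static int collect_misplaced_pages(struct mm_struct *mm,
				   unsigned long start, unsigned long end,
				   nodemask_t *nmask, unsigned long flags,
				   struct list_head *pagelist)
{
	/*
	 * The page table walk now happens inside queue_pages_range():
	 * queue_pages_test_walk() skips unsuitable VMAs, and the
	 * pmd_entry/hugetlb_entry callbacks isolate pages whose node
	 * violates *nmask onto *pagelist.
	 */
	return queue_pages_range(mm, start, end, nmask,
				 flags | MPOL_MF_INVERT, pagelist);
}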
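
Note: the new bool hugepage argument to alloc_pages_vma() lets THP fault paths ask for the preferred node only, so a huge page is not allocated remotely when the policy would allow the local node. A sketch of how such a caller might look, assuming the alloc_hugepage_vma()-style wrapper used by huge_memory.c in this series; the function name thp_alloc_example() is hypothetical:

/* Illustrative only: THP allocation with hugepage=true. */
static struct page *thp_alloc_example(struct vm_area_struct *vma,
				      unsigned long haddr, gfp_t gfp)
{
	/* hugepage=true: try only the local/preferred node (modulo policy) */
	return alloc_pages_vma(gfp, HPAGE_PMD_ORDER, vma, haddr,
			       numa_node_id(), true);
}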