Diffstat (limited to 'mm/mempolicy.c')
-rw-r--r--  mm/mempolicy.c  277
1 files changed, 128 insertions, 149 deletions
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 0e0961b8c39c..f1bd23803576 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -471,24 +471,34 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 				unsigned long flags);
 
+struct queue_pages {
+	struct list_head *pagelist;
+	unsigned long flags;
+	nodemask_t *nmask;
+	struct vm_area_struct *prev;
+};
+
 /*
  * Scan through pages checking if pages follow certain conditions,
  * and move them to the pagelist if they do.
  */
-static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
-		unsigned long addr, unsigned long end,
-		const nodemask_t *nodes, unsigned long flags,
-		void *private)
+static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
+			unsigned long end, struct mm_walk *walk)
 {
-	pte_t *orig_pte;
+	struct vm_area_struct *vma = walk->vma;
+	struct page *page;
+	struct queue_pages *qp = walk->private;
+	unsigned long flags = qp->flags;
+	int nid;
 	pte_t *pte;
 	spinlock_t *ptl;
 
-	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
-	do {
-		struct page *page;
-		int nid;
-
+	split_huge_page_pmd(vma, addr, pmd);
+	if (pmd_trans_unstable(pmd))
+		return 0;
+
+	pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+	for (; addr != end; pte++, addr += PAGE_SIZE) {
 		if (!pte_present(*pte))
 			continue;
 		page = vm_normal_page(vma, addr, *pte);
@@ -501,114 +511,46 @@ static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 		if (PageReserved(page))
 			continue;
 		nid = page_to_nid(page);
-		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
+		if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
 			continue;
 
 		if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
-			migrate_page_add(page, private, flags);
-		else
-			break;
-	} while (pte++, addr += PAGE_SIZE, addr != end);
-	pte_unmap_unlock(orig_pte, ptl);
-	return addr != end;
+			migrate_page_add(page, qp->pagelist, flags);
+	}
+	pte_unmap_unlock(pte - 1, ptl);
+	cond_resched();
+	return 0;
 }
 
-static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma,
-		pmd_t *pmd, const nodemask_t *nodes, unsigned long flags,
-				    void *private)
+static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
+			       unsigned long addr, unsigned long end,
+			       struct mm_walk *walk)
 {
 #ifdef CONFIG_HUGETLB_PAGE
+	struct queue_pages *qp = walk->private;
+	unsigned long flags = qp->flags;
 	int nid;
 	struct page *page;
 	spinlock_t *ptl;
 	pte_t entry;
 
-	ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd);
-	entry = huge_ptep_get((pte_t *)pmd);
+	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
+	entry = huge_ptep_get(pte);
 	if (!pte_present(entry))
 		goto unlock;
 	page = pte_page(entry);
 	nid = page_to_nid(page);
-	if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
+	if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
 		goto unlock;
 	/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
 	if (flags & (MPOL_MF_MOVE_ALL) ||
 	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
-		isolate_huge_page(page, private);
+		isolate_huge_page(page, qp->pagelist);
 unlock:
 	spin_unlock(ptl);
 #else
 	BUG();
 #endif
-}
-
-static inline int queue_pages_pmd_range(struct vm_area_struct *vma, pud_t *pud,
-		unsigned long addr, unsigned long end,
-		const nodemask_t *nodes, unsigned long flags,
-		void *private)
-{
-	pmd_t *pmd;
-	unsigned long next;
-
-	pmd = pmd_offset(pud, addr);
-	do {
-		next = pmd_addr_end(addr, end);
-		if (!pmd_present(*pmd))
-			continue;
-		if (pmd_huge(*pmd) && is_vm_hugetlb_page(vma)) {
-			queue_pages_hugetlb_pmd_range(vma, pmd, nodes,
-						flags, private);
-			continue;
-		}
-		split_huge_page_pmd(vma, addr, pmd);
-		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
-			continue;
-		if (queue_pages_pte_range(vma, pmd, addr, next, nodes,
-				    flags, private))
-			return -EIO;
-	} while (pmd++, addr = next, addr != end);
-	return 0;
-}
-
-static inline int queue_pages_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
-		unsigned long addr, unsigned long end,
-		const nodemask_t *nodes, unsigned long flags,
-		void *private)
-{
-	pud_t *pud;
-	unsigned long next;
-
-	pud = pud_offset(pgd, addr);
-	do {
-		next = pud_addr_end(addr, end);
-		if (pud_huge(*pud) && is_vm_hugetlb_page(vma))
-			continue;
-		if (pud_none_or_clear_bad(pud))
-			continue;
-		if (queue_pages_pmd_range(vma, pud, addr, next, nodes,
-				    flags, private))
-			return -EIO;
-	} while (pud++, addr = next, addr != end);
-	return 0;
-}
-
-static inline int queue_pages_pgd_range(struct vm_area_struct *vma,
-		unsigned long addr, unsigned long end,
-		const nodemask_t *nodes, unsigned long flags,
-		void *private)
-{
-	pgd_t *pgd;
-	unsigned long next;
-
-	pgd = pgd_offset(vma->vm_mm, addr);
-	do {
-		next = pgd_addr_end(addr, end);
-		if (pgd_none_or_clear_bad(pgd))
-			continue;
-		if (queue_pages_pud_range(vma, pgd, addr, next, nodes,
-				    flags, private))
-			return -EIO;
-	} while (pgd++, addr = next, addr != end);
 	return 0;
 }
 
@@ -641,6 +583,49 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma,
 }
 #endif /* CONFIG_NUMA_BALANCING */
 
+static int queue_pages_test_walk(unsigned long start, unsigned long end,
+				struct mm_walk *walk)
+{
+	struct vm_area_struct *vma = walk->vma;
+	struct queue_pages *qp = walk->private;
+	unsigned long endvma = vma->vm_end;
+	unsigned long flags = qp->flags;
+
+	if (vma->vm_flags & VM_PFNMAP)
+		return 1;
+
+	if (endvma > end)
+		endvma = end;
+	if (vma->vm_start > start)
+		start = vma->vm_start;
+
+	if (!(flags & MPOL_MF_DISCONTIG_OK)) {
+		if (!vma->vm_next && vma->vm_end < end)
+			return -EFAULT;
+		if (qp->prev && qp->prev->vm_end < vma->vm_start)
+			return -EFAULT;
+	}
+
+	qp->prev = vma;
+
+	if (vma->vm_flags & VM_PFNMAP)
+		return 1;
+
+	if (flags & MPOL_MF_LAZY) {
+		/* Similar to task_numa_work, skip inaccessible VMAs */
+		if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
+			change_prot_numa(vma, start, endvma);
+		return 1;
+	}
+
+	if ((flags & MPOL_MF_STRICT) ||
+	    ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
+	     vma_migratable(vma)))
+		/* queue pages from current vma */
+		return 0;
+	return 1;
+}
+
 /*
  * Walk through page tables and collect pages to be migrated.
  *
@@ -650,50 +635,24 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma,
  */
 static int
 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
-		const nodemask_t *nodes, unsigned long flags, void *private)
-{
-	int err = 0;
-	struct vm_area_struct *vma, *prev;
-
-	vma = find_vma(mm, start);
-	if (!vma)
-		return -EFAULT;
-	prev = NULL;
-	for (; vma && vma->vm_start < end; vma = vma->vm_next) {
-		unsigned long endvma = vma->vm_end;
-
-		if (endvma > end)
-			endvma = end;
-		if (vma->vm_start > start)
-			start = vma->vm_start;
-
-		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
-			if (!vma->vm_next && vma->vm_end < end)
-				return -EFAULT;
-			if (prev && prev->vm_end < vma->vm_start)
-				return -EFAULT;
-		}
-
-		if (flags & MPOL_MF_LAZY) {
-			/* Similar to task_numa_work, skip inaccessible VMAs */
-			if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
-				change_prot_numa(vma, start, endvma);
-			goto next;
-		}
-
-		if ((flags & MPOL_MF_STRICT) ||
-		    ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
-		      vma_migratable(vma))) {
-
-			err = queue_pages_pgd_range(vma, start, endvma, nodes,
-						flags, private);
-			if (err)
-				break;
-		}
-next:
-		prev = vma;
-	}
-	return err;
+		nodemask_t *nodes, unsigned long flags,
+		struct list_head *pagelist)
+{
+	struct queue_pages qp = {
+		.pagelist = pagelist,
+		.flags = flags,
+		.nmask = nodes,
+		.prev = NULL,
+	};
+	struct mm_walk queue_pages_walk = {
+		.hugetlb_entry = queue_pages_hugetlb,
+		.pmd_entry = queue_pages_pte_range,
+		.test_walk = queue_pages_test_walk,
+		.mm = mm,
+		.private = &qp,
+	};
+
+	return walk_page_range(start, end, &queue_pages_walk);
 }
 
 /*
@@ -1988,43 +1947,63 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
  * @order:Order of the GFP allocation.
  * @vma:  Pointer to VMA or NULL if not available.
  * @addr: Virtual Address of the allocation. Must be inside the VMA.
+ * @node: Which node to prefer for allocation (modulo policy).
+ * @hugepage: for hugepages try only the preferred node if possible
  *
  * This function allocates a page from the kernel page pool and applies
  * a NUMA policy associated with the VMA or the current process.
  * When VMA is not NULL caller must hold down_read on the mmap_sem of the
  * mm_struct of the VMA to prevent it from going away. Should be used for
- * all allocations for pages that will be mapped into
- * user space. Returns NULL when no page can be allocated.
- *
- * Should be called with the mm_sem of the vma hold.
+ * all allocations for pages that will be mapped into user space. Returns
+ * NULL when no page can be allocated.
  */
 struct page *
 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
-		unsigned long addr, int node)
+		unsigned long addr, int node, bool hugepage)
 {
 	struct mempolicy *pol;
 	struct page *page;
 	unsigned int cpuset_mems_cookie;
+	struct zonelist *zl;
+	nodemask_t *nmask;
 
 retry_cpuset:
 	pol = get_vma_policy(vma, addr);
 	cpuset_mems_cookie = read_mems_allowed_begin();
 
-	if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
+	if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage &&
+					pol->mode != MPOL_INTERLEAVE)) {
+		/*
+		 * For hugepage allocation and non-interleave policy which
+		 * allows the current node, we only try to allocate from the
+		 * current node and don't fall back to other nodes, as the
+		 * cost of remote accesses would likely offset THP benefits.
+		 *
+		 * If the policy is interleave, or does not allow the current
+		 * node in its nodemask, we allocate the standard way.
+		 */
+		nmask = policy_nodemask(gfp, pol);
+		if (!nmask || node_isset(node, *nmask)) {
+			mpol_cond_put(pol);
+			page = alloc_pages_exact_node(node, gfp, order);
+			goto out;
+		}
+	}
+
+	if (pol->mode == MPOL_INTERLEAVE) {
 		unsigned nid;
 
 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
 		mpol_cond_put(pol);
 		page = alloc_page_interleave(gfp, order, nid);
-		if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
-			goto retry_cpuset;
-
-		return page;
+		goto out;
 	}
-	page = __alloc_pages_nodemask(gfp, order,
-				      policy_zonelist(gfp, pol, node),
-				      policy_nodemask(gfp, pol));
+
+	nmask = policy_nodemask(gfp, pol);
+	zl = policy_zonelist(gfp, pol, node);
 	mpol_cond_put(pol);
+	page = __alloc_pages_nodemask(gfp, order, zl, nmask);
+out:
 	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
 		goto retry_cpuset;
 	return page;