author    Lee Schermerhorn <lee.schermerhorn@hp.com>  2012-10-25 08:16:32 -0400
committer Mel Gorman <mgorman@suse.de>                2012-12-11 09:42:43 -0500
commit    b24f53a0bea38b266d219ee651b22dba727c44ae (patch)
tree      f85431707b44913a412efb5483dc366c310aab5e
parent    4daae3b4b9e49b7e0935499a352f1c59d90287d2 (diff)
mm: mempolicy: Add MPOL_MF_LAZY
NOTE: Once again there is a lot of patch stealing and the end result
is sufficiently different that I had to drop the signed-offs.
Will re-add if the original authors are ok with that.

This patch adds another mbind() flag to request "lazy migration".  The
flag, MPOL_MF_LAZY, modifies MPOL_MF_MOVE* such that the selected
pages are marked PROT_NONE. The pages will be migrated in the fault
path on "first touch", if the policy dictates at that time.

"Lazy Migration" will allow testing of migrate-on-fault via mbind().
Also allows applications to specify that only subsequently touched
pages be migrated to obey new policy, instead of all pages in range.
This can be useful for multi-threaded applications working on a
large shared data area that is initialized by an initial thread
resulting in all pages on one [or a few, if overflowed] nodes.
After PROT_NONE, the pages in regions assigned to the worker threads
will be automatically migrated local to the threads on 1st touch.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
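[Editor's illustration, not part of the patch: a minimal userspace sketch of the lazy-migration request described above, using the libnuma mbind() wrapper from <numaif.h>. The helper name bind_worker_region(), its region/len/node parameters, and the fallback #define of MPOL_MF_LAZY are assumptions for the example; whether the call is honoured depends on the running kernel actually carrying this patch.]

#include <numaif.h>		/* mbind(), MPOL_BIND, MPOL_MF_MOVE */
#include <stdio.h>		/* perror() */

#ifndef MPOL_MF_LAZY
#define MPOL_MF_LAZY	(1<<3)	/* value introduced by this patch; older uapi headers lack it */
#endif

/*
 * Bind a page-aligned region to 'node', but defer the actual page
 * migration: MPOL_MF_LAZY marks the range so that each page is moved
 * in the fault path on the caller's first touch.
 */
static int bind_worker_region(void *region, unsigned long len, int node)
{
	unsigned long nodemask = 1UL << node;	/* single-node mask, node < 64 assumed */

	if (mbind(region, len, MPOL_BIND, &nodemask,
		  sizeof(nodemask) * 8 + 1,	/* maxnode, libnuma convention */
		  MPOL_MF_MOVE | MPOL_MF_LAZY) != 0) {
		perror("mbind(MPOL_MF_LAZY)");
		return -1;
	}
	return 0;
}

[A worker thread would call this on its slice of the shared area after the initial thread has populated it; pages then migrate toward the worker as it touches them, instead of being moved eagerly for the whole range.]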
-rw-r--r--   include/linux/mm.h               |   5
-rw-r--r--   include/uapi/linux/mempolicy.h   |  13
-rw-r--r--   mm/mempolicy.c                   | 185
3 files changed, 185 insertions, 18 deletions
diff --git a/include/linux/mm.h b/include/linux/mm.h
index fa1615211159..471185e29bab 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1551,6 +1551,11 @@ static inline pgprot_t vm_get_page_prot(unsigned long vm_flags)
 }
 #endif
 
+#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
+void change_prot_numa(struct vm_area_struct *vma,
+			unsigned long start, unsigned long end);
+#endif
+
 struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
 int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
 			unsigned long pfn, unsigned long size, pgprot_t);
diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
index 472de8a5d37e..6a1baae3775d 100644
--- a/include/uapi/linux/mempolicy.h
+++ b/include/uapi/linux/mempolicy.h
@@ -49,9 +49,16 @@ enum mpol_rebind_step {
 
 /* Flags for mbind */
 #define MPOL_MF_STRICT	(1<<0)	/* Verify existing pages in the mapping */
-#define MPOL_MF_MOVE	(1<<1)	/* Move pages owned by this process to conform to mapping */
-#define MPOL_MF_MOVE_ALL (1<<2)	/* Move every page to conform to mapping */
-#define MPOL_MF_INTERNAL (1<<3)	/* Internal flags start here */
+#define MPOL_MF_MOVE	 (1<<1)	/* Move pages owned by this process to conform
+				   to policy */
+#define MPOL_MF_MOVE_ALL (1<<2)	/* Move every page to conform to policy */
+#define MPOL_MF_LAZY	 (1<<3)	/* Modifies '_MOVE:  lazy migrate on fault */
+#define MPOL_MF_INTERNAL (1<<4)	/* Internal flags start here */
+
+#define MPOL_MF_VALID	(MPOL_MF_STRICT   |	\
+			 MPOL_MF_MOVE     |	\
+			 MPOL_MF_MOVE_ALL |	\
+			 MPOL_MF_LAZY)
 
 /*
  * Internal flags that share the struct mempolicy flags word with
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index df1466d3d2d8..51d3ebd8561e 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -90,6 +90,7 @@
 #include <linux/syscalls.h>
 #include <linux/ctype.h>
 #include <linux/mm_inline.h>
+#include <linux/mmu_notifier.h>
 
 #include <asm/tlbflush.h>
 #include <asm/uaccess.h>
@@ -565,6 +566,145 @@ static inline int check_pgd_range(struct vm_area_struct *vma,
 	return 0;
 }
 
+#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
+/*
+ * Here we search for not shared page mappings (mapcount == 1) and we
+ * set up the pmd/pte_numa on those mappings so the very next access
+ * will fire a NUMA hinting page fault.
+ */
+static int
+change_prot_numa_range(struct mm_struct *mm, struct vm_area_struct *vma,
+			unsigned long address)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte, *_pte;
+	struct page *page;
+	unsigned long _address, end;
+	spinlock_t *ptl;
+	int ret = 0;
+
+	VM_BUG_ON(address & ~PAGE_MASK);
+
+	pgd = pgd_offset(mm, address);
+	if (!pgd_present(*pgd))
+		goto out;
+
+	pud = pud_offset(pgd, address);
+	if (!pud_present(*pud))
+		goto out;
+
+	pmd = pmd_offset(pud, address);
+	if (pmd_none(*pmd))
+		goto out;
+
+	if (pmd_trans_huge_lock(pmd, vma) == 1) {
+		int page_nid;
+		ret = HPAGE_PMD_NR;
+
+		VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+
+		if (pmd_numa(*pmd)) {
+			spin_unlock(&mm->page_table_lock);
+			goto out;
+		}
+
+		page = pmd_page(*pmd);
+
+		/* only check non-shared pages */
+		if (page_mapcount(page) != 1) {
+			spin_unlock(&mm->page_table_lock);
+			goto out;
+		}
+
+		page_nid = page_to_nid(page);
+
+		if (pmd_numa(*pmd)) {
+			spin_unlock(&mm->page_table_lock);
+			goto out;
+		}
+
+		set_pmd_at(mm, address, pmd, pmd_mknuma(*pmd));
+		ret += HPAGE_PMD_NR;
+		/* defer TLB flush to lower the overhead */
+		spin_unlock(&mm->page_table_lock);
+		goto out;
+	}
+
+	if (pmd_trans_unstable(pmd))
+		goto out;
+	VM_BUG_ON(!pmd_present(*pmd));
+
+	end = min(vma->vm_end, (address + PMD_SIZE) & PMD_MASK);
+	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+	for (_address = address, _pte = pte; _address < end;
+	     _pte++, _address += PAGE_SIZE) {
+		pte_t pteval = *_pte;
+		if (!pte_present(pteval))
+			continue;
+		if (pte_numa(pteval))
+			continue;
+		page = vm_normal_page(vma, _address, pteval);
+		if (unlikely(!page))
+			continue;
+		/* only check non-shared pages */
+		if (page_mapcount(page) != 1)
+			continue;
+
+		set_pte_at(mm, _address, _pte, pte_mknuma(pteval));
+
+		/* defer TLB flush to lower the overhead */
+		ret++;
+	}
+	pte_unmap_unlock(pte, ptl);
+
+	if (ret && !pmd_numa(*pmd)) {
+		spin_lock(&mm->page_table_lock);
+		set_pmd_at(mm, address, pmd, pmd_mknuma(*pmd));
+		spin_unlock(&mm->page_table_lock);
+		/* defer TLB flush to lower the overhead */
+	}
+
+out:
+	return ret;
+}
+
+/* Assumes mmap_sem is held */
+void
+change_prot_numa(struct vm_area_struct *vma,
+		unsigned long address, unsigned long end)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	int progress = 0;
+
+	while (address < end) {
+		VM_BUG_ON(address < vma->vm_start ||
+			  address + PAGE_SIZE > vma->vm_end);
+
+		progress += change_prot_numa_range(mm, vma, address);
+		address = (address + PMD_SIZE) & PMD_MASK;
+	}
+
+	/*
+	 * Flush the TLB for the mm to start the NUMA hinting
+	 * page faults after we finish scanning this vma part
+	 * if there were any PTE updates
+	 */
+	if (progress) {
+		mmu_notifier_invalidate_range_start(vma->vm_mm, address, end);
+		flush_tlb_range(vma, address, end);
+		mmu_notifier_invalidate_range_end(vma->vm_mm, address, end);
+	}
+}
+#else
+static unsigned long change_prot_numa(struct vm_area_struct *vma,
+			unsigned long addr, unsigned long end)
+{
+	return 0;
+}
+#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */
+
 /*
  * Check if all pages in a range are on a set of nodes.
  * If pagelist != NULL then isolate pages from the LRU and
@@ -583,22 +723,32 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 		return ERR_PTR(-EFAULT);
 	prev = NULL;
 	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
+		unsigned long endvma = vma->vm_end;
+
+		if (endvma > end)
+			endvma = end;
+		if (vma->vm_start > start)
+			start = vma->vm_start;
+
 		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 			if (!vma->vm_next && vma->vm_end < end)
 				return ERR_PTR(-EFAULT);
 			if (prev && prev->vm_end < vma->vm_start)
 				return ERR_PTR(-EFAULT);
 		}
-		if (!is_vm_hugetlb_page(vma) &&
-		    ((flags & MPOL_MF_STRICT) ||
+
+		if (is_vm_hugetlb_page(vma))
+			goto next;
+
+		if (flags & MPOL_MF_LAZY) {
+			change_prot_numa(vma, start, endvma);
+			goto next;
+		}
+
+		if ((flags & MPOL_MF_STRICT) ||
 		    ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
-		      vma_migratable(vma)))) {
-			unsigned long endvma = vma->vm_end;
+		      vma_migratable(vma))) {
 
-			if (endvma > end)
-				endvma = end;
-			if (vma->vm_start > start)
-				start = vma->vm_start;
 			err = check_pgd_range(vma, start, endvma, nodes,
 					      flags, private);
 			if (err) {
@@ -606,6 +756,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 				break;
 			}
 		}
+next:
 		prev = vma;
 	}
 	return first;
@@ -1138,8 +1289,7 @@ static long do_mbind(unsigned long start, unsigned long len,
 	int err;
 	LIST_HEAD(pagelist);
 
-	if (flags & ~(unsigned long)(MPOL_MF_STRICT |
-				     MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+	if (flags & ~(unsigned long)MPOL_MF_VALID)
 		return -EINVAL;
 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
 		return -EPERM;
@@ -1162,6 +1312,9 @@ static long do_mbind(unsigned long start, unsigned long len,
 	if (IS_ERR(new))
 		return PTR_ERR(new);
 
+	if (flags & MPOL_MF_LAZY)
+		new->flags |= MPOL_F_MOF;
+
 	/*
 	 * If we are using the default policy then operation
 	 * on discontinuous address spaces is okay after all
@@ -1198,13 +1351,15 @@ static long do_mbind(unsigned long start, unsigned long len,
 	vma = check_range(mm, start, end, nmask,
 			  flags | MPOL_MF_INVERT, &pagelist);
 
-	err = PTR_ERR(vma);
-	if (!IS_ERR(vma)) {
-		int nr_failed = 0;
-
+	err = PTR_ERR(vma);	/* maybe ... */
+	if (!IS_ERR(vma) && mode != MPOL_NOOP)
 		err = mbind_range(mm, start, end, new);
 
+	if (!err) {
+		int nr_failed = 0;
+
 		if (!list_empty(&pagelist)) {
+			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
 			nr_failed = migrate_pages(&pagelist, new_vma_page,
 						  (unsigned long)vma,
 						  false, MIGRATE_SYNC,
@@ -1213,7 +1368,7 @@ static long do_mbind(unsigned long start, unsigned long len,
 				putback_lru_pages(&pagelist);
 		}
 
-		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
+		if (nr_failed && (flags & MPOL_MF_STRICT))
 			err = -EIO;
 	} else
 		putback_lru_pages(&pagelist);