diff options
-rw-r--r-- | include/linux/mm.h | 5 | ||||
-rw-r--r-- | include/uapi/linux/mempolicy.h | 13 | ||||
-rw-r--r-- | mm/mempolicy.c | 185 |
3 files changed, 185 insertions, 18 deletions
diff --git a/include/linux/mm.h b/include/linux/mm.h index fa1615211159..471185e29bab 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h | |||
@@ -1551,6 +1551,11 @@ static inline pgprot_t vm_get_page_prot(unsigned long vm_flags) | |||
1551 | } | 1551 | } |
1552 | #endif | 1552 | #endif |
1553 | 1553 | ||
1554 | #ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE | ||
1555 | void change_prot_numa(struct vm_area_struct *vma, | ||
1556 | unsigned long start, unsigned long end); | ||
1557 | #endif | ||
1558 | |||
1554 | struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr); | 1559 | struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr); |
1555 | int remap_pfn_range(struct vm_area_struct *, unsigned long addr, | 1560 | int remap_pfn_range(struct vm_area_struct *, unsigned long addr, |
1556 | unsigned long pfn, unsigned long size, pgprot_t); | 1561 | unsigned long pfn, unsigned long size, pgprot_t); |
diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h index 472de8a5d37e..6a1baae3775d 100644 --- a/include/uapi/linux/mempolicy.h +++ b/include/uapi/linux/mempolicy.h | |||
@@ -49,9 +49,16 @@ enum mpol_rebind_step { | |||
49 | 49 | ||
50 | /* Flags for mbind */ | 50 | /* Flags for mbind */ |
51 | #define MPOL_MF_STRICT (1<<0) /* Verify existing pages in the mapping */ | 51 | #define MPOL_MF_STRICT (1<<0) /* Verify existing pages in the mapping */ |
52 | #define MPOL_MF_MOVE (1<<1) /* Move pages owned by this process to conform to mapping */ | 52 | #define MPOL_MF_MOVE (1<<1) /* Move pages owned by this process to conform |
53 | #define MPOL_MF_MOVE_ALL (1<<2) /* Move every page to conform to mapping */ | 53 | to policy */ |
54 | #define MPOL_MF_INTERNAL (1<<3) /* Internal flags start here */ | 54 | #define MPOL_MF_MOVE_ALL (1<<2) /* Move every page to conform to policy */ |
55 | #define MPOL_MF_LAZY (1<<3) /* Modifies '_MOVE: lazy migrate on fault */ | ||
56 | #define MPOL_MF_INTERNAL (1<<4) /* Internal flags start here */ | ||
57 | |||
58 | #define MPOL_MF_VALID (MPOL_MF_STRICT | \ | ||
59 | MPOL_MF_MOVE | \ | ||
60 | MPOL_MF_MOVE_ALL | \ | ||
61 | MPOL_MF_LAZY) | ||
55 | 62 | ||
56 | /* | 63 | /* |
57 | * Internal flags that share the struct mempolicy flags word with | 64 | * Internal flags that share the struct mempolicy flags word with |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index df1466d3d2d8..51d3ebd8561e 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -90,6 +90,7 @@ | |||
90 | #include <linux/syscalls.h> | 90 | #include <linux/syscalls.h> |
91 | #include <linux/ctype.h> | 91 | #include <linux/ctype.h> |
92 | #include <linux/mm_inline.h> | 92 | #include <linux/mm_inline.h> |
93 | #include <linux/mmu_notifier.h> | ||
93 | 94 | ||
94 | #include <asm/tlbflush.h> | 95 | #include <asm/tlbflush.h> |
95 | #include <asm/uaccess.h> | 96 | #include <asm/uaccess.h> |
@@ -565,6 +566,145 @@ static inline int check_pgd_range(struct vm_area_struct *vma, | |||
565 | return 0; | 566 | return 0; |
566 | } | 567 | } |
567 | 568 | ||
569 | #ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE | ||
570 | /* | ||
571 | * Here we search for not shared page mappings (mapcount == 1) and we | ||
572 | * set up the pmd/pte_numa on those mappings so the very next access | ||
573 | * will fire a NUMA hinting page fault. | ||
574 | */ | ||
575 | static int | ||
576 | change_prot_numa_range(struct mm_struct *mm, struct vm_area_struct *vma, | ||
577 | unsigned long address) | ||
578 | { | ||
579 | pgd_t *pgd; | ||
580 | pud_t *pud; | ||
581 | pmd_t *pmd; | ||
582 | pte_t *pte, *_pte; | ||
583 | struct page *page; | ||
584 | unsigned long _address, end; | ||
585 | spinlock_t *ptl; | ||
586 | int ret = 0; | ||
587 | |||
588 | VM_BUG_ON(address & ~PAGE_MASK); | ||
589 | |||
590 | pgd = pgd_offset(mm, address); | ||
591 | if (!pgd_present(*pgd)) | ||
592 | goto out; | ||
593 | |||
594 | pud = pud_offset(pgd, address); | ||
595 | if (!pud_present(*pud)) | ||
596 | goto out; | ||
597 | |||
598 | pmd = pmd_offset(pud, address); | ||
599 | if (pmd_none(*pmd)) | ||
600 | goto out; | ||
601 | |||
602 | if (pmd_trans_huge_lock(pmd, vma) == 1) { | ||
603 | int page_nid; | ||
604 | ret = HPAGE_PMD_NR; | ||
605 | |||
606 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | ||
607 | |||
608 | if (pmd_numa(*pmd)) { | ||
609 | spin_unlock(&mm->page_table_lock); | ||
610 | goto out; | ||
611 | } | ||
612 | |||
613 | page = pmd_page(*pmd); | ||
614 | |||
615 | /* only check non-shared pages */ | ||
616 | if (page_mapcount(page) != 1) { | ||
617 | spin_unlock(&mm->page_table_lock); | ||
618 | goto out; | ||
619 | } | ||
620 | |||
621 | page_nid = page_to_nid(page); | ||
622 | |||
623 | if (pmd_numa(*pmd)) { | ||
624 | spin_unlock(&mm->page_table_lock); | ||
625 | goto out; | ||
626 | } | ||
627 | |||
628 | set_pmd_at(mm, address, pmd, pmd_mknuma(*pmd)); | ||
629 | ret += HPAGE_PMD_NR; | ||
630 | /* defer TLB flush to lower the overhead */ | ||
631 | spin_unlock(&mm->page_table_lock); | ||
632 | goto out; | ||
633 | } | ||
634 | |||
635 | if (pmd_trans_unstable(pmd)) | ||
636 | goto out; | ||
637 | VM_BUG_ON(!pmd_present(*pmd)); | ||
638 | |||
639 | end = min(vma->vm_end, (address + PMD_SIZE) & PMD_MASK); | ||
640 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); | ||
641 | for (_address = address, _pte = pte; _address < end; | ||
642 | _pte++, _address += PAGE_SIZE) { | ||
643 | pte_t pteval = *_pte; | ||
644 | if (!pte_present(pteval)) | ||
645 | continue; | ||
646 | if (pte_numa(pteval)) | ||
647 | continue; | ||
648 | page = vm_normal_page(vma, _address, pteval); | ||
649 | if (unlikely(!page)) | ||
650 | continue; | ||
651 | /* only check non-shared pages */ | ||
652 | if (page_mapcount(page) != 1) | ||
653 | continue; | ||
654 | |||
655 | set_pte_at(mm, _address, _pte, pte_mknuma(pteval)); | ||
656 | |||
657 | /* defer TLB flush to lower the overhead */ | ||
658 | ret++; | ||
659 | } | ||
660 | pte_unmap_unlock(pte, ptl); | ||
661 | |||
662 | if (ret && !pmd_numa(*pmd)) { | ||
663 | spin_lock(&mm->page_table_lock); | ||
664 | set_pmd_at(mm, address, pmd, pmd_mknuma(*pmd)); | ||
665 | spin_unlock(&mm->page_table_lock); | ||
666 | /* defer TLB flush to lower the overhead */ | ||
667 | } | ||
668 | |||
669 | out: | ||
670 | return ret; | ||
671 | } | ||
672 | |||
673 | /* Assumes mmap_sem is held */ | ||
674 | void | ||
675 | change_prot_numa(struct vm_area_struct *vma, | ||
676 | unsigned long address, unsigned long end) | ||
677 | { | ||
678 | struct mm_struct *mm = vma->vm_mm; | ||
679 | int progress = 0; | ||
680 | |||
681 | while (address < end) { | ||
682 | VM_BUG_ON(address < vma->vm_start || | ||
683 | address + PAGE_SIZE > vma->vm_end); | ||
684 | |||
685 | progress += change_prot_numa_range(mm, vma, address); | ||
686 | address = (address + PMD_SIZE) & PMD_MASK; | ||
687 | } | ||
688 | |||
689 | /* | ||
690 | * Flush the TLB for the mm to start the NUMA hinting | ||
691 | * page faults after we finish scanning this vma part | ||
692 | * if there were any PTE updates | ||
693 | */ | ||
694 | if (progress) { | ||
695 | mmu_notifier_invalidate_range_start(vma->vm_mm, address, end); | ||
696 | flush_tlb_range(vma, address, end); | ||
697 | mmu_notifier_invalidate_range_end(vma->vm_mm, address, end); | ||
698 | } | ||
699 | } | ||
700 | #else | ||
/*
 * Fallback for builds without CONFIG_ARCH_USES_NUMA_PROT_NONE: no mapping
 * is touched and zero is returned.
 *
 * NOTE(review): the CONFIG_ARCH_USES_NUMA_PROT_NONE variant of this
 * function returns void, so callers cannot portably use this return
 * value -- confirm whether the two signatures should be unified.
 */
static unsigned long change_prot_numa(struct vm_area_struct *vma,
			unsigned long addr, unsigned long end)
{
	/* nothing is ever updated in this configuration */
	const unsigned long nr_updated = 0;

	return nr_updated;
}
706 | #endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */ | ||
707 | |||
568 | /* | 708 | /* |
569 | * Check if all pages in a range are on a set of nodes. | 709 | * Check if all pages in a range are on a set of nodes. |
570 | * If pagelist != NULL then isolate pages from the LRU and | 710 | * If pagelist != NULL then isolate pages from the LRU and |
@@ -583,22 +723,32 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, | |||
583 | return ERR_PTR(-EFAULT); | 723 | return ERR_PTR(-EFAULT); |
584 | prev = NULL; | 724 | prev = NULL; |
585 | for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { | 725 | for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { |
726 | unsigned long endvma = vma->vm_end; | ||
727 | |||
728 | if (endvma > end) | ||
729 | endvma = end; | ||
730 | if (vma->vm_start > start) | ||
731 | start = vma->vm_start; | ||
732 | |||
586 | if (!(flags & MPOL_MF_DISCONTIG_OK)) { | 733 | if (!(flags & MPOL_MF_DISCONTIG_OK)) { |
587 | if (!vma->vm_next && vma->vm_end < end) | 734 | if (!vma->vm_next && vma->vm_end < end) |
588 | return ERR_PTR(-EFAULT); | 735 | return ERR_PTR(-EFAULT); |
589 | if (prev && prev->vm_end < vma->vm_start) | 736 | if (prev && prev->vm_end < vma->vm_start) |
590 | return ERR_PTR(-EFAULT); | 737 | return ERR_PTR(-EFAULT); |
591 | } | 738 | } |
592 | if (!is_vm_hugetlb_page(vma) && | 739 | |
593 | ((flags & MPOL_MF_STRICT) || | 740 | if (is_vm_hugetlb_page(vma)) |
741 | goto next; | ||
742 | |||
743 | if (flags & MPOL_MF_LAZY) { | ||
744 | change_prot_numa(vma, start, endvma); | ||
745 | goto next; | ||
746 | } | ||
747 | |||
748 | if ((flags & MPOL_MF_STRICT) || | ||
594 | ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && | 749 | ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && |
595 | vma_migratable(vma)))) { | 750 | vma_migratable(vma))) { |
596 | unsigned long endvma = vma->vm_end; | ||
597 | 751 | ||
598 | if (endvma > end) | ||
599 | endvma = end; | ||
600 | if (vma->vm_start > start) | ||
601 | start = vma->vm_start; | ||
602 | err = check_pgd_range(vma, start, endvma, nodes, | 752 | err = check_pgd_range(vma, start, endvma, nodes, |
603 | flags, private); | 753 | flags, private); |
604 | if (err) { | 754 | if (err) { |
@@ -606,6 +756,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, | |||
606 | break; | 756 | break; |
607 | } | 757 | } |
608 | } | 758 | } |
759 | next: | ||
609 | prev = vma; | 760 | prev = vma; |
610 | } | 761 | } |
611 | return first; | 762 | return first; |
@@ -1138,8 +1289,7 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
1138 | int err; | 1289 | int err; |
1139 | LIST_HEAD(pagelist); | 1290 | LIST_HEAD(pagelist); |
1140 | 1291 | ||
1141 | if (flags & ~(unsigned long)(MPOL_MF_STRICT | | 1292 | if (flags & ~(unsigned long)MPOL_MF_VALID) |
1142 | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) | ||
1143 | return -EINVAL; | 1293 | return -EINVAL; |
1144 | if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) | 1294 | if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) |
1145 | return -EPERM; | 1295 | return -EPERM; |
@@ -1162,6 +1312,9 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
1162 | if (IS_ERR(new)) | 1312 | if (IS_ERR(new)) |
1163 | return PTR_ERR(new); | 1313 | return PTR_ERR(new); |
1164 | 1314 | ||
1315 | if (flags & MPOL_MF_LAZY) | ||
1316 | new->flags |= MPOL_F_MOF; | ||
1317 | |||
1165 | /* | 1318 | /* |
1166 | * If we are using the default policy then operation | 1319 | * If we are using the default policy then operation |
1167 | * on discontinuous address spaces is okay after all | 1320 | * on discontinuous address spaces is okay after all |
@@ -1198,13 +1351,15 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
1198 | vma = check_range(mm, start, end, nmask, | 1351 | vma = check_range(mm, start, end, nmask, |
1199 | flags | MPOL_MF_INVERT, &pagelist); | 1352 | flags | MPOL_MF_INVERT, &pagelist); |
1200 | 1353 | ||
1201 | err = PTR_ERR(vma); | 1354 | err = PTR_ERR(vma); /* maybe ... */ |
1202 | if (!IS_ERR(vma)) { | 1355 | if (!IS_ERR(vma) && mode != MPOL_NOOP) |
1203 | int nr_failed = 0; | ||
1204 | |||
1205 | err = mbind_range(mm, start, end, new); | 1356 | err = mbind_range(mm, start, end, new); |
1206 | 1357 | ||
1358 | if (!err) { | ||
1359 | int nr_failed = 0; | ||
1360 | |||
1207 | if (!list_empty(&pagelist)) { | 1361 | if (!list_empty(&pagelist)) { |
1362 | WARN_ON_ONCE(flags & MPOL_MF_LAZY); | ||
1208 | nr_failed = migrate_pages(&pagelist, new_vma_page, | 1363 | nr_failed = migrate_pages(&pagelist, new_vma_page, |
1209 | (unsigned long)vma, | 1364 | (unsigned long)vma, |
1210 | false, MIGRATE_SYNC, | 1365 | false, MIGRATE_SYNC, |
@@ -1213,7 +1368,7 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
1213 | putback_lru_pages(&pagelist); | 1368 | putback_lru_pages(&pagelist); |
1214 | } | 1369 | } |
1215 | 1370 | ||
1216 | if (!err && nr_failed && (flags & MPOL_MF_STRICT)) | 1371 | if (nr_failed && (flags & MPOL_MF_STRICT)) |
1217 | err = -EIO; | 1372 | err = -EIO; |
1218 | } else | 1373 | } else |
1219 | putback_lru_pages(&pagelist); | 1374 | putback_lru_pages(&pagelist); |