aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--include/linux/mm.h5
-rw-r--r--include/uapi/linux/mempolicy.h13
-rw-r--r--mm/mempolicy.c185
3 files changed, 185 insertions, 18 deletions
diff --git a/include/linux/mm.h b/include/linux/mm.h
index fa1615211159..471185e29bab 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1551,6 +1551,11 @@ static inline pgprot_t vm_get_page_prot(unsigned long vm_flags)
1551} 1551}
1552#endif 1552#endif
1553 1553
1554#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
1555void change_prot_numa(struct vm_area_struct *vma,
1556 unsigned long start, unsigned long end);
1557#endif
1558
1554struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr); 1559struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
1555int remap_pfn_range(struct vm_area_struct *, unsigned long addr, 1560int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
1556 unsigned long pfn, unsigned long size, pgprot_t); 1561 unsigned long pfn, unsigned long size, pgprot_t);
diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
index 472de8a5d37e..6a1baae3775d 100644
--- a/include/uapi/linux/mempolicy.h
+++ b/include/uapi/linux/mempolicy.h
@@ -49,9 +49,16 @@ enum mpol_rebind_step {
49 49
50/* Flags for mbind */ 50/* Flags for mbind */
51#define MPOL_MF_STRICT (1<<0) /* Verify existing pages in the mapping */ 51#define MPOL_MF_STRICT (1<<0) /* Verify existing pages in the mapping */
52#define MPOL_MF_MOVE (1<<1) /* Move pages owned by this process to conform to mapping */ 52#define MPOL_MF_MOVE (1<<1) /* Move pages owned by this process to conform
53#define MPOL_MF_MOVE_ALL (1<<2) /* Move every page to conform to mapping */ 53 to policy */
54#define MPOL_MF_INTERNAL (1<<3) /* Internal flags start here */ 54#define MPOL_MF_MOVE_ALL (1<<2) /* Move every page to conform to policy */
55#define MPOL_MF_LAZY (1<<3) /* Modifies '_MOVE: lazy migrate on fault */
56#define MPOL_MF_INTERNAL (1<<4) /* Internal flags start here */
57
58#define MPOL_MF_VALID (MPOL_MF_STRICT | \
59 MPOL_MF_MOVE | \
60 MPOL_MF_MOVE_ALL | \
61 MPOL_MF_LAZY)
55 62
56/* 63/*
57 * Internal flags that share the struct mempolicy flags word with 64 * Internal flags that share the struct mempolicy flags word with
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index df1466d3d2d8..51d3ebd8561e 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -90,6 +90,7 @@
90#include <linux/syscalls.h> 90#include <linux/syscalls.h>
91#include <linux/ctype.h> 91#include <linux/ctype.h>
92#include <linux/mm_inline.h> 92#include <linux/mm_inline.h>
93#include <linux/mmu_notifier.h>
93 94
94#include <asm/tlbflush.h> 95#include <asm/tlbflush.h>
95#include <asm/uaccess.h> 96#include <asm/uaccess.h>
@@ -565,6 +566,145 @@ static inline int check_pgd_range(struct vm_area_struct *vma,
565 return 0; 566 return 0;
566} 567}
567 568
569#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
570/*
571 * Here we search for not shared page mappings (mapcount == 1) and we
572 * set up the pmd/pte_numa on those mappings so the very next access
573 * will fire a NUMA hinting page fault.
574 */
575static int
576change_prot_numa_range(struct mm_struct *mm, struct vm_area_struct *vma,
577 unsigned long address)
578{
579 pgd_t *pgd;
580 pud_t *pud;
581 pmd_t *pmd;
582 pte_t *pte, *_pte;
583 struct page *page;
584 unsigned long _address, end;
585 spinlock_t *ptl;
586 int ret = 0;
587
588 VM_BUG_ON(address & ~PAGE_MASK);
589
590 pgd = pgd_offset(mm, address);
591 if (!pgd_present(*pgd))
592 goto out;
593
594 pud = pud_offset(pgd, address);
595 if (!pud_present(*pud))
596 goto out;
597
598 pmd = pmd_offset(pud, address);
599 if (pmd_none(*pmd))
600 goto out;
601
602 if (pmd_trans_huge_lock(pmd, vma) == 1) {
603 int page_nid;
604 ret = HPAGE_PMD_NR;
605
606 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
607
608 if (pmd_numa(*pmd)) {
609 spin_unlock(&mm->page_table_lock);
610 goto out;
611 }
612
613 page = pmd_page(*pmd);
614
615 /* only check non-shared pages */
616 if (page_mapcount(page) != 1) {
617 spin_unlock(&mm->page_table_lock);
618 goto out;
619 }
620
621 page_nid = page_to_nid(page);
622
623 if (pmd_numa(*pmd)) {
624 spin_unlock(&mm->page_table_lock);
625 goto out;
626 }
627
628 set_pmd_at(mm, address, pmd, pmd_mknuma(*pmd));
629 ret += HPAGE_PMD_NR;
630 /* defer TLB flush to lower the overhead */
631 spin_unlock(&mm->page_table_lock);
632 goto out;
633 }
634
635 if (pmd_trans_unstable(pmd))
636 goto out;
637 VM_BUG_ON(!pmd_present(*pmd));
638
639 end = min(vma->vm_end, (address + PMD_SIZE) & PMD_MASK);
640 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
641 for (_address = address, _pte = pte; _address < end;
642 _pte++, _address += PAGE_SIZE) {
643 pte_t pteval = *_pte;
644 if (!pte_present(pteval))
645 continue;
646 if (pte_numa(pteval))
647 continue;
648 page = vm_normal_page(vma, _address, pteval);
649 if (unlikely(!page))
650 continue;
651 /* only check non-shared pages */
652 if (page_mapcount(page) != 1)
653 continue;
654
655 set_pte_at(mm, _address, _pte, pte_mknuma(pteval));
656
657 /* defer TLB flush to lower the overhead */
658 ret++;
659 }
660 pte_unmap_unlock(pte, ptl);
661
662 if (ret && !pmd_numa(*pmd)) {
663 spin_lock(&mm->page_table_lock);
664 set_pmd_at(mm, address, pmd, pmd_mknuma(*pmd));
665 spin_unlock(&mm->page_table_lock);
666 /* defer TLB flush to lower the overhead */
667 }
668
669out:
670 return ret;
671}
672
673/* Assumes mmap_sem is held */
674void
675change_prot_numa(struct vm_area_struct *vma,
676 unsigned long address, unsigned long end)
677{
678 struct mm_struct *mm = vma->vm_mm;
679 int progress = 0;
680
681 while (address < end) {
682 VM_BUG_ON(address < vma->vm_start ||
683 address + PAGE_SIZE > vma->vm_end);
684
685 progress += change_prot_numa_range(mm, vma, address);
686 address = (address + PMD_SIZE) & PMD_MASK;
687 }
688
689 /*
690 * Flush the TLB for the mm to start the NUMA hinting
691 * page faults after we finish scanning this vma part
692 * if there were any PTE updates
693 */
694 if (progress) {
695 mmu_notifier_invalidate_range_start(vma->vm_mm, address, end);
696 flush_tlb_range(vma, address, end);
697 mmu_notifier_invalidate_range_end(vma->vm_mm, address, end);
698 }
699}
700#else
/*
 * Variant built when CONFIG_ARCH_USES_NUMA_PROT_NONE is not defined:
 * does nothing with its arguments and always returns 0.
 */
static unsigned long
change_prot_numa(struct vm_area_struct *vma,
		 unsigned long addr, unsigned long end)
{
	return 0UL;
}
706#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */
707
568/* 708/*
569 * Check if all pages in a range are on a set of nodes. 709 * Check if all pages in a range are on a set of nodes.
570 * If pagelist != NULL then isolate pages from the LRU and 710 * If pagelist != NULL then isolate pages from the LRU and
@@ -583,22 +723,32 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
583 return ERR_PTR(-EFAULT); 723 return ERR_PTR(-EFAULT);
584 prev = NULL; 724 prev = NULL;
585 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { 725 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
726 unsigned long endvma = vma->vm_end;
727
728 if (endvma > end)
729 endvma = end;
730 if (vma->vm_start > start)
731 start = vma->vm_start;
732
586 if (!(flags & MPOL_MF_DISCONTIG_OK)) { 733 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
587 if (!vma->vm_next && vma->vm_end < end) 734 if (!vma->vm_next && vma->vm_end < end)
588 return ERR_PTR(-EFAULT); 735 return ERR_PTR(-EFAULT);
589 if (prev && prev->vm_end < vma->vm_start) 736 if (prev && prev->vm_end < vma->vm_start)
590 return ERR_PTR(-EFAULT); 737 return ERR_PTR(-EFAULT);
591 } 738 }
592 if (!is_vm_hugetlb_page(vma) && 739
593 ((flags & MPOL_MF_STRICT) || 740 if (is_vm_hugetlb_page(vma))
741 goto next;
742
743 if (flags & MPOL_MF_LAZY) {
744 change_prot_numa(vma, start, endvma);
745 goto next;
746 }
747
748 if ((flags & MPOL_MF_STRICT) ||
594 ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && 749 ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
595 vma_migratable(vma)))) { 750 vma_migratable(vma))) {
596 unsigned long endvma = vma->vm_end;
597 751
598 if (endvma > end)
599 endvma = end;
600 if (vma->vm_start > start)
601 start = vma->vm_start;
602 err = check_pgd_range(vma, start, endvma, nodes, 752 err = check_pgd_range(vma, start, endvma, nodes,
603 flags, private); 753 flags, private);
604 if (err) { 754 if (err) {
@@ -606,6 +756,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
606 break; 756 break;
607 } 757 }
608 } 758 }
759next:
609 prev = vma; 760 prev = vma;
610 } 761 }
611 return first; 762 return first;
@@ -1138,8 +1289,7 @@ static long do_mbind(unsigned long start, unsigned long len,
1138 int err; 1289 int err;
1139 LIST_HEAD(pagelist); 1290 LIST_HEAD(pagelist);
1140 1291
1141 if (flags & ~(unsigned long)(MPOL_MF_STRICT | 1292 if (flags & ~(unsigned long)MPOL_MF_VALID)
1142 MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
1143 return -EINVAL; 1293 return -EINVAL;
1144 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) 1294 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1145 return -EPERM; 1295 return -EPERM;
@@ -1162,6 +1312,9 @@ static long do_mbind(unsigned long start, unsigned long len,
1162 if (IS_ERR(new)) 1312 if (IS_ERR(new))
1163 return PTR_ERR(new); 1313 return PTR_ERR(new);
1164 1314
1315 if (flags & MPOL_MF_LAZY)
1316 new->flags |= MPOL_F_MOF;
1317
1165 /* 1318 /*
1166 * If we are using the default policy then operation 1319 * If we are using the default policy then operation
1167 * on discontinuous address spaces is okay after all 1320 * on discontinuous address spaces is okay after all
@@ -1198,13 +1351,15 @@ static long do_mbind(unsigned long start, unsigned long len,
1198 vma = check_range(mm, start, end, nmask, 1351 vma = check_range(mm, start, end, nmask,
1199 flags | MPOL_MF_INVERT, &pagelist); 1352 flags | MPOL_MF_INVERT, &pagelist);
1200 1353
1201 err = PTR_ERR(vma); 1354 err = PTR_ERR(vma); /* maybe ... */
1202 if (!IS_ERR(vma)) { 1355 if (!IS_ERR(vma) && mode != MPOL_NOOP)
1203 int nr_failed = 0;
1204
1205 err = mbind_range(mm, start, end, new); 1356 err = mbind_range(mm, start, end, new);
1206 1357
1358 if (!err) {
1359 int nr_failed = 0;
1360
1207 if (!list_empty(&pagelist)) { 1361 if (!list_empty(&pagelist)) {
1362 WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1208 nr_failed = migrate_pages(&pagelist, new_vma_page, 1363 nr_failed = migrate_pages(&pagelist, new_vma_page,
1209 (unsigned long)vma, 1364 (unsigned long)vma,
1210 false, MIGRATE_SYNC, 1365 false, MIGRATE_SYNC,
@@ -1213,7 +1368,7 @@ static long do_mbind(unsigned long start, unsigned long len,
1213 putback_lru_pages(&pagelist); 1368 putback_lru_pages(&pagelist);
1214 } 1369 }
1215 1370
1216 if (!err && nr_failed && (flags & MPOL_MF_STRICT)) 1371 if (nr_failed && (flags & MPOL_MF_STRICT))
1217 err = -EIO; 1372 err = -EIO;
1218 } else 1373 } else
1219 putback_lru_pages(&pagelist); 1374 putback_lru_pages(&pagelist);