mm: numa: Migrate pages handled during a pmd_numa hinting fault

To say that the PMD handling code was incorrectly transferred from autonuma is an understatement. The intention was to handle a PMD's worth of pages in the same fault and effectively batch the taking of the PTL and page migration. The copied version instead has the impact of clearing a number of pte_numa PTE entries, and whether any page migration takes place depends on racing. This just happens to work in some cases. This patch handles pte_numa faults in batch when a pmd_numa fault is handled. The pages are migrated if they are currently misplaced. Essentially this is making an assumption that NUMA locality is on a PMD boundary but, if necessary, that could be addressed by only setting pmd_numa when all the pages within that PMD are on the same node. Signed-off-by: Mel Gorman <mgorman@suse.de>
author: Mel Gorman <mgorman@suse.de> 2012-11-14 20:24:32 -0500
committer: Mel Gorman <mgorman@suse.de> 2012-12-11 09:42:49 -0500
commit: 9532fec118d485ea37ab6e3ea372d68cd8b4cd0d (patch)
tree: 5076f3da1ff244df554e99b8701749423a6b92ad
parent: 5606e3877ad8baea42f3a71ebde0a03622bbb551 (diff)
2 files changed, 54 insertions, 22 deletions
diff --git a/mm/memory.c b/mm/memory.c
index 8a7b4ccbe136..84c6d9eab182 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3449,6 +3449,18 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
 }
+int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
+                                unsigned long addr, int current_nid)
+{
+        get_page(page);
+        count_vm_numa_event(NUMA_HINT_FAULTS);
+        if (current_nid == numa_node_id())
+                count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
+        return mpol_misplaced(page, vma, addr);
+}
 int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
                   unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd)
 {
@@ -3477,18 +3489,14 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
        set_pte_at(mm, addr, ptep, pte);
        update_mmu_cache(vma, addr, ptep);
-        count_vm_numa_event(NUMA_HINT_FAULTS);
        page = vm_normal_page(vma, addr, pte);
        if (!page) {
                pte_unmap_unlock(ptep, ptl);
                return 0;
        }
-        get_page(page);
        current_nid = page_to_nid(page);
-        if (current_nid == numa_node_id())
+        target_nid = numa_migrate_prep(page, vma, addr, current_nid);
-                count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
-        target_nid = mpol_misplaced(page, vma, addr);
        pte_unmap_unlock(ptep, ptl);
        if (target_nid == -1) {
                /*
@@ -3505,7 +3513,8 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
                current_nid = target_nid;
 out:
-        task_numa_fault(current_nid, 1);
+        if (current_nid != -1)
+                task_numa_fault(current_nid, 1);
        return 0;
 }
@@ -3521,8 +3530,6 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
        spinlock_t *ptl;
        bool numa = false;
        int local_nid = numa_node_id();
-        unsigned long nr_faults = 0;
-        unsigned long nr_faults_local = 0;
        spin_lock(&mm->page_table_lock);
        pmd = *pmdp;
@@ -3545,7 +3552,8 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
        for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) {
                pte_t pteval = *pte;
                struct page *page;
-                int curr_nid;
+                int curr_nid = local_nid;
+                int target_nid;
                if (!pte_present(pteval))
                        continue;
                if (!pte_numa(pteval))
@@ -3566,21 +3574,30 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
                /* only check non-shared pages */
                if (unlikely(page_mapcount(page) != 1))
                        continue;
-                pte_unmap_unlock(pte, ptl);
-                curr_nid = page_to_nid(page);
+                /*
-                task_numa_fault(curr_nid, 1);
+                 * Note that the NUMA fault is later accounted to either
+                 * the node that is currently running or where the page is
+                 * migrated to.
+                 */
+                curr_nid = local_nid;
+                target_nid = numa_migrate_prep(page, vma, addr,
+                                               page_to_nid(page));
+                if (target_nid == -1) {
+                        put_page(page);
+                        continue;
+                }
-                nr_faults++;
+                /* Migrate to the requested node */
-                if (curr_nid == local_nid)
+                pte_unmap_unlock(pte, ptl);
-                        nr_faults_local++;
+                if (migrate_misplaced_page(page, target_nid))
+                        curr_nid = target_nid;
+                task_numa_fault(curr_nid, 1);
                pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
        }
        pte_unmap_unlock(orig_pte, ptl);
-        count_vm_numa_events(NUMA_HINT_FAULTS, nr_faults);
-        count_vm_numa_events(NUMA_HINT_FAULTS_LOCAL, nr_faults_local);
        return 0;
 }
 #else
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 7ef6ae964e8f..dce6fb48edc4 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -37,12 +37,14 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
 static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                unsigned long addr, unsigned long end, pgprot_t newprot,
-                int dirty_accountable, int prot_numa)
+                int dirty_accountable, int prot_numa, bool *ret_all_same_node)
 {
        struct mm_struct *mm = vma->vm_mm;
        pte_t *pte, oldpte;
        spinlock_t *ptl;
        unsigned long pages = 0;
+        bool all_same_node = true;
+        int last_nid = -1;
        pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
        arch_enter_lazy_mmu_mode();
@@ -61,6 +63,12 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                                page = vm_normal_page(vma, addr, oldpte);
                                if (page) {
+                                        int this_nid = page_to_nid(page);
+                                        if (last_nid == -1)
+                                                last_nid = this_nid;
+                                        if (last_nid != this_nid)
+                                                all_same_node = false;
                                        /* only check non-shared pages */
                                        if (!pte_numa(oldpte) &&
                                            page_mapcount(page) == 1) {
@@ -81,7 +89,6 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                        if (updated)
                                pages++;
                        ptep_modify_prot_commit(mm, addr, pte, ptent);
                } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) {
                        swp_entry_t entry = pte_to_swp_entry(oldpte);
@@ -101,6 +108,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
        arch_leave_lazy_mmu_mode();
        pte_unmap_unlock(pte - 1, ptl);
+        *ret_all_same_node = all_same_node;
        return pages;
 }
@@ -127,6 +135,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t *
        pmd_t *pmd;
        unsigned long next;
        unsigned long pages = 0;
+        bool all_same_node;
        pmd = pmd_offset(pud, addr);
        do {
@@ -143,9 +152,15 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t *
                if (pmd_none_or_clear_bad(pmd))
                        continue;
                pages += change_pte_range(vma, pmd, addr, next, newprot,
-                                 dirty_accountable, prot_numa);
+                                 dirty_accountable, prot_numa, &all_same_node);
-                if (prot_numa)
+                /*
+                 * If we are changing protections for NUMA hinting faults then
+                 * set pmd_numa if the examined pages were all on the same
+                 * node. This allows a regular PMD to be handled as one fault
+                 * and effectively batches the taking of the PTL
+                 */
+                if (prot_numa && all_same_node)
                        change_pmd_protnuma(vma->vm_mm, addr, pmd);
        } while (pmd++, addr = next, addr != end);
author	Mel Gorman <mgorman@suse.de>	2012-11-14 20:24:32 -0500
committer	Mel Gorman <mgorman@suse.de>	2012-12-11 09:42:49 -0500
commit	9532fec118d485ea37ab6e3ea372d68cd8b4cd0d (patch)
tree	5076f3da1ff244df554e99b8701749423a6b92ad
parent	5606e3877ad8baea42f3a71ebde0a03622bbb551 (diff)