path: root/mm/memory.c
author		Mel Gorman <mgorman@suse.de>	2013-10-07 06:29:25 -0400
committer	Ingo Molnar <mingo@kernel.org>	2013-10-09 08:47:55 -0400
commit		0f19c17929c952c6f0966d93ab05558e7bf814cc (patch)
tree		a881a5c520d8d0791dd73859f51c87285d3a06be	/mm/memory.c
parent		6688cc05473b36a0a3d3971e1adf1712919b32eb (diff)
mm: numa: Do not batch handle PMD pages
With the THP migration races closed it is still possible to occasionally
see corruption. The problem is related to handling PMD pages in batch.
When a page fault is handled it can be assumed that the page being
faulted will also be flushed from the TLB. The same flushing does not
happen when handling PMD pages in batch. Fixing this is straightforward,
but there are a number of reasons not to:

1. Multiple TLB flushes may have to be sent depending on what pages get
   migrated.

2. The handling of PMDs in batch means that faults get accounted to the
   task that is handling the fault. While care is taken to only mark
   PMDs where the last CPU and PID match, it can still have problems due
   to PID truncation when matching PIDs.

3. Batching on the PMD level may reduce faults but setting pmd_numa
   requires taking a heavy lock that can contend with THP migration, and
   handling the fault requires the release/acquisition of the PTL for
   every page migrated. It's still pretty heavy.

PMD batch handling is not something that people have ever been happy
with. This patch removes it; later patches will deal with the additional
fault overhead using more intelligent migrate rate adaption.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1381141781-10992-48-git-send-email-mgorman@suse.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
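The PID truncation mentioned in point 2 can be illustrated in isolation.
The sketch below is not kernel code: the 8-bit width and every name in it
are assumptions invented for the example (the kernel's real cpupid layout
is configuration dependent). It only demonstrates how two distinct PIDs
compare equal once only their low bits are stored, which is why faults on
batch-marked PMDs can end up attributed to the wrong task.

/*
 * Illustration only: why matching a stored "last PID" can misfire once
 * the PID has been truncated to a few bits. The width below is an
 * assumption for the example, not the kernel's actual cpupid layout.
 */
#include <stdio.h>

#define EXAMPLE_PID_BITS 8
#define EXAMPLE_PID_MASK ((1 << EXAMPLE_PID_BITS) - 1)

/* Compare only the low bits, as a truncated store forces us to do. */
static int truncated_pid_match(int stored_pid, int faulting_pid)
{
	return (stored_pid & EXAMPLE_PID_MASK) ==
	       (faulting_pid & EXAMPLE_PID_MASK);
}

int main(void)
{
	int last_pid  = 1234;        /* task that last touched the range */
	int other_pid = 1234 + 256;  /* unrelated task, same low 8 bits  */

	/* Prints "match: 1": the two tasks are indistinguishable here. */
	printf("match: %d\n", truncated_pid_match(last_pid, other_pid));
	return 0;
}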
Diffstat (limited to 'mm/memory.c')
-rw-r--r--	mm/memory.c	101
1 file changed, 2 insertions(+), 99 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index eba846bcf124..9898eeb9a21c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3606,103 +3606,6 @@ out:
 	return 0;
 }
 
-/* NUMA hinting page fault entry point for regular pmds */
-#ifdef CONFIG_NUMA_BALANCING
-static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
-		     unsigned long addr, pmd_t *pmdp)
-{
-	pmd_t pmd;
-	pte_t *pte, *orig_pte;
-	unsigned long _addr = addr & PMD_MASK;
-	unsigned long offset;
-	spinlock_t *ptl;
-	bool numa = false;
-	int last_cpupid;
-
-	spin_lock(&mm->page_table_lock);
-	pmd = *pmdp;
-	if (pmd_numa(pmd)) {
-		set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd));
-		numa = true;
-	}
-	spin_unlock(&mm->page_table_lock);
-
-	if (!numa)
-		return 0;
-
-	/* we're in a page fault so some vma must be in the range */
-	BUG_ON(!vma);
-	BUG_ON(vma->vm_start >= _addr + PMD_SIZE);
-	offset = max(_addr, vma->vm_start) & ~PMD_MASK;
-	VM_BUG_ON(offset >= PMD_SIZE);
-	orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl);
-	pte += offset >> PAGE_SHIFT;
-	for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) {
-		pte_t pteval = *pte;
-		struct page *page;
-		int page_nid = -1;
-		int target_nid;
-		bool migrated = false;
-		int flags = 0;
-
-		if (!pte_present(pteval))
-			continue;
-		if (!pte_numa(pteval))
-			continue;
-		if (addr >= vma->vm_end) {
-			vma = find_vma(mm, addr);
-			/* there's a pte present so there must be a vma */
-			BUG_ON(!vma);
-			BUG_ON(addr < vma->vm_start);
-		}
-		if (pte_numa(pteval)) {
-			pteval = pte_mknonnuma(pteval);
-			set_pte_at(mm, addr, pte, pteval);
-		}
-		page = vm_normal_page(vma, addr, pteval);
-		if (unlikely(!page))
-			continue;
-
-		/*
-		 * Avoid grouping on DSO/COW pages in specific and RO pages
-		 * in general, RO pages shouldn't hurt as much anyway since
-		 * they can be in shared cache state.
-		 */
-		if (!pte_write(pteval))
-			flags |= TNF_NO_GROUP;
-
-		last_cpupid = page_cpupid_last(page);
-		page_nid = page_to_nid(page);
-		target_nid = numa_migrate_prep(page, vma, addr, page_nid);
-		pte_unmap_unlock(pte, ptl);
-		if (target_nid != -1) {
-			migrated = migrate_misplaced_page(page, vma, target_nid);
-			if (migrated) {
-				page_nid = target_nid;
-				flags |= TNF_MIGRATED;
-			}
-		} else {
-			put_page(page);
-		}
-
-		if (page_nid != -1)
-			task_numa_fault(last_cpupid, page_nid, 1, flags);
-
-		pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
-	}
-	pte_unmap_unlock(orig_pte, ptl);
-
-	return 0;
-}
-#else
-static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
-		     unsigned long addr, pmd_t *pmdp)
-{
-	BUG();
-	return 0;
-}
-#endif /* CONFIG_NUMA_BALANCING */
-
 /*
  * These routines also need to handle stuff like marking pages dirty
  * and/or accessed for architectures that don't do it in hardware (most
@@ -3841,8 +3744,8 @@ retry:
 		}
 	}
 
-	if (pmd_numa(*pmd))
-		return do_pmd_numa_page(mm, vma, address, pmd);
+	/* THP should already have been handled */
+	BUG_ON(pmd_numa(*pmd));
 
 	/*
 	 * Use __pte_alloc instead of pte_alloc_map, because we can't