path: root/mm/memory.c
author		Mel Gorman <mgorman@suse.de>	2013-10-07 06:29:25 -0400
committer	Ingo Molnar <mingo@kernel.org>	2013-10-09 08:47:55 -0400
commit		0f19c17929c952c6f0966d93ab05558e7bf814cc (patch)
tree		a881a5c520d8d0791dd73859f51c87285d3a06be	/mm/memory.c
parent		6688cc05473b36a0a3d3971e1adf1712919b32eb (diff)
mm: numa: Do not batch handle PMD pages
With the THP migration races closed it is still possible to occasionally
see corruption. The problem is related to handling PMD pages in batch.
When a page fault is handled it can be assumed that the page being
faulted will also be flushed from the TLB. The same flushing does not
happen when handling PMD pages in batch. Fixing this is straightforward,
but there are a number of reasons not to:

1. Multiple TLB flushes may have to be sent depending on what pages get
   migrated.

2. The handling of PMDs in batch means that faults get accounted to the
   task that is handling the fault. While care is taken to only mark
   PMDs where the last CPU and PID match, it can still have problems due
   to PID truncation when matching PIDs.

3. Batching on the PMD level may reduce faults but setting pmd_numa
   requires taking a heavy lock that can contend with THP migration, and
   handling the fault requires the release/acquisition of the PTL for
   every page migrated. It's still pretty heavy.

PMD batch handling is not something that people have ever been happy
with. This patch removes it; later patches will deal with the additional
fault overhead using more intelligent migrate rate adaption.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1381141781-10992-48-git-send-email-mgorman@suse.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
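The PID truncation mentioned in point 2 can be illustrated in isolation.
The sketch below is not kernel code: the 8-bit width and every name in it
are assumptions invented for the example (the kernel's real cpupid layout
is configuration dependent). It only demonstrates how two distinct PIDs
compare equal once only their low bits are stored, which is why faults on
batch-marked PMDs can end up attributed to the wrong task.

/*
 * Illustration only: why matching a stored "last PID" can misfire once
 * the PID has been truncated to a few bits. The width below is an
 * assumption for the example, not the kernel's actual cpupid layout.
 */
#include <stdio.h>

#define EXAMPLE_PID_BITS 8
#define EXAMPLE_PID_MASK ((1 << EXAMPLE_PID_BITS) - 1)

/* Compare only the low bits, as a truncated store forces us to do. */
static int truncated_pid_match(int stored_pid, int faulting_pid)
{
	return (stored_pid & EXAMPLE_PID_MASK) ==
	       (faulting_pid & EXAMPLE_PID_MASK);
}

int main(void)
{
	int last_pid  = 1234;        /* task that last touched the range */
	int other_pid = 1234 + 256;  /* unrelated task, same low 8 bits  */

	/* Prints "match: 1": the two tasks are indistinguishable here. */
	printf("match: %d\n", truncated_pid_match(last_pid, other_pid));
	return 0;
}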
Diffstat (limited to 'mm/memory.c')
-rw-r--r--	mm/memory.c	101
1 file changed, 2 insertions(+), 99 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index eba846bcf124..9898eeb9a21c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3606,103 +3606,6 @@ out:
 	return 0;
 }
 
-/* NUMA hinting page fault entry point for regular pmds */
-#ifdef CONFIG_NUMA_BALANCING
-static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
-		     unsigned long addr, pmd_t *pmdp)
-{
-	pmd_t pmd;
-	pte_t *pte, *orig_pte;
-	unsigned long _addr = addr & PMD_MASK;
-	unsigned long offset;
-	spinlock_t *ptl;
-	bool numa = false;
-	int last_cpupid;
-
-	spin_lock(&mm->page_table_lock);
-	pmd = *pmdp;
-	if (pmd_numa(pmd)) {
-		set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd));
-		numa = true;
-	}
-	spin_unlock(&mm->page_table_lock);
-
-	if (!numa)
-		return 0;
-
-	/* we're in a page fault so some vma must be in the range */
-	BUG_ON(!vma);
-	BUG_ON(vma->vm_start >= _addr + PMD_SIZE);
-	offset = max(_addr, vma->vm_start) & ~PMD_MASK;
-	VM_BUG_ON(offset >= PMD_SIZE);
-	orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl);
-	pte += offset >> PAGE_SHIFT;
-	for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) {
-		pte_t pteval = *pte;
-		struct page *page;
-		int page_nid = -1;
-		int target_nid;
-		bool migrated = false;
-		int flags = 0;
-
-		if (!pte_present(pteval))
-			continue;
-		if (!pte_numa(pteval))
-			continue;
-		if (addr >= vma->vm_end) {
-			vma = find_vma(mm, addr);
-			/* there's a pte present so there must be a vma */
-			BUG_ON(!vma);
-			BUG_ON(addr < vma->vm_start);
-		}
-		if (pte_numa(pteval)) {
-			pteval = pte_mknonnuma(pteval);
-			set_pte_at(mm, addr, pte, pteval);
-		}
-		page = vm_normal_page(vma, addr, pteval);
-		if (unlikely(!page))
-			continue;
-
-		/*
-		 * Avoid grouping on DSO/COW pages in specific and RO pages
-		 * in general, RO pages shouldn't hurt as much anyway since
-		 * they can be in shared cache state.
-		 */
-		if (!pte_write(pteval))
-			flags |= TNF_NO_GROUP;
-
-		last_cpupid = page_cpupid_last(page);
-		page_nid = page_to_nid(page);
-		target_nid = numa_migrate_prep(page, vma, addr, page_nid);
-		pte_unmap_unlock(pte, ptl);
-		if (target_nid != -1) {
-			migrated = migrate_misplaced_page(page, vma, target_nid);
-			if (migrated) {
-				page_nid = target_nid;
-				flags |= TNF_MIGRATED;
-			}
-		} else {
-			put_page(page);
-		}
-
-		if (page_nid != -1)
-			task_numa_fault(last_cpupid, page_nid, 1, flags);
-
-		pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
-	}
-	pte_unmap_unlock(orig_pte, ptl);
-
-	return 0;
-}
-#else
-static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
-		     unsigned long addr, pmd_t *pmdp)
-{
-	BUG();
-	return 0;
-}
-#endif /* CONFIG_NUMA_BALANCING */
-
 /*
  * These routines also need to handle stuff like marking pages dirty
  * and/or accessed for architectures that don't do it in hardware (most
@@ -3841,8 +3744,8 @@ retry:
 		}
 	}
 
-	if (pmd_numa(*pmd))
-		return do_pmd_numa_page(mm, vma, address, pmd);
+	/* THP should already have been handled */
+	BUG_ON(pmd_numa(*pmd));
 
 	/*
 	 * Use __pte_alloc instead of pte_alloc_map, because we can't