author    Linus Torvalds <torvalds@linux-foundation.org>  2012-12-16 17:33:25 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>  2012-12-16 18:18:08 -0500
commit    3d59eebc5e137bd89c6351e4c70e90ba1d0dc234 (patch)
tree      b4ddfd0b057454a7437a3b4e3074a3b8b4b03817 /mm/memory.c
parent    11520e5e7c1855fc3bf202bb3be35a39d9efa034 (diff)
parent    4fc3f1d66b1ef0d7b8dc11f4ff1cc510f78b37d6 (diff)
Merge tag 'balancenuma-v11' of git://git.kernel.org/pub/scm/linux/kernel/git/mel/linux-balancenuma
Pull Automatic NUMA Balancing bare-bones from Mel Gorman:
 "There are three implementations for NUMA balancing: this tree
  (balancenuma), numacore, which has been developed in tip/master, and
  autonuma, which is in aa.git.

  In almost all respects balancenuma is the dumbest of the three because
  its main impact is on the VM side with no attempt to be smart about
  scheduling.  In the interest of getting the ball rolling, it would be
  desirable to see this much merged for 3.8 with the view to building
  scheduler smarts on top and adapting the VM where required for 3.9.

  The most recent set of comparisons available from different people are

    mel:    https://lkml.org/lkml/2012/12/9/108
    mingo:  https://lkml.org/lkml/2012/12/7/331
    tglx:   https://lkml.org/lkml/2012/12/10/437
    srikar: https://lkml.org/lkml/2012/12/10/397

  The results are a mixed bag.  In my own tests, balancenuma does
  reasonably well.  It's dumb as rocks and does not regress against
  mainline.  On the other hand, Ingo's tests show that balancenuma is
  incapable of converging for the workloads driven by perf, which is bad
  but is potentially explained by the lack of scheduler smarts.  Thomas'
  results show balancenuma improves on mainline but falls far short of
  numacore or autonuma.  Srikar's results indicate we all suffer on a
  large machine with imbalanced node sizes.

  My own testing showed that recent numacore results have improved
  dramatically, particularly in the last week, but not universally.
  We've butted heads heavily on system CPU usage and high levels of
  migration even when it shows that overall performance is better.
  There are also cases where it regresses.  Of interest is that for
  specjbb in some configurations it will regress for lower numbers of
  warehouses and show gains for higher numbers, which is not reported by
  the tool by default and is sometimes missed in reports.  Recently I
  reported for numacore that the JVM was crashing with
  NullPointerExceptions, but it is currently unclear what the source of
  this problem is.  Initially I thought it was in how numacore batch
  handles PTEs, but I no longer think this is the case.  It's possible
  numacore is just able to trigger it due to higher rates of migration.

  These reports were quite late in the cycle, so I/we would like to
  start with this tree as it contains much of the code we can agree on
  and has not changed significantly over the last 2-3 weeks."

* tag 'balancenuma-v11' of git://git.kernel.org/pub/scm/linux/kernel/git/mel/linux-balancenuma: (50 commits)
  mm/rmap, migration: Make rmap_walk_anon() and try_to_unmap_anon() more scalable
  mm/rmap: Convert the struct anon_vma::mutex to an rwsem
  mm: migrate: Account a transhuge page properly when rate limiting
  mm: numa: Account for failed allocations and isolations as migration failures
  mm: numa: Add THP migration for the NUMA working set scanning fault case build fix
  mm: numa: Add THP migration for the NUMA working set scanning fault case.
  mm: sched: numa: Delay PTE scanning until a task is scheduled on a new node
  mm: sched: numa: Control enabling and disabling of NUMA balancing if !SCHED_DEBUG
  mm: sched: numa: Control enabling and disabling of NUMA balancing
  mm: sched: Adapt the scanning rate if a NUMA hinting fault does not migrate
  mm: numa: Use a two-stage filter to restrict pages being migrated for unlikely task<->node relationships
  mm: numa: migrate: Set last_nid on newly allocated page
  mm: numa: split_huge_page: Transfer last_nid on tail page
  mm: numa: Introduce last_nid to the page frame
  sched: numa: Slowly increase the scanning period as NUMA faults are handled
  mm: numa: Rate limit setting of pte_numa if node is saturated
  mm: numa: Rate limit the amount of memory that is migrated between nodes
  mm: numa: Structures for Migrate On Fault per NUMA migration rate limiting
  mm: numa: Migrate pages handled during a pmd_numa hinting fault
  mm: numa: Migrate on reference policy
  ...
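The mechanism these patches build is simple in outline: a scanner periodically marks a task's page table entries with _PAGE_NUMA so that the next access traps, and the resulting hinting fault clears the marker, asks the memory policy whether the page is misplaced, and migrates it if so (see do_numa_page() in the diff below). The toy program below models only that decision flow; every type and function name in it is a made-up stand-in for illustration, not the kernel API introduced by this merge.

    #include <stdbool.h>
    #include <stdio.h>

    /* Hypothetical stand-ins for kernel structures; illustration only. */
    struct fake_page { int nid; };                  /* node the data lives on */
    struct fake_pte  { bool numa_hint; struct fake_page *page; };

    static int this_node;                           /* node the task runs on */

    /* Rough model of mpol_misplaced(): -1 means "leave the page where it is". */
    static int preferred_node(const struct fake_page *page)
    {
        return page->nid == this_node ? -1 : this_node;
    }

    /* Rough model of the hinting fault: clear the marker, migrate if misplaced. */
    static void numa_hinting_fault(struct fake_pte *pte)
    {
        int target;

        pte->numa_hint = false;                     /* like pte_mknonnuma() */
        target = preferred_node(pte->page);
        if (target == -1) {
            printf("page already on node %d, no migration\n", pte->page->nid);
            return;
        }
        pte->page->nid = target;                    /* like migrate_misplaced_page() */
        printf("migrated page to node %d\n", target);
    }

    int main(void)
    {
        struct fake_page remote = { .nid = 1 };
        struct fake_pte pte = { .numa_hint = true, .page = &remote };

        if (pte.numa_hint)                          /* the scanner marked this pte earlier */
            numa_hinting_fault(&pte);
        return 0;
    }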
Diffstat (limited to 'mm/memory.c')
-rw-r--r--  mm/memory.c  198
1 file changed, 195 insertions(+), 3 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index db2e9e797a05..e6a3b933517e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -57,6 +57,7 @@
 #include <linux/swapops.h>
 #include <linux/elf.h>
 #include <linux/gfp.h>
+#include <linux/migrate.h>
 
 #include <asm/io.h>
 #include <asm/pgalloc.h>
@@ -1503,6 +1504,8 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
                 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
                 goto out;
         }
+        if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
+                goto no_page_table;
         if (pmd_trans_huge(*pmd)) {
                 if (flags & FOLL_SPLIT) {
                         split_huge_page_pmd(vma, address, pmd);
@@ -1532,6 +1535,8 @@ split_fallthrough:
         pte = *ptep;
         if (!pte_present(pte))
                 goto no_page;
+        if ((flags & FOLL_NUMA) && pte_numa(pte))
+                goto no_page;
         if ((flags & FOLL_WRITE) && !pte_write(pte))
                 goto unlock;
 
@@ -1683,6 +1688,19 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                         (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
         vm_flags &= (gup_flags & FOLL_FORCE) ?
                         (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
+
+        /*
+         * If FOLL_FORCE and FOLL_NUMA are both set, handle_mm_fault
+         * would be called on PROT_NONE ranges. We must never invoke
+         * handle_mm_fault on PROT_NONE ranges or the NUMA hinting
+         * page faults would unprotect the PROT_NONE ranges if
+         * _PAGE_NUMA and _PAGE_PROTNONE are sharing the same pte/pmd
+         * bitflag. So to avoid that, don't set FOLL_NUMA if
+         * FOLL_FORCE is set.
+         */
+        if (!(gup_flags & FOLL_FORCE))
+                gup_flags |= FOLL_NUMA;
+
         i = 0;
 
         do {
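The comment added above is the subtle part of this hunk: on architectures where _PAGE_NUMA shares a bit with _PAGE_PROTNONE, letting a FOLL_FORCE walk also honour FOLL_NUMA would route PROT_NONE ranges into handle_mm_fault, and the hinting-fault path would clear the shared bit and silently unprotect them. Here is a toy model of that failure mode, using a single shared flag and made-up names purely for illustration:

    #include <stdbool.h>
    #include <stdio.h>

    /* One flag bit shared by "PROT_NONE" and "NUMA hint"; a made-up model. */
    struct toy_pte {
        bool shared_protnone_numa_bit;   /* the bit both meanings map to */
        bool user_asked_for_prot_none;   /* what the user intended       */
    };

    /* What the hinting-fault path does: clear the shared bit. */
    static void toy_numa_hinting_fault(struct toy_pte *pte)
    {
        pte->shared_protnone_numa_bit = false;
    }

    int main(void)
    {
        /* A range the user mprotect()ed to PROT_NONE sets the same bit. */
        struct toy_pte pte = {
            .shared_protnone_numa_bit = true,
            .user_asked_for_prot_none = true,
        };

        /*
         * If a forced walk also honoured the NUMA hint, it would fault on
         * this entry and "fix" it, stripping the user's protection:
         */
        toy_numa_hinting_fault(&pte);

        if (pte.user_asked_for_prot_none && !pte.shared_protnone_numa_bit)
            printf("PROT_NONE range was silently unprotected\n");
        return 0;
    }

Only setting FOLL_NUMA when FOLL_FORCE is absent, as the two added lines do, keeps forced walks away from that path entirely.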
@@ -3412,6 +3430,169 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
         return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
 }
 
+int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
+                        unsigned long addr, int current_nid)
+{
+        get_page(page);
+
+        count_vm_numa_event(NUMA_HINT_FAULTS);
+        if (current_nid == numa_node_id())
+                count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
+
+        return mpol_misplaced(page, vma, addr);
+}
+
+int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+                   unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd)
+{
+        struct page *page = NULL;
+        spinlock_t *ptl;
+        int current_nid = -1;
+        int target_nid;
+        bool migrated = false;
+
+        /*
+         * The "pte" at this point cannot be used safely without
+         * validation through pte_unmap_same(). It's of NUMA type but
+         * the pfn may be screwed if the read is non atomic.
+         *
+         * ptep_modify_prot_start is not called as this is clearing
+         * the _PAGE_NUMA bit and it is not really expected that there
+         * would be concurrent hardware modifications to the PTE.
+         */
+        ptl = pte_lockptr(mm, pmd);
+        spin_lock(ptl);
+        if (unlikely(!pte_same(*ptep, pte))) {
+                pte_unmap_unlock(ptep, ptl);
+                goto out;
+        }
+
+        pte = pte_mknonnuma(pte);
+        set_pte_at(mm, addr, ptep, pte);
+        update_mmu_cache(vma, addr, ptep);
+
+        page = vm_normal_page(vma, addr, pte);
+        if (!page) {
+                pte_unmap_unlock(ptep, ptl);
+                return 0;
+        }
+
+        current_nid = page_to_nid(page);
+        target_nid = numa_migrate_prep(page, vma, addr, current_nid);
+        pte_unmap_unlock(ptep, ptl);
+        if (target_nid == -1) {
+                /*
+                 * Account for the fault against the current node if it is not
+                 * being replaced, regardless of where the page is located.
+                 */
+                current_nid = numa_node_id();
+                put_page(page);
+                goto out;
+        }
+
+        /* Migrate to the requested node */
+        migrated = migrate_misplaced_page(page, target_nid);
+        if (migrated)
+                current_nid = target_nid;
+
+out:
+        if (current_nid != -1)
+                task_numa_fault(current_nid, 1, migrated);
+        return 0;
+}
+
+/* NUMA hinting page fault entry point for regular pmds */
+#ifdef CONFIG_NUMA_BALANCING
+static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+                     unsigned long addr, pmd_t *pmdp)
+{
+        pmd_t pmd;
+        pte_t *pte, *orig_pte;
+        unsigned long _addr = addr & PMD_MASK;
+        unsigned long offset;
+        spinlock_t *ptl;
+        bool numa = false;
+        int local_nid = numa_node_id();
+
+        spin_lock(&mm->page_table_lock);
+        pmd = *pmdp;
+        if (pmd_numa(pmd)) {
+                set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd));
+                numa = true;
+        }
+        spin_unlock(&mm->page_table_lock);
+
+        if (!numa)
+                return 0;
+
+        /* we're in a page fault so some vma must be in the range */
+        BUG_ON(!vma);
+        BUG_ON(vma->vm_start >= _addr + PMD_SIZE);
+        offset = max(_addr, vma->vm_start) & ~PMD_MASK;
+        VM_BUG_ON(offset >= PMD_SIZE);
+        orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl);
+        pte += offset >> PAGE_SHIFT;
+        for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) {
+                pte_t pteval = *pte;
+                struct page *page;
+                int curr_nid = local_nid;
+                int target_nid;
+                bool migrated;
+                if (!pte_present(pteval))
+                        continue;
+                if (!pte_numa(pteval))
+                        continue;
+                if (addr >= vma->vm_end) {
+                        vma = find_vma(mm, addr);
+                        /* there's a pte present so there must be a vma */
+                        BUG_ON(!vma);
+                        BUG_ON(addr < vma->vm_start);
+                }
+                if (pte_numa(pteval)) {
+                        pteval = pte_mknonnuma(pteval);
+                        set_pte_at(mm, addr, pte, pteval);
+                }
+                page = vm_normal_page(vma, addr, pteval);
+                if (unlikely(!page))
+                        continue;
+                /* only check non-shared pages */
+                if (unlikely(page_mapcount(page) != 1))
+                        continue;
+
+                /*
+                 * Note that the NUMA fault is later accounted to either
+                 * the node that is currently running or where the page is
+                 * migrated to.
+                 */
+                curr_nid = local_nid;
+                target_nid = numa_migrate_prep(page, vma, addr,
+                                               page_to_nid(page));
+                if (target_nid == -1) {
+                        put_page(page);
+                        continue;
+                }
+
+                /* Migrate to the requested node */
+                pte_unmap_unlock(pte, ptl);
+                migrated = migrate_misplaced_page(page, target_nid);
+                if (migrated)
+                        curr_nid = target_nid;
+                task_numa_fault(curr_nid, 1, migrated);
+
+                pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+        }
+        pte_unmap_unlock(orig_pte, ptl);
+
+        return 0;
+}
+#else
+static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+                     unsigned long addr, pmd_t *pmdp)
+{
+        BUG();
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
 /*
  * These routines also need to handle stuff like marking pages dirty
  * and/or accessed for architectures that don't do it in hardware (most
@@ -3450,6 +3631,9 @@ int handle_pte_fault(struct mm_struct *mm,
                                         pte, pmd, flags, entry);
         }
 
+        if (pte_numa(entry))
+                return do_numa_page(mm, vma, address, entry, pte, pmd);
+
         ptl = pte_lockptr(mm, pmd);
         spin_lock(ptl);
         if (unlikely(!pte_same(*pte, entry)))
@@ -3520,8 +3704,11 @@ retry:
                 if (pmd_trans_huge(orig_pmd)) {
                         unsigned int dirty = flags & FAULT_FLAG_WRITE;
 
-                        if (dirty && !pmd_write(orig_pmd) &&
-                            !pmd_trans_splitting(orig_pmd)) {
+                        if (pmd_numa(orig_pmd))
+                                return do_huge_pmd_numa_page(mm, vma, address,
+                                                             orig_pmd, pmd);
+
+                        if (dirty && !pmd_write(orig_pmd)) {
                                 ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
                                                           orig_pmd);
                                 /*
@@ -3536,16 +3723,21 @@ retry:
                                 huge_pmd_set_accessed(mm, vma, address, pmd,
                                                       orig_pmd, dirty);
                         }
+
                         return 0;
                 }
         }
 
+        if (pmd_numa(*pmd))
+                return do_pmd_numa_page(mm, vma, address, pmd);
+
         /*
          * Use __pte_alloc instead of pte_alloc_map, because we can't
          * run pte_offset_map on the pmd, if an huge pmd could
          * materialize from under us from a different thread.
          */
-        if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, vma, pmd, address))
+        if (unlikely(pmd_none(*pmd)) &&
+            unlikely(__pte_alloc(mm, vma, pmd, address)))
                 return VM_FAULT_OOM;
         /* if an huge pmd materialized from under us just retry later */
         if (unlikely(pmd_trans_huge(*pmd)))
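For anyone testing this tree, the NUMA_HINT_FAULTS and NUMA_HINT_FAULTS_LOCAL events counted in numa_migrate_prep() above are exported as vmstat counters by the wider series. The short reader below assumes the counters appear in /proc/vmstat under the names numa_hint_faults, numa_hint_faults_local and numa_pages_migrated; those names are an assumption taken from the rest of the patch set, not from this file's diff.

    #include <stdio.h>
    #include <string.h>

    /* Print the NUMA-balancing counters from /proc/vmstat, if present.
     * The counter names below are assumptions, not taken from this diff. */
    int main(void)
    {
        const char *keys[] = { "numa_hint_faults", "numa_hint_faults_local",
                               "numa_pages_migrated" };
        char name[64];
        unsigned long long value;
        FILE *f = fopen("/proc/vmstat", "r");

        if (!f) {
            perror("/proc/vmstat");
            return 1;
        }
        while (fscanf(f, "%63s %llu", name, &value) == 2) {
            for (size_t i = 0; i < sizeof(keys) / sizeof(keys[0]); i++)
                if (strcmp(name, keys[i]) == 0)
                    printf("%-24s %llu\n", name, value);
        }
        fclose(f);
        return 0;
    }

On a kernel built without CONFIG_NUMA_BALANCING the counters simply will not appear and the program prints nothing.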