Diffstat (limited to 'mm/memory.c')
-rw-r--r--	mm/memory.c	198
1 file changed, 195 insertions, 3 deletions
diff --git a/mm/memory.c b/mm/memory.c
index db2e9e797a05..e6a3b933517e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -57,6 +57,7 @@
 #include <linux/swapops.h>
 #include <linux/elf.h>
 #include <linux/gfp.h>
+#include <linux/migrate.h>
 
 #include <asm/io.h>
 #include <asm/pgalloc.h>
@@ -1503,6 +1504,8 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 		page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
 		goto out;
 	}
+	if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
+		goto no_page_table;
 	if (pmd_trans_huge(*pmd)) {
 		if (flags & FOLL_SPLIT) {
 			split_huge_page_pmd(vma, address, pmd);
@@ -1532,6 +1535,8 @@ split_fallthrough:
 	pte = *ptep;
 	if (!pte_present(pte))
 		goto no_page;
+	if ((flags & FOLL_NUMA) && pte_numa(pte))
+		goto no_page;
 	if ((flags & FOLL_WRITE) && !pte_write(pte))
 		goto unlock;
 
@@ -1683,6 +1688,19 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			(VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
 	vm_flags &= (gup_flags & FOLL_FORCE) ?
 			(VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
+
+	/*
+	 * If FOLL_FORCE and FOLL_NUMA are both set, handle_mm_fault
+	 * would be called on PROT_NONE ranges. We must never invoke
+	 * handle_mm_fault on PROT_NONE ranges or the NUMA hinting
+	 * page faults would unprotect the PROT_NONE ranges if
+	 * _PAGE_NUMA and _PAGE_PROTNONE are sharing the same pte/pmd
+	 * bitflag. So to avoid that, don't set FOLL_NUMA if
+	 * FOLL_FORCE is set.
+	 */
+	if (!(gup_flags & FOLL_FORCE))
+		gup_flags |= FOLL_NUMA;
+
 	i = 0;
 
 	do {
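
The comment in the hunk above explains why FOLL_NUMA is only added when FOLL_FORCE is absent: a forced fault on a PROT_NONE range could clear a protection bit that _PAGE_NUMA may share with _PAGE_PROTNONE. A minimal standalone userspace sketch of that flag decision is shown below; the numeric flag values and the adjust_gup_flags() helper are illustrative assumptions, not the kernel's definitions.

/* Userspace model of the gup_flags decision above. The flag values are
 * placeholders for the sketch, not the kernel's actual bit assignments. */
#include <stdio.h>

#define FOLL_WRITE 0x001
#define FOLL_FORCE 0x010
#define FOLL_NUMA  0x200	/* hypothetical value for this sketch */

static unsigned int adjust_gup_flags(unsigned int gup_flags)
{
	/* Mirror of the hunk above: never combine FOLL_NUMA with FOLL_FORCE. */
	if (!(gup_flags & FOLL_FORCE))
		gup_flags |= FOLL_NUMA;
	return gup_flags;
}

int main(void)
{
	printf("plain read:   %#x\n", adjust_gup_flags(0));
	printf("forced write: %#x\n", adjust_gup_flags(FOLL_WRITE | FOLL_FORCE));
	return 0;
}
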
@@ -3412,6 +3430,169 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
 }
 
+int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
+				unsigned long addr, int current_nid)
+{
+	get_page(page);
+
+	count_vm_numa_event(NUMA_HINT_FAULTS);
+	if (current_nid == numa_node_id())
+		count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
+
+	return mpol_misplaced(page, vma, addr);
+}
+
+int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+		   unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd)
+{
+	struct page *page = NULL;
+	spinlock_t *ptl;
+	int current_nid = -1;
+	int target_nid;
+	bool migrated = false;
+
+	/*
+	 * The "pte" at this point cannot be used safely without
+	 * validation through pte_unmap_same(). It's of NUMA type but
+	 * the pfn may be screwed if the read is non atomic.
+	 *
+	 * ptep_modify_prot_start is not called as this is clearing
+	 * the _PAGE_NUMA bit and it is not really expected that there
+	 * would be concurrent hardware modifications to the PTE.
+	 */
+	ptl = pte_lockptr(mm, pmd);
+	spin_lock(ptl);
+	if (unlikely(!pte_same(*ptep, pte))) {
+		pte_unmap_unlock(ptep, ptl);
+		goto out;
+	}
+
+	pte = pte_mknonnuma(pte);
+	set_pte_at(mm, addr, ptep, pte);
+	update_mmu_cache(vma, addr, ptep);
+
+	page = vm_normal_page(vma, addr, pte);
+	if (!page) {
+		pte_unmap_unlock(ptep, ptl);
+		return 0;
+	}
+
+	current_nid = page_to_nid(page);
+	target_nid = numa_migrate_prep(page, vma, addr, current_nid);
+	pte_unmap_unlock(ptep, ptl);
+	if (target_nid == -1) {
+		/*
+		 * Account for the fault against the current node if it not
+		 * being replaced regardless of where the page is located.
+		 */
+		current_nid = numa_node_id();
+		put_page(page);
+		goto out;
+	}
+
+	/* Migrate to the requested node */
+	migrated = migrate_misplaced_page(page, target_nid);
+	if (migrated)
+		current_nid = target_nid;
+
+out:
+	if (current_nid != -1)
+		task_numa_fault(current_nid, 1, migrated);
+	return 0;
+}
+
+/* NUMA hinting page fault entry point for regular pmds */
+#ifdef CONFIG_NUMA_BALANCING
+static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+		     unsigned long addr, pmd_t *pmdp)
+{
+	pmd_t pmd;
+	pte_t *pte, *orig_pte;
+	unsigned long _addr = addr & PMD_MASK;
+	unsigned long offset;
+	spinlock_t *ptl;
+	bool numa = false;
+	int local_nid = numa_node_id();
+
+	spin_lock(&mm->page_table_lock);
+	pmd = *pmdp;
+	if (pmd_numa(pmd)) {
+		set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd));
+		numa = true;
+	}
+	spin_unlock(&mm->page_table_lock);
+
+	if (!numa)
+		return 0;
+
+	/* we're in a page fault so some vma must be in the range */
+	BUG_ON(!vma);
+	BUG_ON(vma->vm_start >= _addr + PMD_SIZE);
+	offset = max(_addr, vma->vm_start) & ~PMD_MASK;
+	VM_BUG_ON(offset >= PMD_SIZE);
+	orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl);
+	pte += offset >> PAGE_SHIFT;
+	for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) {
+		pte_t pteval = *pte;
+		struct page *page;
+		int curr_nid = local_nid;
+		int target_nid;
+		bool migrated;
+		if (!pte_present(pteval))
+			continue;
+		if (!pte_numa(pteval))
+			continue;
+		if (addr >= vma->vm_end) {
+			vma = find_vma(mm, addr);
+			/* there's a pte present so there must be a vma */
+			BUG_ON(!vma);
+			BUG_ON(addr < vma->vm_start);
+		}
+		if (pte_numa(pteval)) {
+			pteval = pte_mknonnuma(pteval);
+			set_pte_at(mm, addr, pte, pteval);
+		}
+		page = vm_normal_page(vma, addr, pteval);
+		if (unlikely(!page))
+			continue;
+		/* only check non-shared pages */
+		if (unlikely(page_mapcount(page) != 1))
+			continue;
+
+		/*
+		 * Note that the NUMA fault is later accounted to either
+		 * the node that is currently running or where the page is
+		 * migrated to.
+		 */
+		curr_nid = local_nid;
+		target_nid = numa_migrate_prep(page, vma, addr,
+					       page_to_nid(page));
+		if (target_nid == -1) {
+			put_page(page);
+			continue;
+		}
+
+		/* Migrate to the requested node */
+		pte_unmap_unlock(pte, ptl);
+		migrated = migrate_misplaced_page(page, target_nid);
+		if (migrated)
+			curr_nid = target_nid;
+		task_numa_fault(curr_nid, 1, migrated);
+
+		pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+	}
+	pte_unmap_unlock(orig_pte, ptl);
+
+	return 0;
+}
+#else
+static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+		     unsigned long addr, pmd_t *pmdp)
+{
+	BUG();
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
 /*
  * These routines also need to handle stuff like marking pages dirty
  * and/or accessed for architectures that don't do it in hardware (most
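
The control flow that do_numa_page() adds above amounts to: clear the _PAGE_NUMA marker, ask the placement policy for a target node, migrate the page if it is misplaced, and account the fault to whichever node ends up serving it. The standalone sketch below models that flow in plain C; numa_policy_misplaced(), try_migrate() and record_numa_fault() are hypothetical stand-ins for mpol_misplaced(), migrate_misplaced_page() and task_numa_fault(), and the page/node types are simplified.

/* Userspace model of the pte-level NUMA hinting fault handled above.
 * All helpers here are hypothetical stand-ins for the kernel functions. */
#include <stdbool.h>
#include <stdio.h>

struct fake_page { int nid; };		/* node the page currently lives on */

static int this_node = 0;		/* node the faulting task runs on */

/* stand-in for mpol_misplaced(): -1 means "leave the page where it is" */
static int numa_policy_misplaced(const struct fake_page *page)
{
	return page->nid == this_node ? -1 : this_node;
}

/* stand-in for migrate_misplaced_page() */
static bool try_migrate(struct fake_page *page, int target_nid)
{
	page->nid = target_nid;
	return true;
}

/* stand-in for task_numa_fault() */
static void record_numa_fault(int nid, bool migrated)
{
	printf("fault accounted to node %d (migrated=%d)\n", nid, migrated);
}

static void numa_hinting_fault(struct fake_page *page)
{
	bool migrated = false;
	int curr_nid = page->nid;
	int target_nid = numa_policy_misplaced(page);

	if (target_nid == -1) {
		/* page stays put: charge the fault to the local node */
		curr_nid = this_node;
	} else {
		migrated = try_migrate(page, target_nid);
		if (migrated)
			curr_nid = target_nid;
	}
	record_numa_fault(curr_nid, migrated);
}

int main(void)
{
	struct fake_page local = { .nid = 0 }, remote = { .nid = 1 };

	numa_hinting_fault(&local);	/* already local: no migration */
	numa_hinting_fault(&remote);	/* misplaced: migrated to node 0 */
	return 0;
}
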
@@ -3450,6 +3631,9 @@ int handle_pte_fault(struct mm_struct *mm,
 					pte, pmd, flags, entry);
 	}
 
+	if (pte_numa(entry))
+		return do_numa_page(mm, vma, address, entry, pte, pmd);
+
 	ptl = pte_lockptr(mm, pmd);
 	spin_lock(ptl);
 	if (unlikely(!pte_same(*pte, entry)))
@@ -3520,8 +3704,11 @@ retry:
 	if (pmd_trans_huge(orig_pmd)) {
 		unsigned int dirty = flags & FAULT_FLAG_WRITE;
 
-		if (dirty && !pmd_write(orig_pmd) &&
-		    !pmd_trans_splitting(orig_pmd)) {
+		if (pmd_numa(orig_pmd))
+			return do_huge_pmd_numa_page(mm, vma, address,
+						     orig_pmd, pmd);
+
+		if (dirty && !pmd_write(orig_pmd)) {
 			ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
 						  orig_pmd);
 			/*
@@ -3536,16 +3723,21 @@ retry:
 			huge_pmd_set_accessed(mm, vma, address, pmd,
 					      orig_pmd, dirty);
 		}
+
 		return 0;
 	}
 	}
 
+	if (pmd_numa(*pmd))
+		return do_pmd_numa_page(mm, vma, address, pmd);
+
 	/*
 	 * Use __pte_alloc instead of pte_alloc_map, because we can't
 	 * run pte_offset_map on the pmd, if an huge pmd could
 	 * materialize from under us from a different thread.
 	 */
-	if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, vma, pmd, address))
+	if (unlikely(pmd_none(*pmd)) &&
+	    unlikely(__pte_alloc(mm, vma, pmd, address)))
 		return VM_FAULT_OOM;
 	/* if an huge pmd materialized from under us just retry later */
 	if (unlikely(pmd_trans_huge(*pmd)))
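
For completeness, the per-pmd scan introduced by do_pmd_numa_page() above boils down to: walk every pte slot covered by one pmd, skip entries that are not present or not NUMA-marked, clear the marker, and run the same hinting-fault/migration logic on each private page. A minimal userspace sketch of that loop is given below; the fake_pte type, handle_one_pte() and scan_pmd() are illustrative assumptions that mirror only the structure of the kernel loop.

/* Userspace sketch of the do_pmd_numa_page() scan above. All types and
 * helpers are hypothetical and exist only to mirror the loop structure. */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define PTRS_PER_PMD_SKETCH 512		/* 512 ptes per pmd on x86-64 */

struct fake_pte {
	bool present;
	bool numa;	/* models the _PAGE_NUMA marker */
	int nid;	/* node of the mapped page */
};

/* stand-in for the pte_mknonnuma() + numa_migrate_prep()/migration path */
static void handle_one_pte(struct fake_pte *pte, int local_nid)
{
	pte->numa = false;			/* clear the NUMA marker */
	if (pte->nid != local_nid) {
		pte->nid = local_nid;		/* "migrate" the page */
		printf("migrated one page to node %d\n", local_nid);
	}
}

static void scan_pmd(struct fake_pte *ptes, size_t nr, int local_nid)
{
	for (size_t i = 0; i < nr; i++) {
		if (!ptes[i].present || !ptes[i].numa)
			continue;
		handle_one_pte(&ptes[i], local_nid);
	}
}

int main(void)
{
	struct fake_pte ptes[PTRS_PER_PMD_SKETCH] = {
		[0] = { .present = true, .numa = true, .nid = 1 },
		[7] = { .present = true, .numa = true, .nid = 0 },
	};

	scan_pmd(ptes, PTRS_PER_PMD_SKETCH, 0);
	return 0;
}
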