Diffstat (limited to 'mm/memory.c')
-rw-r--r--   mm/memory.c   198
1 file changed, 195 insertions(+), 3 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index db2e9e797a05..e6a3b933517e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -57,6 +57,7 @@
 #include <linux/swapops.h>
 #include <linux/elf.h>
 #include <linux/gfp.h>
+#include <linux/migrate.h>
 
 #include <asm/io.h>
 #include <asm/pgalloc.h>
@@ -1503,6 +1504,8 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 		page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
 		goto out;
 	}
+	if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
+		goto no_page_table;
 	if (pmd_trans_huge(*pmd)) {
 		if (flags & FOLL_SPLIT) {
 			split_huge_page_pmd(vma, address, pmd);
@@ -1532,6 +1535,8 @@ split_fallthrough:
 	pte = *ptep;
 	if (!pte_present(pte))
 		goto no_page;
+	if ((flags & FOLL_NUMA) && pte_numa(pte))
+		goto no_page;
 	if ((flags & FOLL_WRITE) && !pte_write(pte))
 		goto unlock;
 
@@ -1683,6 +1688,19 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			(VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
 	vm_flags &= (gup_flags & FOLL_FORCE) ?
 			(VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
+
+	/*
+	 * If FOLL_FORCE and FOLL_NUMA are both set, handle_mm_fault
+	 * would be called on PROT_NONE ranges. We must never invoke
+	 * handle_mm_fault on PROT_NONE ranges or the NUMA hinting
+	 * page faults would unprotect the PROT_NONE ranges if
+	 * _PAGE_NUMA and _PAGE_PROTNONE are sharing the same pte/pmd
+	 * bitflag. So to avoid that, don't set FOLL_NUMA if
+	 * FOLL_FORCE is set.
+	 */
+	if (!(gup_flags & FOLL_FORCE))
+		gup_flags |= FOLL_NUMA;
+
 	i = 0;
 
 	do {
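Note on the hunk above: because FOLL_NUMA is now set for every get_user_pages() caller that does not pass FOLL_FORCE, follow_page() reports NUMA-hinting entries as missing and __get_user_pages() falls back to handle_mm_fault(), which services the hinting fault before the lookup is retried. A minimal sketch of that fallback, assuming the 3.8-era follow_page()/handle_mm_fault() signatures; example_gup_one_page() is an invented name and the error handling is much simpler than the real GUP loop:

/*
 * Illustrative only, not part of the commit: resolving one user page
 * once FOLL_NUMA is in effect.  follow_page() returns NULL for
 * pte_numa()/pmd_numa() entries, so the loop faults the address in,
 * which ends up in do_numa_page()/do_pmd_numa_page() and clears the
 * hint before the lookup is retried.
 */
static long example_gup_one_page(struct mm_struct *mm,
				 struct vm_area_struct *vma,
				 unsigned long addr, unsigned int gup_flags,
				 struct page **pagep)
{
	struct page *page;

	while (!(page = follow_page(vma, addr, gup_flags | FOLL_GET))) {
		unsigned int fault_flags =
			(gup_flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0;
		int ret = handle_mm_fault(mm, vma, addr, fault_flags);

		if (ret & VM_FAULT_ERROR)
			return -EFAULT;	/* the real loop maps errors per flag */
	}
	*pagep = page;	/* caller owns the reference taken by FOLL_GET */
	return 1;
}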
@@ -3412,6 +3430,169 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
 }
 
+int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
+				unsigned long addr, int current_nid)
+{
+	get_page(page);
+
+	count_vm_numa_event(NUMA_HINT_FAULTS);
+	if (current_nid == numa_node_id())
+		count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
+
+	return mpol_misplaced(page, vma, addr);
+}
+
+int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+		   unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd)
+{
+	struct page *page = NULL;
+	spinlock_t *ptl;
+	int current_nid = -1;
+	int target_nid;
+	bool migrated = false;
+
+	/*
+	 * The "pte" at this point cannot be used safely without
+	 * validation through pte_unmap_same(). It's of NUMA type but
+	 * the pfn may be screwed if the read is non atomic.
+	 *
+	 * ptep_modify_prot_start is not called as this is clearing
+	 * the _PAGE_NUMA bit and it is not really expected that there
+	 * would be concurrent hardware modifications to the PTE.
+	 */
+	ptl = pte_lockptr(mm, pmd);
+	spin_lock(ptl);
+	if (unlikely(!pte_same(*ptep, pte))) {
+		pte_unmap_unlock(ptep, ptl);
+		goto out;
+	}
+
+	pte = pte_mknonnuma(pte);
+	set_pte_at(mm, addr, ptep, pte);
+	update_mmu_cache(vma, addr, ptep);
+
+	page = vm_normal_page(vma, addr, pte);
+	if (!page) {
+		pte_unmap_unlock(ptep, ptl);
+		return 0;
+	}
+
+	current_nid = page_to_nid(page);
+	target_nid = numa_migrate_prep(page, vma, addr, current_nid);
+	pte_unmap_unlock(ptep, ptl);
+	if (target_nid == -1) {
+		/*
+		 * Account for the fault against the current node if it not
+		 * being replaced regardless of where the page is located.
+		 */
+		current_nid = numa_node_id();
+		put_page(page);
+		goto out;
+	}
+
+	/* Migrate to the requested node */
+	migrated = migrate_misplaced_page(page, target_nid);
+	if (migrated)
+		current_nid = target_nid;
+
+out:
+	if (current_nid != -1)
+		task_numa_fault(current_nid, 1, migrated);
+	return 0;
+}
+
+/* NUMA hinting page fault entry point for regular pmds */
+#ifdef CONFIG_NUMA_BALANCING
+static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+		     unsigned long addr, pmd_t *pmdp)
+{
+	pmd_t pmd;
+	pte_t *pte, *orig_pte;
+	unsigned long _addr = addr & PMD_MASK;
+	unsigned long offset;
+	spinlock_t *ptl;
+	bool numa = false;
+	int local_nid = numa_node_id();
+
+	spin_lock(&mm->page_table_lock);
+	pmd = *pmdp;
+	if (pmd_numa(pmd)) {
+		set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd));
+		numa = true;
+	}
+	spin_unlock(&mm->page_table_lock);
+
+	if (!numa)
+		return 0;
+
+	/* we're in a page fault so some vma must be in the range */
+	BUG_ON(!vma);
+	BUG_ON(vma->vm_start >= _addr + PMD_SIZE);
+	offset = max(_addr, vma->vm_start) & ~PMD_MASK;
+	VM_BUG_ON(offset >= PMD_SIZE);
+	orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl);
+	pte += offset >> PAGE_SHIFT;
+	for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) {
+		pte_t pteval = *pte;
+		struct page *page;
+		int curr_nid = local_nid;
+		int target_nid;
+		bool migrated;
+		if (!pte_present(pteval))
+			continue;
+		if (!pte_numa(pteval))
+			continue;
+		if (addr >= vma->vm_end) {
+			vma = find_vma(mm, addr);
+			/* there's a pte present so there must be a vma */
+			BUG_ON(!vma);
+			BUG_ON(addr < vma->vm_start);
+		}
+		if (pte_numa(pteval)) {
+			pteval = pte_mknonnuma(pteval);
+			set_pte_at(mm, addr, pte, pteval);
+		}
+		page = vm_normal_page(vma, addr, pteval);
+		if (unlikely(!page))
+			continue;
+		/* only check non-shared pages */
+		if (unlikely(page_mapcount(page) != 1))
+			continue;
+
+		/*
+		 * Note that the NUMA fault is later accounted to either
+		 * the node that is currently running or where the page is
+		 * migrated to.
+		 */
+		curr_nid = local_nid;
+		target_nid = numa_migrate_prep(page, vma, addr,
+					       page_to_nid(page));
+		if (target_nid == -1) {
+			put_page(page);
+			continue;
+		}
+
+		/* Migrate to the requested node */
+		pte_unmap_unlock(pte, ptl);
+		migrated = migrate_misplaced_page(page, target_nid);
+		if (migrated)
+			curr_nid = target_nid;
+		task_numa_fault(curr_nid, 1, migrated);
+
+		pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+	}
+	pte_unmap_unlock(orig_pte, ptl);
+
+	return 0;
+}
+#else
+static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+		     unsigned long addr, pmd_t *pmdp)
+{
+	BUG();
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
 /*
  * These routines also need to handle stuff like marking pages dirty
  * and/or accessed for architectures that don't do it in hardware (most
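The new helpers above lean on pte_numa(), pte_mknonnuma(), pmd_numa() and pmd_mknonnuma(), which are not defined in this file. A hedged sketch of what the pte variants boil down to, assuming an x86-style architecture where _PAGE_NUMA shares a bit with _PAGE_PROTNONE and pte_flags()/pte_set_flags()/pte_clear_flags() accessors are available; the real definitions live in include/asm-generic/pgtable.h and the arch headers, and the example_ prefix marks these as approximations rather than copies:

/* A NUMA-hinting pte has _PAGE_NUMA set while _PAGE_PRESENT is clear,
 * so hardware faults on access, yet the kernel can still tell it apart
 * from a genuinely non-present (none/swap) entry. */
static inline int example_pte_numa(pte_t pte)
{
	return (pte_flags(pte) & (_PAGE_NUMA | _PAGE_PRESENT)) == _PAGE_NUMA;
}

/* Dropping the hint makes the pte ordinarily present again; this is
 * what do_numa_page()/do_pmd_numa_page() install via set_pte_at(). */
static inline pte_t example_pte_mknonnuma(pte_t pte)
{
	pte = pte_clear_flags(pte, _PAGE_NUMA);
	return pte_set_flags(pte, _PAGE_PRESENT | _PAGE_ACCESSED);
}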
@@ -3450,6 +3631,9 @@ int handle_pte_fault(struct mm_struct *mm,
 					pte, pmd, flags, entry);
 	}
 
+	if (pte_numa(entry))
+		return do_numa_page(mm, vma, address, entry, pte, pmd);
+
 	ptl = pte_lockptr(mm, pmd);
 	spin_lock(ptl);
 	if (unlikely(!pte_same(*pte, entry)))
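The placement of the new check matters: on setups where _PAGE_NUMA shares the PROT_NONE bit (as the comment in the __get_user_pages() hunk notes), pte_numa() entries still satisfy pte_present(), so they are only reached after the swap/file paths, and they are dispatched before the page table lock is taken here because do_numa_page() locks and re-validates the pte itself. A condensed, illustrative view of the resulting order; the branches this commit does not touch are reduced to comments and example_pte_fault_dispatch() is an invented name:

static int example_pte_fault_dispatch(struct mm_struct *mm,
				      struct vm_area_struct *vma,
				      unsigned long address,
				      pte_t *pte, pmd_t *pmd,
				      unsigned int flags)
{
	pte_t entry = *pte;

	if (!pte_present(entry)) {
		/* anonymous, file-backed, nonlinear and swap faults are
		 * handled here exactly as before this commit */
		return 0;
	}

	if (pte_numa(entry))
		/* new: NUMA hinting fault, dispatched without the ptl;
		 * do_numa_page() takes the lock and re-checks pte_same() */
		return do_numa_page(mm, vma, address, entry, pte, pmd);

	/* remaining cases (COW, dirty/accessed updates) run under the ptl */
	return 0;
}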
@@ -3520,8 +3704,11 @@ retry:
 		if (pmd_trans_huge(orig_pmd)) {
 			unsigned int dirty = flags & FAULT_FLAG_WRITE;
 
-			if (dirty && !pmd_write(orig_pmd) &&
-			    !pmd_trans_splitting(orig_pmd)) {
+			if (pmd_numa(orig_pmd))
+				return do_huge_pmd_numa_page(mm, vma, address,
+							     orig_pmd, pmd);
+
+			if (dirty && !pmd_write(orig_pmd)) {
 				ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
 							  orig_pmd);
 				/*
@@ -3536,16 +3723,21 @@ retry:
 				huge_pmd_set_accessed(mm, vma, address, pmd,
 						      orig_pmd, dirty);
 			}
+
 			return 0;
 		}
 	}
 
+	if (pmd_numa(*pmd))
+		return do_pmd_numa_page(mm, vma, address, pmd);
+
 	/*
 	 * Use __pte_alloc instead of pte_alloc_map, because we can't
 	 * run pte_offset_map on the pmd, if an huge pmd could
 	 * materialize from under us from a different thread.
 	 */
-	if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, vma, pmd, address))
+	if (unlikely(pmd_none(*pmd)) &&
+	    unlikely(__pte_alloc(mm, vma, pmd, address)))
 		return VM_FAULT_OOM;
 	/* if an huge pmd materialized from under us just retry later */
 	if (unlikely(pmd_trans_huge(*pmd)))
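At pmd level the routing added by the last two hunks mirrors the pte case at both page sizes. A condensed, illustrative summary of the checks handle_mm_fault() now performs before falling through to the pte path; example_pmd_numa_route() is an invented name and the untouched paths are reduced to comments:

static int example_pmd_numa_route(struct mm_struct *mm,
				  struct vm_area_struct *vma,
				  unsigned long address, pmd_t *pmd)
{
	pmd_t orig_pmd = *pmd;

	if (pmd_trans_huge(orig_pmd) && pmd_numa(orig_pmd))
		/* one hinting fault covers the whole huge mapping */
		return do_huge_pmd_numa_page(mm, vma, address,
					     orig_pmd, pmd);

	if (pmd_numa(*pmd))
		/* regular pmd marked by the NUMA scanner: walk its ptes */
		return do_pmd_numa_page(mm, vma, address, pmd);

	/* otherwise fall through to __pte_alloc()/handle_pte_fault() */
	return 0;
}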