Diffstat (limited to 'mm/memory.c')
-rw-r--r--	mm/memory.c	261
 1 file changed, 224 insertions(+), 37 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index fb135ba4aba9..bb1369f7b9b4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -57,6 +57,8 @@
 #include <linux/swapops.h>
 #include <linux/elf.h>
 #include <linux/gfp.h>
+#include <linux/migrate.h>
+#include <linux/string.h>
 
 #include <asm/io.h>
 #include <asm/pgalloc.h>
@@ -182,10 +184,14 @@ static int tlb_next_batch(struct mmu_gather *tlb)
 		return 1;
 	}
 
+	if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
+		return 0;
+
 	batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
 	if (!batch)
 		return 0;
 
+	tlb->batch_count++;
 	batch->next = NULL;
 	batch->nr   = 0;
 	batch->max  = MAX_GATHER_BATCH;
@@ -214,6 +220,7 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm)
 	tlb->local.nr   = 0;
 	tlb->local.max  = ARRAY_SIZE(tlb->__pages);
 	tlb->active     = &tlb->local;
+	tlb->batch_count = 0;
 
 #ifdef CONFIG_HAVE_RCU_TABLE_FREE
 	tlb->batch = NULL;
@@ -717,20 +724,6 @@ static inline bool is_cow_mapping(vm_flags_t flags)
 	return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
 }
 
-#ifndef is_zero_pfn
-static inline int is_zero_pfn(unsigned long pfn)
-{
-	return pfn == zero_pfn;
-}
-#endif
-
-#ifndef my_zero_pfn
-static inline unsigned long my_zero_pfn(unsigned long addr)
-{
-	return zero_pfn;
-}
-#endif
-
 /*
  * vm_normal_page -- This function gets the "struct page" associated with a pte.
  *
@@ -1250,7 +1243,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
 			BUG();
 		}
 #endif
-		split_huge_page_pmd(vma->vm_mm, pmd);
+		split_huge_page_pmd(vma, addr, pmd);
 	} else if (zap_huge_pmd(tlb, vma, pmd, addr))
 		goto next;
 	/* fall through */
@@ -1517,9 +1510,11 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 		page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
 		goto out;
 	}
+	if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
+		goto no_page_table;
 	if (pmd_trans_huge(*pmd)) {
 		if (flags & FOLL_SPLIT) {
-			split_huge_page_pmd(mm, pmd);
+			split_huge_page_pmd(vma, address, pmd);
 			goto split_fallthrough;
 		}
 		spin_lock(&mm->page_table_lock);
@@ -1546,6 +1541,8 @@ split_fallthrough:
 	pte = *ptep;
 	if (!pte_present(pte))
 		goto no_page;
+	if ((flags & FOLL_NUMA) && pte_numa(pte))
+		goto no_page;
 	if ((flags & FOLL_WRITE) && !pte_write(pte))
 		goto unlock;
 
@@ -1697,6 +1694,19 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			(VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
 	vm_flags &= (gup_flags & FOLL_FORCE) ?
 			(VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
+
+	/*
+	 * If FOLL_FORCE and FOLL_NUMA are both set, handle_mm_fault
+	 * would be called on PROT_NONE ranges. We must never invoke
+	 * handle_mm_fault on PROT_NONE ranges or the NUMA hinting
+	 * page faults would unprotect the PROT_NONE ranges if
+	 * _PAGE_NUMA and _PAGE_PROTNONE are sharing the same pte/pmd
+	 * bitflag. So to avoid that, don't set FOLL_NUMA if
+	 * FOLL_FORCE is set.
+	 */
+	if (!(gup_flags & FOLL_FORCE))
+		gup_flags |= FOLL_NUMA;
+
 	i = 0;
 
 	do {
@@ -2527,9 +2537,8 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	int ret = 0;
 	int page_mkwrite = 0;
 	struct page *dirty_page = NULL;
-	unsigned long mmun_start;	/* For mmu_notifiers */
-	unsigned long mmun_end;		/* For mmu_notifiers */
-	bool mmun_called = false;	/* For mmu_notifiers */
+	unsigned long mmun_start = 0;	/* For mmu_notifiers */
+	unsigned long mmun_end = 0;	/* For mmu_notifiers */
 
 	old_page = vm_normal_page(vma, address, orig_pte);
 	if (!old_page) {
@@ -2708,8 +2717,7 @@ gotten:
 		goto oom_free_new;
 
 	mmun_start  = address & PAGE_MASK;
-	mmun_end    = (address & PAGE_MASK) + PAGE_SIZE;
-	mmun_called = true;
+	mmun_end    = mmun_start + PAGE_SIZE;
 	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 
 	/*
@@ -2778,7 +2786,7 @@ gotten:
 		page_cache_release(new_page);
 unlock:
 	pte_unmap_unlock(page_table, ptl);
-	if (mmun_called)
+	if (mmun_end > mmun_start)
 		mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 	if (old_page) {
 		/*
@@ -2796,13 +2804,8 @@ unlock:
 oom_free_new:
 	page_cache_release(new_page);
 oom:
-	if (old_page) {
-		if (page_mkwrite) {
-			unlock_page(old_page);
-			page_cache_release(old_page);
-		}
+	if (old_page)
 		page_cache_release(old_page);
-	}
 	return VM_FAULT_OOM;
 
 unwritable_page:
@@ -3433,6 +3436,170 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
 }
 
+int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
+				unsigned long addr, int current_nid)
+{
+	get_page(page);
+
+	count_vm_numa_event(NUMA_HINT_FAULTS);
+	if (current_nid == numa_node_id())
+		count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
+
+	return mpol_misplaced(page, vma, addr);
+}
+
+int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+		   unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd)
+{
+	struct page *page = NULL;
+	spinlock_t *ptl;
+	int current_nid = -1;
+	int target_nid;
+	bool migrated = false;
+
+	/*
+	 * The "pte" at this point cannot be used safely without
+	 * validation through pte_unmap_same(). It's of NUMA type but
+	 * the pfn may be screwed if the read is non atomic.
+	 *
+	 * ptep_modify_prot_start is not called as this is clearing
+	 * the _PAGE_NUMA bit and it is not really expected that there
+	 * would be concurrent hardware modifications to the PTE.
+	 */
+	ptl = pte_lockptr(mm, pmd);
+	spin_lock(ptl);
+	if (unlikely(!pte_same(*ptep, pte))) {
+		pte_unmap_unlock(ptep, ptl);
+		goto out;
+	}
+
+	pte = pte_mknonnuma(pte);
+	set_pte_at(mm, addr, ptep, pte);
+	update_mmu_cache(vma, addr, ptep);
+
+	page = vm_normal_page(vma, addr, pte);
+	if (!page) {
+		pte_unmap_unlock(ptep, ptl);
+		return 0;
+	}
+
+	current_nid = page_to_nid(page);
+	target_nid = numa_migrate_prep(page, vma, addr, current_nid);
+	pte_unmap_unlock(ptep, ptl);
+	if (target_nid == -1) {
+		/*
+		 * Account for the fault against the current node if it not
+		 * being replaced regardless of where the page is located.
+		 */
+		current_nid = numa_node_id();
+		put_page(page);
+		goto out;
+	}
+
+	/* Migrate to the requested node */
+	migrated = migrate_misplaced_page(page, target_nid);
+	if (migrated)
+		current_nid = target_nid;
+
+out:
+	if (current_nid != -1)
+		task_numa_fault(current_nid, 1, migrated);
+	return 0;
+}
+
+/* NUMA hinting page fault entry point for regular pmds */
+#ifdef CONFIG_NUMA_BALANCING
+static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+		     unsigned long addr, pmd_t *pmdp)
+{
+	pmd_t pmd;
+	pte_t *pte, *orig_pte;
+	unsigned long _addr = addr & PMD_MASK;
+	unsigned long offset;
+	spinlock_t *ptl;
+	bool numa = false;
+	int local_nid = numa_node_id();
+
+	spin_lock(&mm->page_table_lock);
+	pmd = *pmdp;
+	if (pmd_numa(pmd)) {
+		set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd));
+		numa = true;
+	}
+	spin_unlock(&mm->page_table_lock);
+
+	if (!numa)
+		return 0;
+
+	/* we're in a page fault so some vma must be in the range */
+	BUG_ON(!vma);
+	BUG_ON(vma->vm_start >= _addr + PMD_SIZE);
+	offset = max(_addr, vma->vm_start) & ~PMD_MASK;
+	VM_BUG_ON(offset >= PMD_SIZE);
+	orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl);
+	pte += offset >> PAGE_SHIFT;
+	for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) {
+		pte_t pteval = *pte;
+		struct page *page;
+		int curr_nid = local_nid;
+		int target_nid;
+		bool migrated;
+		if (!pte_present(pteval))
+			continue;
+		if (!pte_numa(pteval))
+			continue;
+		if (addr >= vma->vm_end) {
+			vma = find_vma(mm, addr);
+			/* there's a pte present so there must be a vma */
+			BUG_ON(!vma);
+			BUG_ON(addr < vma->vm_start);
+		}
+		if (pte_numa(pteval)) {
+			pteval = pte_mknonnuma(pteval);
+			set_pte_at(mm, addr, pte, pteval);
+		}
+		page = vm_normal_page(vma, addr, pteval);
+		if (unlikely(!page))
+			continue;
+		/* only check non-shared pages */
+		if (unlikely(page_mapcount(page) != 1))
+			continue;
+
+		/*
+		 * Note that the NUMA fault is later accounted to either
+		 * the node that is currently running or where the page is
+		 * migrated to.
+		 */
+		curr_nid = local_nid;
+		target_nid = numa_migrate_prep(page, vma, addr,
+					       page_to_nid(page));
+		if (target_nid == -1) {
+			put_page(page);
+			continue;
+		}
+
+		/* Migrate to the requested node */
+		pte_unmap_unlock(pte, ptl);
+		migrated = migrate_misplaced_page(page, target_nid);
+		if (migrated)
+			curr_nid = target_nid;
+		task_numa_fault(curr_nid, 1, migrated);
+
+		pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+	}
+	pte_unmap_unlock(orig_pte, ptl);
+
+	return 0;
+}
+#else
+static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+		     unsigned long addr, pmd_t *pmdp)
+{
+	BUG();
+	return 0;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
 /*
  * These routines also need to handle stuff like marking pages dirty
  * and/or accessed for architectures that don't do it in hardware (most
@@ -3471,6 +3638,9 @@ int handle_pte_fault(struct mm_struct *mm,
 					pte, pmd, flags, entry);
 	}
 
+	if (pte_numa(entry))
+		return do_numa_page(mm, vma, address, entry, pte, pmd);
+
 	ptl = pte_lockptr(mm, pmd);
 	spin_lock(ptl);
 	if (unlikely(!pte_same(*pte, entry)))
@@ -3539,9 +3709,21 @@ retry:
 
 		barrier();
 		if (pmd_trans_huge(orig_pmd)) {
-			if (flags & FAULT_FLAG_WRITE &&
-			    !pmd_write(orig_pmd) &&
-			    !pmd_trans_splitting(orig_pmd)) {
+			unsigned int dirty = flags & FAULT_FLAG_WRITE;
+
+			/*
+			 * If the pmd is splitting, return and retry the
+			 * the fault.  Alternative: wait until the split
+			 * is done, and goto retry.
+			 */
+			if (pmd_trans_splitting(orig_pmd))
+				return 0;
+
+			if (pmd_numa(orig_pmd))
+				return do_huge_pmd_numa_page(mm, vma, address,
+							     orig_pmd, pmd);
+
+			if (dirty && !pmd_write(orig_pmd)) {
 				ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
 							  orig_pmd);
 				/*
@@ -3552,17 +3734,25 @@ retry:
 				if (unlikely(ret & VM_FAULT_OOM))
 					goto retry;
 				return ret;
+			} else {
+				huge_pmd_set_accessed(mm, vma, address, pmd,
+						      orig_pmd, dirty);
 			}
+
 			return 0;
 		}
 	}
 
+	if (pmd_numa(*pmd))
+		return do_pmd_numa_page(mm, vma, address, pmd);
+
 	/*
 	 * Use __pte_alloc instead of pte_alloc_map, because we can't
 	 * run pte_offset_map on the pmd, if an huge pmd could
 	 * materialize from under us from a different thread.
 	 */
-	if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, vma, pmd, address))
+	if (unlikely(pmd_none(*pmd)) &&
+	    unlikely(__pte_alloc(mm, vma, pmd, address)))
 		return VM_FAULT_OOM;
 	/* if an huge pmd materialized from under us just retry later */
 	if (unlikely(pmd_trans_huge(*pmd)))
@@ -3942,15 +4132,12 @@ void print_vma_addr(char *prefix, unsigned long ip)
 		struct file *f = vma->vm_file;
 		char *buf = (char *)__get_free_page(GFP_KERNEL);
 		if (buf) {
-			char *p, *s;
+			char *p;
 
 			p = d_path(&f->f_path, buf, PAGE_SIZE);
 			if (IS_ERR(p))
 				p = "?";
-			s = strrchr(p, '/');
-			if (s)
-				p = s+1;
-			printk("%s%s[%lx+%lx]", prefix, p,
+			printk("%s%s[%lx+%lx]", prefix, kbasename(p),
 					vma->vm_start,
 					vma->vm_end - vma->vm_start);
 			free_page((unsigned long)buf);