Diffstat (limited to 'mm/memory.c')
-rw-r--r--	mm/memory.c	261
 1 file changed, 224 insertions(+), 37 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index fb135ba4aba9..bb1369f7b9b4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -57,6 +57,8 @@
 #include <linux/swapops.h>
 #include <linux/elf.h>
 #include <linux/gfp.h>
+#include <linux/migrate.h>
+#include <linux/string.h>
 
 #include <asm/io.h>
 #include <asm/pgalloc.h>
@@ -182,10 +184,14 @@ static int tlb_next_batch(struct mmu_gather *tlb)
 		return 1;
 	}
 
+	if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
+		return 0;
+
 	batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
 	if (!batch)
 		return 0;
 
+	tlb->batch_count++;
 	batch->next = NULL;
 	batch->nr   = 0;
 	batch->max  = MAX_GATHER_BATCH;
@@ -214,6 +220,7 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm)
 	tlb->local.nr   = 0;
 	tlb->local.max  = ARRAY_SIZE(tlb->__pages);
 	tlb->active     = &tlb->local;
+	tlb->batch_count = 0;
 
 #ifdef CONFIG_HAVE_RCU_TABLE_FREE
 	tlb->batch = NULL;
@@ -717,20 +724,6 @@ static inline bool is_cow_mapping(vm_flags_t flags)
 	return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
 }
 
-#ifndef is_zero_pfn
-static inline int is_zero_pfn(unsigned long pfn)
-{
-	return pfn == zero_pfn;
-}
-#endif
-
-#ifndef my_zero_pfn
-static inline unsigned long my_zero_pfn(unsigned long addr)
-{
-	return zero_pfn;
-}
-#endif
-
 /*
  * vm_normal_page -- This function gets the "struct page" associated with a pte.
  *
@@ -1250,7 +1243,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
 			BUG();
 		}
 #endif
-		split_huge_page_pmd(vma->vm_mm, pmd);
+		split_huge_page_pmd(vma, addr, pmd);
 	} else if (zap_huge_pmd(tlb, vma, pmd, addr))
 		goto next;
 	/* fall through */
@@ -1517,9 +1510,11 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 		page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
 		goto out;
 	}
+	if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
+		goto no_page_table;
 	if (pmd_trans_huge(*pmd)) {
 		if (flags & FOLL_SPLIT) {
-			split_huge_page_pmd(mm, pmd);
+			split_huge_page_pmd(vma, address, pmd);
 			goto split_fallthrough;
 		}
 		spin_lock(&mm->page_table_lock);
@@ -1546,6 +1541,8 @@ split_fallthrough:
 	pte = *ptep;
 	if (!pte_present(pte))
 		goto no_page;
+	if ((flags & FOLL_NUMA) && pte_numa(pte))
+		goto no_page;
 	if ((flags & FOLL_WRITE) && !pte_write(pte))
 		goto unlock;
 
@@ -1697,6 +1694,19 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			(VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
 	vm_flags &= (gup_flags & FOLL_FORCE) ?
 			(VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
+
+	/*
+	 * If FOLL_FORCE and FOLL_NUMA are both set, handle_mm_fault
+	 * would be called on PROT_NONE ranges. We must never invoke
+	 * handle_mm_fault on PROT_NONE ranges or the NUMA hinting
+	 * page faults would unprotect the PROT_NONE ranges if
+	 * _PAGE_NUMA and _PAGE_PROTNONE are sharing the same pte/pmd
+	 * bitflag. So to avoid that, don't set FOLL_NUMA if
+	 * FOLL_FORCE is set.
+	 */
+	if (!(gup_flags & FOLL_FORCE))
+		gup_flags |= FOLL_NUMA;
+
 	i = 0;
 
 	do {
@@ -2527,9 +2537,8 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	int ret = 0;
 	int page_mkwrite = 0;
 	struct page *dirty_page = NULL;
-	unsigned long mmun_start;	/* For mmu_notifiers */
-	unsigned long mmun_end;		/* For mmu_notifiers */
-	bool mmun_called = false;	/* For mmu_notifiers */
+	unsigned long mmun_start = 0;	/* For mmu_notifiers */
+	unsigned long mmun_end = 0;	/* For mmu_notifiers */
 
 	old_page = vm_normal_page(vma, address, orig_pte);
 	if (!old_page) {
@@ -2708,8 +2717,7 @@ gotten:
 		goto oom_free_new;
 
 	mmun_start  = address & PAGE_MASK;
-	mmun_end    = (address & PAGE_MASK) + PAGE_SIZE;
-	mmun_called = true;
+	mmun_end    = mmun_start + PAGE_SIZE;
 	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 
 	/*
@@ -2778,7 +2786,7 @@ gotten:
 		page_cache_release(new_page);
 unlock:
 	pte_unmap_unlock(page_table, ptl);
-	if (mmun_called)
+	if (mmun_end > mmun_start)
 		mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 	if (old_page) {
 		/*
@@ -2796,13 +2804,8 @@ unlock:
 oom_free_new:
 	page_cache_release(new_page);
 oom:
-	if (old_page) {
-		if (page_mkwrite) {
-			unlock_page(old_page);
-			page_cache_release(old_page);
-		}
+	if (old_page)
 		page_cache_release(old_page);
-	}
 	return VM_FAULT_OOM;
 
 unwritable_page:
@@ -3433,6 +3436,170 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
 }
 
+int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
+				unsigned long addr, int current_nid)
+{
+	get_page(page);
+
+	count_vm_numa_event(NUMA_HINT_FAULTS);
+	if (current_nid == numa_node_id())
+		count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
+
+	return mpol_misplaced(page, vma, addr);
+}
+
+int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+		   unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd)
+{
+	struct page *page = NULL;
+	spinlock_t *ptl;
+	int current_nid = -1;
+	int target_nid;
+	bool migrated = false;
+
+	/*
+	 * The "pte" at this point cannot be used safely without
+	 * validation through pte_unmap_same(). It's of NUMA type but
+	 * the pfn may be screwed if the read is non atomic.
+	 *
+	 * ptep_modify_prot_start is not called as this is clearing
+	 * the _PAGE_NUMA bit and it is not really expected that there
+	 * would be concurrent hardware modifications to the PTE.
+	 */
+	ptl = pte_lockptr(mm, pmd);
+	spin_lock(ptl);
+	if (unlikely(!pte_same(*ptep, pte))) {
+		pte_unmap_unlock(ptep, ptl);
+		goto out;
+	}
+
+	pte = pte_mknonnuma(pte);
+	set_pte_at(mm, addr, ptep, pte);
+	update_mmu_cache(vma, addr, ptep);
+
+	page = vm_normal_page(vma, addr, pte);
+	if (!page) {
+		pte_unmap_unlock(ptep, ptl);
+		return 0;
+	}
+
+	current_nid = page_to_nid(page);
+	target_nid = numa_migrate_prep(page, vma, addr, current_nid);
+	pte_unmap_unlock(ptep, ptl);
+	if (target_nid == -1) {
+		/*
+		 * Account for the fault against the current node if it not
+		 * being replaced regardless of where the page is located.
+		 */
+		current_nid = numa_node_id();
+		put_page(page);
+		goto out;
+	}
+
+	/* Migrate to the requested node */
+	migrated = migrate_misplaced_page(page, target_nid);
+	if (migrated)
+		current_nid = target_nid;
+
+out:
+	if (current_nid != -1)
+		task_numa_fault(current_nid, 1, migrated);
+	return 0;
+}
+
+/* NUMA hinting page fault entry point for regular pmds */
+#ifdef CONFIG_NUMA_BALANCING
+static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+		     unsigned long addr, pmd_t *pmdp)
+{
+	pmd_t pmd;
+	pte_t *pte, *orig_pte;
+	unsigned long _addr = addr & PMD_MASK;
+	unsigned long offset;
+	spinlock_t *ptl;
+	bool numa = false;
+	int local_nid = numa_node_id();
+
+	spin_lock(&mm->page_table_lock);
+	pmd = *pmdp;
+	if (pmd_numa(pmd)) {
+		set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd));
+		numa = true;
+	}
+	spin_unlock(&mm->page_table_lock);
+
+	if (!numa)
+		return 0;
+
+	/* we're in a page fault so some vma must be in the range */
+	BUG_ON(!vma);
+	BUG_ON(vma->vm_start >= _addr + PMD_SIZE);
+	offset = max(_addr, vma->vm_start) & ~PMD_MASK;
+	VM_BUG_ON(offset >= PMD_SIZE);
+	orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl);
+	pte += offset >> PAGE_SHIFT;
+	for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) {
+		pte_t pteval = *pte;
+		struct page *page;
+		int curr_nid = local_nid;
+		int target_nid;
+		bool migrated;
+		if (!pte_present(pteval))
+			continue;
+		if (!pte_numa(pteval))
+			continue;
+		if (addr >= vma->vm_end) {
+			vma = find_vma(mm, addr);
+			/* there's a pte present so there must be a vma */
+			BUG_ON(!vma);
+			BUG_ON(addr < vma->vm_start);
+		}
+		if (pte_numa(pteval)) {
+			pteval = pte_mknonnuma(pteval);
+			set_pte_at(mm, addr, pte, pteval);
+		}
+		page = vm_normal_page(vma, addr, pteval);
+		if (unlikely(!page))
+			continue;
+		/* only check non-shared pages */
+		if (unlikely(page_mapcount(page) != 1))
+			continue;
+
+		/*
+		 * Note that the NUMA fault is later accounted to either
+		 * the node that is currently running or where the page is
+		 * migrated to.
+		 */
+		curr_nid = local_nid;
+		target_nid = numa_migrate_prep(page, vma, addr,
+					       page_to_nid(page));
+		if (target_nid == -1) {
+			put_page(page);
+			continue;
+		}
+
+		/* Migrate to the requested node */
+		pte_unmap_unlock(pte, ptl);
+		migrated = migrate_misplaced_page(page, target_nid);
+		if (migrated)
+			curr_nid = target_nid;
+		task_numa_fault(curr_nid, 1, migrated);
+
+		pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+	}
+	pte_unmap_unlock(orig_pte, ptl);
+
+	return 0;
+}
+#else
+static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+		     unsigned long addr, pmd_t *pmdp)
+{
+	BUG();
+	return 0;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
 /*
  * These routines also need to handle stuff like marking pages dirty
  * and/or accessed for architectures that don't do it in hardware (most
@@ -3471,6 +3638,9 @@ int handle_pte_fault(struct mm_struct *mm,
 					pte, pmd, flags, entry);
 	}
 
+	if (pte_numa(entry))
+		return do_numa_page(mm, vma, address, entry, pte, pmd);
+
 	ptl = pte_lockptr(mm, pmd);
 	spin_lock(ptl);
 	if (unlikely(!pte_same(*pte, entry)))
@@ -3539,9 +3709,21 @@ retry:
 
 		barrier();
 		if (pmd_trans_huge(orig_pmd)) {
-			if (flags & FAULT_FLAG_WRITE &&
-			    !pmd_write(orig_pmd) &&
-			    !pmd_trans_splitting(orig_pmd)) {
+			unsigned int dirty = flags & FAULT_FLAG_WRITE;
+
+			/*
+			 * If the pmd is splitting, return and retry the
+			 * the fault.  Alternative: wait until the split
+			 * is done, and goto retry.
+			 */
+			if (pmd_trans_splitting(orig_pmd))
+				return 0;
+
+			if (pmd_numa(orig_pmd))
+				return do_huge_pmd_numa_page(mm, vma, address,
+							     orig_pmd, pmd);
+
+			if (dirty && !pmd_write(orig_pmd)) {
 				ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
 							  orig_pmd);
 				/*
@@ -3552,17 +3734,25 @@ retry:
 				if (unlikely(ret & VM_FAULT_OOM))
 					goto retry;
 				return ret;
+			} else {
+				huge_pmd_set_accessed(mm, vma, address, pmd,
+						      orig_pmd, dirty);
 			}
+
 			return 0;
 		}
 	}
 
+	if (pmd_numa(*pmd))
+		return do_pmd_numa_page(mm, vma, address, pmd);
+
 	/*
 	 * Use __pte_alloc instead of pte_alloc_map, because we can't
 	 * run pte_offset_map on the pmd, if an huge pmd could
 	 * materialize from under us from a different thread.
 	 */
-	if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, vma, pmd, address))
+	if (unlikely(pmd_none(*pmd)) &&
+	    unlikely(__pte_alloc(mm, vma, pmd, address)))
 		return VM_FAULT_OOM;
 	/* if an huge pmd materialized from under us just retry later */
 	if (unlikely(pmd_trans_huge(*pmd)))
@@ -3942,15 +4132,12 @@ void print_vma_addr(char *prefix, unsigned long ip)
 		struct file *f = vma->vm_file;
 		char *buf = (char *)__get_free_page(GFP_KERNEL);
 		if (buf) {
-			char *p, *s;
+			char *p;
 
 			p = d_path(&f->f_path, buf, PAGE_SIZE);
 			if (IS_ERR(p))
 				p = "?";
-			s = strrchr(p, '/');
-			if (s)
-				p = s+1;
-			printk("%s%s[%lx+%lx]", prefix, p,
+			printk("%s%s[%lx+%lx]", prefix, kbasename(p),
 					vma->vm_start,
 					vma->vm_end - vma->vm_start);
 			free_page((unsigned long)buf);