Diffstat (limited to 'mm/mempolicy.c')
-rw-r--r--	mm/mempolicy.c	277
1 file changed, 128 insertions(+), 149 deletions(-)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 0e0961b8c39c..f1bd23803576 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -471,24 +471,34 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 				unsigned long flags);
 
+struct queue_pages {
+	struct list_head *pagelist;
+	unsigned long flags;
+	nodemask_t *nmask;
+	struct vm_area_struct *prev;
+};
+
 /*
  * Scan through pages checking if pages follow certain conditions,
  * and move them to the pagelist if they do.
  */
-static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
-		unsigned long addr, unsigned long end,
-		const nodemask_t *nodes, unsigned long flags,
-		void *private)
+static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
+			unsigned long end, struct mm_walk *walk)
 {
-	pte_t *orig_pte;
+	struct vm_area_struct *vma = walk->vma;
+	struct page *page;
+	struct queue_pages *qp = walk->private;
+	unsigned long flags = qp->flags;
+	int nid;
 	pte_t *pte;
 	spinlock_t *ptl;
 
-	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
-	do {
-		struct page *page;
-		int nid;
-
+	split_huge_page_pmd(vma, addr, pmd);
+	if (pmd_trans_unstable(pmd))
+		return 0;
+
+	pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+	for (; addr != end; pte++, addr += PAGE_SIZE) {
 		if (!pte_present(*pte))
 			continue;
 		page = vm_normal_page(vma, addr, *pte);
@@ -501,114 +511,46 @@ static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 		if (PageReserved(page))
 			continue;
 		nid = page_to_nid(page);
-		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
+		if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
 			continue;
 
 		if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
-			migrate_page_add(page, private, flags);
-		else
-			break;
-	} while (pte++, addr += PAGE_SIZE, addr != end);
-	pte_unmap_unlock(orig_pte, ptl);
-	return addr != end;
+			migrate_page_add(page, qp->pagelist, flags);
+	}
+	pte_unmap_unlock(pte - 1, ptl);
+	cond_resched();
+	return 0;
 }
 
-static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma,
-		pmd_t *pmd, const nodemask_t *nodes, unsigned long flags,
-				    void *private)
+static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
+			       unsigned long addr, unsigned long end,
+			       struct mm_walk *walk)
 {
 #ifdef CONFIG_HUGETLB_PAGE
+	struct queue_pages *qp = walk->private;
+	unsigned long flags = qp->flags;
 	int nid;
 	struct page *page;
 	spinlock_t *ptl;
 	pte_t entry;
 
-	ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd);
-	entry = huge_ptep_get((pte_t *)pmd);
+	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
+	entry = huge_ptep_get(pte);
 	if (!pte_present(entry))
 		goto unlock;
 	page = pte_page(entry);
 	nid = page_to_nid(page);
-	if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
+	if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
 		goto unlock;
 	/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
 	if (flags & (MPOL_MF_MOVE_ALL) ||
 	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
-		isolate_huge_page(page, private);
+		isolate_huge_page(page, qp->pagelist);
 unlock:
 	spin_unlock(ptl);
 #else
 	BUG();
 #endif
-}
-
-static inline int queue_pages_pmd_range(struct vm_area_struct *vma, pud_t *pud,
-		unsigned long addr, unsigned long end,
-		const nodemask_t *nodes, unsigned long flags,
-		void *private)
-{
-	pmd_t *pmd;
-	unsigned long next;
-
-	pmd = pmd_offset(pud, addr);
-	do {
-		next = pmd_addr_end(addr, end);
-		if (!pmd_present(*pmd))
-			continue;
-		if (pmd_huge(*pmd) && is_vm_hugetlb_page(vma)) {
-			queue_pages_hugetlb_pmd_range(vma, pmd, nodes,
-						flags, private);
-			continue;
-		}
-		split_huge_page_pmd(vma, addr, pmd);
-		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
-			continue;
-		if (queue_pages_pte_range(vma, pmd, addr, next, nodes,
-				    flags, private))
-			return -EIO;
-	} while (pmd++, addr = next, addr != end);
-	return 0;
-}
-
-static inline int queue_pages_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
-		unsigned long addr, unsigned long end,
-		const nodemask_t *nodes, unsigned long flags,
-		void *private)
-{
-	pud_t *pud;
-	unsigned long next;
-
-	pud = pud_offset(pgd, addr);
-	do {
-		next = pud_addr_end(addr, end);
-		if (pud_huge(*pud) && is_vm_hugetlb_page(vma))
-			continue;
-		if (pud_none_or_clear_bad(pud))
-			continue;
-		if (queue_pages_pmd_range(vma, pud, addr, next, nodes,
-				    flags, private))
-			return -EIO;
-	} while (pud++, addr = next, addr != end);
-	return 0;
-}
-
-static inline int queue_pages_pgd_range(struct vm_area_struct *vma,
-		unsigned long addr, unsigned long end,
-		const nodemask_t *nodes, unsigned long flags,
-		void *private)
-{
-	pgd_t *pgd;
-	unsigned long next;
-
-	pgd = pgd_offset(vma->vm_mm, addr);
-	do {
-		next = pgd_addr_end(addr, end);
-		if (pgd_none_or_clear_bad(pgd))
-			continue;
-		if (queue_pages_pud_range(vma, pgd, addr, next, nodes,
-				    flags, private))
-			return -EIO;
-	} while (pgd++, addr = next, addr != end);
 	return 0;
 }
 
@@ -641,6 +583,49 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma,
 }
 #endif /* CONFIG_NUMA_BALANCING */
 
+static int queue_pages_test_walk(unsigned long start, unsigned long end,
+				struct mm_walk *walk)
+{
+	struct vm_area_struct *vma = walk->vma;
+	struct queue_pages *qp = walk->private;
+	unsigned long endvma = vma->vm_end;
+	unsigned long flags = qp->flags;
+
+	if (vma->vm_flags & VM_PFNMAP)
+		return 1;
+
+	if (endvma > end)
+		endvma = end;
+	if (vma->vm_start > start)
+		start = vma->vm_start;
+
+	if (!(flags & MPOL_MF_DISCONTIG_OK)) {
+		if (!vma->vm_next && vma->vm_end < end)
+			return -EFAULT;
+		if (qp->prev && qp->prev->vm_end < vma->vm_start)
+			return -EFAULT;
+	}
+
+	qp->prev = vma;
+
+	if (vma->vm_flags & VM_PFNMAP)
+		return 1;
+
+	if (flags & MPOL_MF_LAZY) {
+		/* Similar to task_numa_work, skip inaccessible VMAs */
+		if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
+			change_prot_numa(vma, start, endvma);
+		return 1;
+	}
+
+	if ((flags & MPOL_MF_STRICT) ||
+	    ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
+	     vma_migratable(vma)))
+		/* queue pages from current vma */
+		return 0;
+	return 1;
+}
+
 /*
  * Walk through page tables and collect pages to be migrated.
  *
@@ -650,50 +635,24 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma,
  */
 static int
 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
-		const nodemask_t *nodes, unsigned long flags, void *private)
-{
-	int err = 0;
-	struct vm_area_struct *vma, *prev;
-
-	vma = find_vma(mm, start);
-	if (!vma)
-		return -EFAULT;
-	prev = NULL;
-	for (; vma && vma->vm_start < end; vma = vma->vm_next) {
-		unsigned long endvma = vma->vm_end;
-
-		if (endvma > end)
-			endvma = end;
-		if (vma->vm_start > start)
-			start = vma->vm_start;
-
-		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
-			if (!vma->vm_next && vma->vm_end < end)
-				return -EFAULT;
-			if (prev && prev->vm_end < vma->vm_start)
-				return -EFAULT;
-		}
-
-		if (flags & MPOL_MF_LAZY) {
-			/* Similar to task_numa_work, skip inaccessible VMAs */
-			if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
-				change_prot_numa(vma, start, endvma);
-			goto next;
-		}
-
-		if ((flags & MPOL_MF_STRICT) ||
-		    ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
-		      vma_migratable(vma))) {
-
-			err = queue_pages_pgd_range(vma, start, endvma, nodes,
-						flags, private);
-			if (err)
-				break;
-		}
-next:
-		prev = vma;
-	}
-	return err;
+		nodemask_t *nodes, unsigned long flags,
+		struct list_head *pagelist)
+{
+	struct queue_pages qp = {
+		.pagelist = pagelist,
+		.flags = flags,
+		.nmask = nodes,
+		.prev = NULL,
+	};
+	struct mm_walk queue_pages_walk = {
+		.hugetlb_entry = queue_pages_hugetlb,
+		.pmd_entry = queue_pages_pte_range,
+		.test_walk = queue_pages_test_walk,
+		.mm = mm,
+		.private = &qp,
+	};
+
+	return walk_page_range(start, end, &queue_pages_walk);
 }
 
 /*
@@ -1988,43 +1947,63 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
  * @order:Order of the GFP allocation.
  * @vma:  Pointer to VMA or NULL if not available.
  * @addr: Virtual Address of the allocation. Must be inside the VMA.
+ * @node: Which node to prefer for allocation (modulo policy).
+ * @hugepage: for hugepages try only the preferred node if possible
  *
  * This function allocates a page from the kernel page pool and applies
  * a NUMA policy associated with the VMA or the current process.
  * When VMA is not NULL caller must hold down_read on the mmap_sem of the
  * mm_struct of the VMA to prevent it from going away. Should be used for
- * all allocations for pages that will be mapped into
- * user space. Returns NULL when no page can be allocated.
- *
- * Should be called with the mm_sem of the vma hold.
+ * all allocations for pages that will be mapped into user space. Returns
+ * NULL when no page can be allocated.
  */
 struct page *
 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
-		unsigned long addr, int node)
+		unsigned long addr, int node, bool hugepage)
 {
 	struct mempolicy *pol;
 	struct page *page;
 	unsigned int cpuset_mems_cookie;
+	struct zonelist *zl;
+	nodemask_t *nmask;
 
 retry_cpuset:
 	pol = get_vma_policy(vma, addr);
 	cpuset_mems_cookie = read_mems_allowed_begin();
 
-	if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
+	if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage &&
+					pol->mode != MPOL_INTERLEAVE)) {
+		/*
+		 * For hugepage allocation and non-interleave policy which
+		 * allows the current node, we only try to allocate from the
+		 * current node and don't fall back to other nodes, as the
+		 * cost of remote accesses would likely offset THP benefits.
+		 *
+		 * If the policy is interleave, or does not allow the current
+		 * node in its nodemask, we allocate the standard way.
+		 */
+		nmask = policy_nodemask(gfp, pol);
+		if (!nmask || node_isset(node, *nmask)) {
+			mpol_cond_put(pol);
+			page = alloc_pages_exact_node(node, gfp, order);
+			goto out;
+		}
+	}
+
+	if (pol->mode == MPOL_INTERLEAVE) {
 		unsigned nid;
 
 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
 		mpol_cond_put(pol);
 		page = alloc_page_interleave(gfp, order, nid);
-		if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
-			goto retry_cpuset;
-
-		return page;
+		goto out;
 	}
-	page = __alloc_pages_nodemask(gfp, order,
-				      policy_zonelist(gfp, pol, node),
-				      policy_nodemask(gfp, pol));
+
+	nmask = policy_nodemask(gfp, pol);
+	zl = policy_zonelist(gfp, pol, node);
 	mpol_cond_put(pol);
+	page = __alloc_pages_nodemask(gfp, order, zl, nmask);
+out:
 	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
 		goto retry_cpuset;
 	return page;
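
Note: after this change, callers no longer walk page tables themselves; they hand a nodemask and a page list to queue_pages_range() and let the mm_walk callbacks do the filtering. A minimal sketch of that calling pattern, modeled on the do_mbind() caller in this file; the helper name collect_misplaced_pages() is illustrative only and does not exist in the tree:

/* Illustrative helper, not part of the patch. */
static int collect_misplaced_pages(struct mm_struct *mm,
				   unsigned long start, unsigned long end,
				   nodemask_t *nmask, unsigned long flags,
				   struct list_head *pagelist)
{
	/*
	 * The page table walk now happens inside queue_pages_range():
	 * queue_pages_test_walk() skips unsuitable VMAs, and the
	 * pmd_entry/hugetlb_entry callbacks isolate pages whose node
	 * violates *nmask onto *pagelist.
	 */
	return queue_pages_range(mm, start, end, nmask,
				 flags | MPOL_MF_INVERT, pagelist);
}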
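
Note: the new bool hugepage argument to alloc_pages_vma() lets THP fault paths ask for the preferred node only, so a huge page is not allocated remotely when the policy would allow the local node. A sketch of how such a caller might look, assuming the alloc_hugepage_vma()-style wrapper used by huge_memory.c in this series; the function name thp_alloc_example() is hypothetical:

/* Illustrative only: THP allocation with hugepage=true. */
static struct page *thp_alloc_example(struct vm_area_struct *vma,
				      unsigned long haddr, gfp_t gfp)
{
	/* hugepage=true: try only the local/preferred node (modulo policy) */
	return alloc_pages_vma(gfp, HPAGE_PMD_ORDER, vma, haddr,
			       numa_node_id(), true);
}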