7 files changed, 101 insertions, 82 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 03a89a2f464b..362c329b83fe 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2325,7 +2325,12 @@ static void collapse_huge_page(struct mm_struct *mm,
                pte_unmap(pte);
                spin_lock(&mm->page_table_lock);
                BUG_ON(!pmd_none(*pmd));
-                set_pmd_at(mm, address, pmd, _pmd);
+                /*
+                 * We can only use set_pmd_at when establishing
+                 * hugepmds and never for establishing regular pmds that
+                 * point to regular pagetables. Use pmd_populate for that.
+                 */
+                pmd_populate(mm, pmd, pmd_pgtable(_pmd));
                spin_unlock(&mm->page_table_lock);
                anon_vma_unlock_write(vma->anon_vma);
                goto out;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index cb1c9dedf9b6..010d6c14129a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4108,8 +4108,6 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
        if (mem_cgroup_disabled())
                return NULL;
-        VM_BUG_ON(PageSwapCache(page));
        if (PageTransHuge(page)) {
                nr_pages <<= compound_order(page);
                VM_BUG_ON(!PageTransHuge(page));
@@ -4205,6 +4203,18 @@ void mem_cgroup_uncharge_page(struct page *page)
        if (page_mapped(page))
                return;
        VM_BUG_ON(page->mapping && !PageAnon(page));
+        /*
+         * If the page is in swap cache, uncharge should be deferred
+         * to the swap path, which also properly accounts swap usage
+         * and handles memcg lifetime.
+         *
+         * Note that this check is not stable and reclaim may add the
+         * page to swap cache at any time after this.  However, if the
+         * page is not in swap cache by the time page->mapcount hits
+         * 0, there won't be any page table references to the swap
+         * slot, and reclaim will free it and not actually write the
+         * page to disk.
+         */
        if (PageSwapCache(page))
                return;
        __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index a221fac1f47d..1ad92b46753e 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -720,9 +720,12 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
        start = phys_start_pfn << PAGE_SHIFT;
        size = nr_pages * PAGE_SIZE;
        ret = release_mem_region_adjustable(&iomem_resource, start, size);
-        if (ret)
+        if (ret) {
-                pr_warn("Unable to release resource <%016llx-%016llx> (%d)\n",
+                resource_size_t endres = start + size - 1;
-                                start, start + size - 1, ret);
+                pr_warn("Unable to release resource <%pa-%pa> (%d)\n",
+                                &start, &endres, ret);
+        }
        sections_to_remove = nr_pages / PAGES_PER_SECTION;
        for (i = 0; i < sections_to_remove; i++) {
diff --git a/mm/migrate.c b/mm/migrate.c
index 27ed22579fd9..b1f57501de9c 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -165,7 +165,7 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
                pte = arch_make_huge_pte(pte, vma, new, 0);
        }
 #endif
-        flush_cache_page(vma, addr, pte_pfn(pte));
+        flush_dcache_page(new);
        set_pte_at(mm, addr, ptep, pte);
        if (PageHuge(new)) {
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index be04122fb277..6725ff183374 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -40,48 +40,44 @@ void __mmu_notifier_release(struct mm_struct *mm)
        int id;
        /*
-         * srcu_read_lock() here will block synchronize_srcu() in
+         * SRCU here will block mmu_notifier_unregister until
-         * mmu_notifier_unregister() until all registered
+         * ->release returns.
-         * ->release() callouts this function makes have
-         * returned.
         */
        id = srcu_read_lock(&srcu);
+        hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist)
+                /*
+                 * If ->release runs before mmu_notifier_unregister it must be
+                 * handled, as it's the only way for the driver to flush all
+                 * existing sptes and stop the driver from establishing any more
+                 * sptes before all the pages in the mm are freed.
+                 */
+                if (mn->ops->release)
+                        mn->ops->release(mn, mm);
+        srcu_read_unlock(&srcu, id);
        spin_lock(&mm->mmu_notifier_mm->lock);
        while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
                mn = hlist_entry(mm->mmu_notifier_mm->list.first,
                                 struct mmu_notifier,
                                 hlist);
                /*
-                 * Unlink.  This will prevent mmu_notifier_unregister()
+                 * We arrived before mmu_notifier_unregister so
-                 * from also making the ->release() callout.
+                 * mmu_notifier_unregister will do nothing other than to wait
+                 * for ->release to finish and for mmu_notifier_unregister to
+                 * return.
                 */
                hlist_del_init_rcu(&mn->hlist);
-                spin_unlock(&mm->mmu_notifier_mm->lock);
-                /*
-                 * Clear sptes. (see 'release' description in mmu_notifier.h)
-                 */
-                if (mn->ops->release)
-                        mn->ops->release(mn, mm);
-                spin_lock(&mm->mmu_notifier_mm->lock);
        }
        spin_unlock(&mm->mmu_notifier_mm->lock);
        /*
-         * All callouts to ->release() which we have done are complete.
+         * synchronize_srcu here prevents mmu_notifier_release from returning to
-         * Allow synchronize_srcu() in mmu_notifier_unregister() to complete
+         * exit_mmap (which would proceed with freeing all pages in the mm)
-         */
+         * until the ->release method returns, if it was invoked by
-        srcu_read_unlock(&srcu, id);
+         * mmu_notifier_unregister.
+         *
-        /*
+         * The mmu_notifier_mm can't go away from under us because one mm_count
-         * mmu_notifier_unregister() may have unlinked a notifier and may
+         * is held by exit_mmap.
-         * still be calling out to it.  Additionally, other notifiers
-         * may have been active via vmtruncate() et. al. Block here
-         * to ensure that all notifier callouts for this mm have been
-         * completed and the sptes are really cleaned up before returning
-         * to exit_mmap().
         */
        synchronize_srcu(&srcu);
 }
@@ -292,31 +288,34 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
 {
        BUG_ON(atomic_read(&mm->mm_count) <= 0);
-        spin_lock(&mm->mmu_notifier_mm->lock);
        if (!hlist_unhashed(&mn->hlist)) {
+                /*
+                 * SRCU here will force exit_mmap to wait for ->release to
+                 * finish before freeing the pages.
+                 */
                int id;
+                id = srcu_read_lock(&srcu);
                /*
-                 * Ensure we synchronize up with __mmu_notifier_release().
+                 * exit_mmap will block in mmu_notifier_release to guarantee
+                 * that ->release is called before freeing the pages.
                 */
-                id = srcu_read_lock(&srcu);
-                hlist_del_rcu(&mn->hlist);
-                spin_unlock(&mm->mmu_notifier_mm->lock);
                if (mn->ops->release)
                        mn->ops->release(mn, mm);
+                srcu_read_unlock(&srcu, id);
+                spin_lock(&mm->mmu_notifier_mm->lock);
                /*
-                 * Allow __mmu_notifier_release() to complete.
+                 * Cannot use hlist_del_rcu() since __mmu_notifier_release
+                 * can delete it before we hold the lock.
                 */
-                srcu_read_unlock(&srcu, id);
+                hlist_del_init_rcu(&mn->hlist);
-        } else
                spin_unlock(&mm->mmu_notifier_mm->lock);
+        }
        /*
-         * Wait for any running method to finish, including ->release() if it
+         * Wait for any running method to finish, of course including
-         * was run by __mmu_notifier_release() instead of us.
+         * ->release if it was run by mmu_notifier_release instead of us.
         */
        synchronize_srcu(&srcu);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 98cbdf6e5532..378a15bcd649 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5158,7 +5158,7 @@ unsigned long free_reserved_area(unsigned long start, unsigned long end,
        for (pages = 0; pos < end; pos += PAGE_SIZE, pages++) {
                if (poison)
                        memset((void *)pos, poison, PAGE_SIZE);
-                free_reserved_page(virt_to_page(pos));
+                free_reserved_page(virt_to_page((void *)pos));
        }
        if (pages && s)
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 35aa294656cd..5da2cbcfdbb5 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -127,28 +127,7 @@ static int walk_hugetlb_range(struct vm_area_struct *vma,
        return 0;
 }
-static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk)
-{
-        struct vm_area_struct *vma;
-        /* We don't need vma lookup at all. */
-        if (!walk->hugetlb_entry)
-                return NULL;
-        VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem));
-        vma = find_vma(walk->mm, addr);
-        if (vma && vma->vm_start <= addr && is_vm_hugetlb_page(vma))
-                return vma;
-        return NULL;
-}
 #else /* CONFIG_HUGETLB_PAGE */
-static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk)
-{
-        return NULL;
-}
 static int walk_hugetlb_range(struct vm_area_struct *vma,
                              unsigned long addr, unsigned long end,
                              struct mm_walk *walk)
@@ -198,30 +177,53 @@ int walk_page_range(unsigned long addr, unsigned long end,
        if (!walk->mm)
                return -EINVAL;
+        VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem));
        pgd = pgd_offset(walk->mm, addr);
        do {
-                struct vm_area_struct *vma;
+                struct vm_area_struct *vma = NULL;
                next = pgd_addr_end(addr, end);
                /*
-                 * handle hugetlb vma individually because pagetable walk for
+                 * This function was not intended to be vma based.
-                 * the hugetlb page is dependent on the architecture and
+                 * But there are vma special cases to be handled:
-                 * we can't handled it in the same manner as non-huge pages.
+                 * - hugetlb vma's
+                 * - VM_PFNMAP vma's
                 */
-                vma = hugetlb_vma(addr, walk);
+                vma = find_vma(walk->mm, addr);
                if (vma) {
-                        if (vma->vm_end < next)
+                        /*
+                         * There are no page structures backing a VM_PFNMAP
+                         * range, so do not allow split_huge_page_pmd().
+                         */
+                        if ((vma->vm_start <= addr) &&
+                            (vma->vm_flags & VM_PFNMAP)) {
                                next = vma->vm_end;
+                                pgd = pgd_offset(walk->mm, next);
+                                continue;
+                        }
                        /*
-                         * Hugepage is very tightly coupled with vma, so
+                         * Handle hugetlb vma individually because pagetable
-                         * walk through hugetlb entries within a given vma.
+                         * walk for the hugetlb page is dependent on the
+                         * architecture and we can't handle it in the same
+                         * manner as non-huge pages.
                         */
-                        err = walk_hugetlb_range(vma, addr, next, walk);
+                        if (walk->hugetlb_entry && (vma->vm_start <= addr) &&
-                        if (err)
+                            is_vm_hugetlb_page(vma)) {
-                                break;
+                                if (vma->vm_end < next)
-                        pgd = pgd_offset(walk->mm, next);
+                                        next = vma->vm_end;
-                        continue;
+                                /*
+                                 * Hugepage is very tightly coupled with vma,
+                                 * so walk through hugetlb entries within a
+                                 * given vma.
+                                 */
+                                err = walk_hugetlb_range(vma, addr, next, walk);
+                                if (err)
+                                        break;
+                                pgd = pgd_offset(walk->mm, next);
+                                continue;
+                        }
                }
                if (pgd_none_or_clear_bad(pgd)) {

diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 03a89a2f464b..362c329b83fe 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c
@@ -2325,7 +2325,12 @@ static void collapse_huge_page(struct mm_struct *mm,
2325	pte_unmap(pte);	2325	pte_unmap(pte);
2326	spin_lock(&mm->page_table_lock);	2326	spin_lock(&mm->page_table_lock);
2327	BUG_ON(!pmd_none(*pmd));	2327	BUG_ON(!pmd_none(*pmd));
2328	set_pmd_at(mm, address, pmd, _pmd);	2328	/*
		2329	* We can only use set_pmd_at when establishing
		2330	* hugepmds and never for establishing regular pmds that
		2331	* point to regular pagetables. Use pmd_populate for that.
		2332	*/
		2333	pmd_populate(mm, pmd, pmd_pgtable(_pmd));
2329	spin_unlock(&mm->page_table_lock);	2334	spin_unlock(&mm->page_table_lock);
2330	anon_vma_unlock_write(vma->anon_vma);	2335	anon_vma_unlock_write(vma->anon_vma);
2331	goto out;	2336	goto out;


diff --git a/mm/memcontrol.c b/mm/memcontrol.c index cb1c9dedf9b6..010d6c14129a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c
@@ -4108,8 +4108,6 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
4108	if (mem_cgroup_disabled())	4108	if (mem_cgroup_disabled())
4109	return NULL;	4109	return NULL;
4110		4110
4111	VM_BUG_ON(PageSwapCache(page));
4112
4113	if (PageTransHuge(page)) {	4111	if (PageTransHuge(page)) {
4114	nr_pages <<= compound_order(page);	4112	nr_pages <<= compound_order(page);
4115	VM_BUG_ON(!PageTransHuge(page));	4113	VM_BUG_ON(!PageTransHuge(page));
@@ -4205,6 +4203,18 @@ void mem_cgroup_uncharge_page(struct page *page)
4205	if (page_mapped(page))	4203	if (page_mapped(page))
4206	return;	4204	return;
4207	VM_BUG_ON(page->mapping && !PageAnon(page));	4205	VM_BUG_ON(page->mapping && !PageAnon(page));
		4206	/*
		4207	* If the page is in swap cache, uncharge should be deferred
		4208	* to the swap path, which also properly accounts swap usage
		4209	* and handles memcg lifetime.
		4210	*
		4211	* Note that this check is not stable and reclaim may add the
		4212	* page to swap cache at any time after this. However, if the
		4213	* page is not in swap cache by the time page->mapcount hits
		4214	* 0, there won't be any page table references to the swap
		4215	* slot, and reclaim will free it and not actually write the
		4216	* page to disk.
		4217	*/
4208	if (PageSwapCache(page))	4218	if (PageSwapCache(page))
4209	return;	4219	return;
4210	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false);	4220	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false);


diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index a221fac1f47d..1ad92b46753e 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c
@@ -720,9 +720,12 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
720	start = phys_start_pfn << PAGE_SHIFT;	720	start = phys_start_pfn << PAGE_SHIFT;
721	size = nr_pages * PAGE_SIZE;	721	size = nr_pages * PAGE_SIZE;
722	ret = release_mem_region_adjustable(&iomem_resource, start, size);	722	ret = release_mem_region_adjustable(&iomem_resource, start, size);
723	if (ret)	723	if (ret) {
724	pr_warn("Unable to release resource <%016llx-%016llx> (%d)\n",	724	resource_size_t endres = start + size - 1;
725	start, start + size - 1, ret);	725
		726	pr_warn("Unable to release resource <%pa-%pa> (%d)\n",
		727	&start, &endres, ret);
		728	}
726		729
727	sections_to_remove = nr_pages / PAGES_PER_SECTION;	730	sections_to_remove = nr_pages / PAGES_PER_SECTION;
728	for (i = 0; i < sections_to_remove; i++) {	731	for (i = 0; i < sections_to_remove; i++) {


diff --git a/mm/migrate.c b/mm/migrate.c index 27ed22579fd9..b1f57501de9c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c
@@ -165,7 +165,7 @@ static int remove_migration_pte(struct page new, struct vm_area_struct vma,
165	pte = arch_make_huge_pte(pte, vma, new, 0);	165	pte = arch_make_huge_pte(pte, vma, new, 0);
166	}	166	}
167	#endif	167	#endif
168	flush_cache_page(vma, addr, pte_pfn(pte));	168	flush_dcache_page(new);
169	set_pte_at(mm, addr, ptep, pte);	169	set_pte_at(mm, addr, ptep, pte);
170		170
171	if (PageHuge(new)) {	171	if (PageHuge(new)) {


diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index be04122fb277..6725ff183374 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c
@@ -40,48 +40,44 @@ void __mmu_notifier_release(struct mm_struct *mm)
40	int id;	40	int id;
41		41
42	/*	42	/*
43	* srcu_read_lock() here will block synchronize_srcu() in	43	* SRCU here will block mmu_notifier_unregister until
44	* mmu_notifier_unregister() until all registered	44	* ->release returns.
45	* ->release() callouts this function makes have
46	* returned.
47	*/	45	*/
48	id = srcu_read_lock(&srcu);	46	id = srcu_read_lock(&srcu);
		47	hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist)
		48	/*
		49	* If ->release runs before mmu_notifier_unregister it must be
		50	* handled, as it's the only way for the driver to flush all
		51	* existing sptes and stop the driver from establishing any more
		52	* sptes before all the pages in the mm are freed.
		53	*/
		54	if (mn->ops->release)
		55	mn->ops->release(mn, mm);
		56	srcu_read_unlock(&srcu, id);
		57
49	spin_lock(&mm->mmu_notifier_mm->lock);	58	spin_lock(&mm->mmu_notifier_mm->lock);
50	while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {	59	while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
51	mn = hlist_entry(mm->mmu_notifier_mm->list.first,	60	mn = hlist_entry(mm->mmu_notifier_mm->list.first,
52	struct mmu_notifier,	61	struct mmu_notifier,
53	hlist);	62	hlist);
54
55	/*	63	/*
56	* Unlink. This will prevent mmu_notifier_unregister()	64	* We arrived before mmu_notifier_unregister so
57	* from also making the ->release() callout.	65	* mmu_notifier_unregister will do nothing other than to wait
		66	* for ->release to finish and for mmu_notifier_unregister to
		67	* return.
58	*/	68	*/
59	hlist_del_init_rcu(&mn->hlist);	69	hlist_del_init_rcu(&mn->hlist);
60	spin_unlock(&mm->mmu_notifier_mm->lock);
61
62	/*
63	* Clear sptes. (see 'release' description in mmu_notifier.h)
64	*/
65	if (mn->ops->release)
66	mn->ops->release(mn, mm);
67
68	spin_lock(&mm->mmu_notifier_mm->lock);
69	}	70	}
70	spin_unlock(&mm->mmu_notifier_mm->lock);	71	spin_unlock(&mm->mmu_notifier_mm->lock);
71		72
72	/*	73	/*
73	* All callouts to ->release() which we have done are complete.	74	* synchronize_srcu here prevents mmu_notifier_release from returning to
74	* Allow synchronize_srcu() in mmu_notifier_unregister() to complete	75	* exit_mmap (which would proceed with freeing all pages in the mm)
75	*/	76	* until the ->release method returns, if it was invoked by
76	srcu_read_unlock(&srcu, id);	77	* mmu_notifier_unregister.
77		78	*
78	/*	79	* The mmu_notifier_mm can't go away from under us because one mm_count
79	* mmu_notifier_unregister() may have unlinked a notifier and may	80	* is held by exit_mmap.
80	* still be calling out to it. Additionally, other notifiers
81	* may have been active via vmtruncate() et. al. Block here
82	* to ensure that all notifier callouts for this mm have been
83	* completed and the sptes are really cleaned up before returning
84	* to exit_mmap().
85	*/	81	*/
86	synchronize_srcu(&srcu);	82	synchronize_srcu(&srcu);
87	}	83	}
@@ -292,31 +288,34 @@ void mmu_notifier_unregister(struct mmu_notifier mn, struct mm_struct mm)
292	{	288	{
293	BUG_ON(atomic_read(&mm->mm_count) <= 0);	289	BUG_ON(atomic_read(&mm->mm_count) <= 0);
294		290
295	spin_lock(&mm->mmu_notifier_mm->lock);
296	if (!hlist_unhashed(&mn->hlist)) {	291	if (!hlist_unhashed(&mn->hlist)) {
		292	/*
		293	* SRCU here will force exit_mmap to wait for ->release to
		294	* finish before freeing the pages.
		295	*/
297	int id;	296	int id;
298		297
		298	id = srcu_read_lock(&srcu);
299	/*	299	/*
300	* Ensure we synchronize up with __mmu_notifier_release().	300	* exit_mmap will block in mmu_notifier_release to guarantee
		301	* that ->release is called before freeing the pages.
301	*/	302	*/
302	id = srcu_read_lock(&srcu);
303
304	hlist_del_rcu(&mn->hlist);
305	spin_unlock(&mm->mmu_notifier_mm->lock);
306
307	if (mn->ops->release)	303	if (mn->ops->release)
308	mn->ops->release(mn, mm);	304	mn->ops->release(mn, mm);
		305	srcu_read_unlock(&srcu, id);
309		306
		307	spin_lock(&mm->mmu_notifier_mm->lock);
310	/*	308	/*
311	* Allow __mmu_notifier_release() to complete.	309	* Cannot use hlist_del_rcu() since __mmu_notifier_release
		310	* can delete it before we hold the lock.
312	*/	311	*/
313	srcu_read_unlock(&srcu, id);	312	hlist_del_init_rcu(&mn->hlist);
314	} else
315	spin_unlock(&mm->mmu_notifier_mm->lock);	313	spin_unlock(&mm->mmu_notifier_mm->lock);
		314	}
316		315
317	/*	316	/*
318	* Wait for any running method to finish, including ->release() if it	317	* Wait for any running method to finish, of course including
319	* was run by __mmu_notifier_release() instead of us.	318	* ->release if it was run by mmu_notifier_release instead of us.
320	*/	319	*/
321	synchronize_srcu(&srcu);	320	synchronize_srcu(&srcu);
322		321


diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 98cbdf6e5532..378a15bcd649 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c
@@ -5158,7 +5158,7 @@ unsigned long free_reserved_area(unsigned long start, unsigned long end,
5158	for (pages = 0; pos < end; pos += PAGE_SIZE, pages++) {	5158	for (pages = 0; pos < end; pos += PAGE_SIZE, pages++) {
5159	if (poison)	5159	if (poison)
5160	memset((void *)pos, poison, PAGE_SIZE);	5160	memset((void *)pos, poison, PAGE_SIZE);
5161	free_reserved_page(virt_to_page(pos));	5161	free_reserved_page(virt_to_page((void *)pos));
5162	}	5162	}
5163		5163
5164	if (pages && s)	5164	if (pages && s)


diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 35aa294656cd..5da2cbcfdbb5 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c
@@ -127,28 +127,7 @@ static int walk_hugetlb_range(struct vm_area_struct *vma,
127	return 0;	127	return 0;
128	}	128	}
129		129
130	static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk)
131	{
132	struct vm_area_struct *vma;
133
134	/* We don't need vma lookup at all. */
135	if (!walk->hugetlb_entry)
136	return NULL;
137
138	VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem));
139	vma = find_vma(walk->mm, addr);
140	if (vma && vma->vm_start <= addr && is_vm_hugetlb_page(vma))
141	return vma;
142
143	return NULL;
144	}
145
146	#else /* CONFIG_HUGETLB_PAGE */	130	#else /* CONFIG_HUGETLB_PAGE */
147	static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk)
148	{
149	return NULL;
150	}
151
152	static int walk_hugetlb_range(struct vm_area_struct *vma,	131	static int walk_hugetlb_range(struct vm_area_struct *vma,
153	unsigned long addr, unsigned long end,	132	unsigned long addr, unsigned long end,
154	struct mm_walk *walk)	133	struct mm_walk *walk)
@@ -198,30 +177,53 @@ int walk_page_range(unsigned long addr, unsigned long end,
198	if (!walk->mm)	177	if (!walk->mm)
199	return -EINVAL;	178	return -EINVAL;
200		179
		180	VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem));
		181
201	pgd = pgd_offset(walk->mm, addr);	182	pgd = pgd_offset(walk->mm, addr);
202	do {	183	do {
203	struct vm_area_struct *vma;	184	struct vm_area_struct *vma = NULL;
204		185
205	next = pgd_addr_end(addr, end);	186	next = pgd_addr_end(addr, end);
206		187
207	/*	188	/*
208	* handle hugetlb vma individually because pagetable walk for	189	* This function was not intended to be vma based.
209	* the hugetlb page is dependent on the architecture and	190	* But there are vma special cases to be handled:
210	* we can't handled it in the same manner as non-huge pages.	191	* - hugetlb vma's
		192	* - VM_PFNMAP vma's
211	*/	193	*/
212	vma = hugetlb_vma(addr, walk);	194	vma = find_vma(walk->mm, addr);
213	if (vma) {	195	if (vma) {
214	if (vma->vm_end < next)	196	/*
		197	* There are no page structures backing a VM_PFNMAP
		198	* range, so do not allow split_huge_page_pmd().
		199	*/
		200	if ((vma->vm_start <= addr) &&
		201	(vma->vm_flags & VM_PFNMAP)) {
215	next = vma->vm_end;	202	next = vma->vm_end;
		203	pgd = pgd_offset(walk->mm, next);
		204	continue;
		205	}
216	/*	206	/*
217	* Hugepage is very tightly coupled with vma, so	207	* Handle hugetlb vma individually because pagetable
218	* walk through hugetlb entries within a given vma.	208	* walk for the hugetlb page is dependent on the
		209	* architecture and we can't handle it in the same
		210	* manner as non-huge pages.
219	*/	211	*/
220	err = walk_hugetlb_range(vma, addr, next, walk);	212	if (walk->hugetlb_entry && (vma->vm_start <= addr) &&
221	if (err)	213	is_vm_hugetlb_page(vma)) {
222	break;	214	if (vma->vm_end < next)
223	pgd = pgd_offset(walk->mm, next);	215	next = vma->vm_end;
224	continue;	216	/*
		217	* Hugepage is very tightly coupled with vma,
		218	* so walk through hugetlb entries within a
		219	* given vma.
		220	*/
		221	err = walk_hugetlb_range(vma, addr, next, walk);
		222	if (err)
		223	break;
		224	pgd = pgd_offset(walk->mm, next);
		225	continue;
		226	}
225	}	227	}
226		228
227	if (pgd_none_or_clear_bad(pgd)) {	229	if (pgd_none_or_clear_bad(pgd)) {