author		Christoph Hellwig <hch@lst.de>		2019-08-28 10:19:54 -0400
committer	Jason Gunthorpe <jgg@mellanox.com>	2019-09-07 03:28:04 -0400
commit		7b86ac3371b70c3fd8fd95501719beb1faab719f (patch)
tree		b7f61e4615d249563f09567a22ee399634c898dd
parent		a520110e4a15ceb385304d9cab22bb51438f6080 (diff)
pagewalk: separate function pointers from iterator data
The mm_walk structure currently mixes data and code. Split out the
operations vectors into a new mm_walk_ops structure, and while we are
changing the API also declare the mm_walk structure inside the
walk_page_range and walk_page_vma functions.

Based on patch from Linus Torvalds.

Link: https://lore.kernel.org/r/20190828141955.22210-3-hch@lst.de
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Thomas Hellstrom <thellstrom@vmware.com>
Reviewed-by: Steven Price <steven.price@arm.com>
Reviewed-by: Jason Gunthorpe <jgg@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
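[Editor's note] Every caller converted below follows the same pattern: the on-stack struct mm_walk that bundled the callbacks together with .mm and .private becomes a file-scope const struct mm_walk_ops table, and the iterator state moves into the walk_page_range()/walk_page_vma() arguments. A minimal before/after sketch of that pattern; the count_pte callback and count_ptes_* helpers are hypothetical illustrations targeting the old and new API respectively, not code from this patch:

#include <linux/mm.h>
#include <linux/pagewalk.h>

/* Callback shape is unchanged: private data is still reached via walk->private. */
static int count_pte(pte_t *pte, unsigned long addr,
		     unsigned long next, struct mm_walk *walk)
{
	(*(unsigned long *)walk->private)++;
	return 0;
}

/* Before this patch: callbacks and iterator data mixed in one stack struct. */
static unsigned long count_ptes_old(struct mm_struct *mm,
				    unsigned long start, unsigned long end)
{
	unsigned long count = 0;
	struct mm_walk walk = {
		.pte_entry = count_pte,
		.mm = mm,
		.private = &count,
	};

	/* caller must hold mm->mmap_sem */
	walk_page_range(start, end, &walk);
	return count;
}

/* After this patch: a shared const ops table, iterator state passed as arguments. */
static const struct mm_walk_ops count_walk_ops = {
	.pte_entry = count_pte,
};

static unsigned long count_ptes_new(struct mm_struct *mm,
				    unsigned long start, unsigned long end)
{
	unsigned long count = 0;

	/* caller must hold mm->mmap_sem */
	walk_page_range(mm, start, end, &count_walk_ops, &count);
	return count;
}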
-rw-r--r--  arch/openrisc/kernel/dma.c               |  22
-rw-r--r--  arch/powerpc/mm/book3s64/subpage_prot.c  |  10
-rw-r--r--  arch/s390/mm/gmap.c                      |  33
-rw-r--r--  fs/proc/task_mmu.c                       |  78
-rw-r--r--  include/linux/pagewalk.h                 |  64
-rw-r--r--  mm/hmm.c                                 |  23
-rw-r--r--  mm/madvise.c                             |  41
-rw-r--r--  mm/memcontrol.c                          |  23
-rw-r--r--  mm/mempolicy.c                           |  15
-rw-r--r--  mm/migrate.c                             |  23
-rw-r--r--  mm/mincore.c                             |  15
-rw-r--r--  mm/mprotect.c                            |  24
-rw-r--r--  mm/pagewalk.c                            | 124
13 files changed, 251 insertions, 244 deletions
diff --git a/arch/openrisc/kernel/dma.c b/arch/openrisc/kernel/dma.c
index c7812e6effa2..4d5b8bd1d795 100644
--- a/arch/openrisc/kernel/dma.c
+++ b/arch/openrisc/kernel/dma.c
@@ -44,6 +44,10 @@ page_set_nocache(pte_t *pte, unsigned long addr,
 	return 0;
 }
 
+static const struct mm_walk_ops set_nocache_walk_ops = {
+	.pte_entry = page_set_nocache,
+};
+
 static int
 page_clear_nocache(pte_t *pte, unsigned long addr,
 		   unsigned long next, struct mm_walk *walk)
@@ -59,6 +63,10 @@ page_clear_nocache(pte_t *pte, unsigned long addr,
 	return 0;
 }
 
+static const struct mm_walk_ops clear_nocache_walk_ops = {
+	.pte_entry = page_clear_nocache,
+};
+
 /*
  * Alloc "coherent" memory, which for OpenRISC means simply uncached.
  *
@@ -81,10 +89,6 @@ arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
 {
 	unsigned long va;
 	void *page;
-	struct mm_walk walk = {
-		.pte_entry = page_set_nocache,
-		.mm = &init_mm
-	};
 
 	page = alloc_pages_exact(size, gfp | __GFP_ZERO);
 	if (!page)
@@ -99,7 +103,8 @@ arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
 	 * We need to iterate through the pages, clearing the dcache for
 	 * them and setting the cache-inhibit bit.
 	 */
-	if (walk_page_range(va, va + size, &walk)) {
+	if (walk_page_range(&init_mm, va, va + size, &set_nocache_walk_ops,
+			NULL)) {
 		free_pages_exact(page, size);
 		return NULL;
 	}
@@ -112,13 +117,10 @@ arch_dma_free(struct device *dev, size_t size, void *vaddr,
 		dma_addr_t dma_handle, unsigned long attrs)
 {
 	unsigned long va = (unsigned long)vaddr;
-	struct mm_walk walk = {
-		.pte_entry = page_clear_nocache,
-		.mm = &init_mm
-	};
 
 	/* walk_page_range shouldn't be able to fail here */
-	WARN_ON(walk_page_range(va, va + size, &walk));
+	WARN_ON(walk_page_range(&init_mm, va, va + size,
+			&clear_nocache_walk_ops, NULL));
 
 	free_pages_exact(vaddr, size);
 }
diff --git a/arch/powerpc/mm/book3s64/subpage_prot.c b/arch/powerpc/mm/book3s64/subpage_prot.c
index 236f0a861ecc..2ef24a53f4c9 100644
--- a/arch/powerpc/mm/book3s64/subpage_prot.c
+++ b/arch/powerpc/mm/book3s64/subpage_prot.c
@@ -139,14 +139,14 @@ static int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr,
 	return 0;
 }
 
+static const struct mm_walk_ops subpage_walk_ops = {
+	.pmd_entry = subpage_walk_pmd_entry,
+};
+
 static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
 				    unsigned long len)
 {
 	struct vm_area_struct *vma;
-	struct mm_walk subpage_proto_walk = {
-		.mm = mm,
-		.pmd_entry = subpage_walk_pmd_entry,
-	};
 
 	/*
 	 * We don't try too hard, we just mark all the vma in that range
@@ -163,7 +163,7 @@ static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
 		if (vma->vm_start >= (addr + len))
 			break;
 		vma->vm_flags |= VM_NOHUGEPAGE;
-		walk_page_vma(vma, &subpage_proto_walk);
+		walk_page_vma(vma, &subpage_walk_ops, NULL);
 		vma = vma->vm_next;
 	}
 }
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index cf80feae970d..bd78d504fdad 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -2521,13 +2521,9 @@ static int __zap_zero_pages(pmd_t *pmd, unsigned long start,
 	return 0;
 }
 
-static inline void zap_zero_pages(struct mm_struct *mm)
-{
-	struct mm_walk walk = { .pmd_entry = __zap_zero_pages };
-
-	walk.mm = mm;
-	walk_page_range(0, TASK_SIZE, &walk);
-}
+static const struct mm_walk_ops zap_zero_walk_ops = {
+	.pmd_entry = __zap_zero_pages,
+};
 
 /*
  * switch on pgstes for its userspace process (for kvm)
@@ -2546,7 +2542,7 @@ int s390_enable_sie(void)
 	mm->context.has_pgste = 1;
 	/* split thp mappings and disable thp for future mappings */
 	thp_split_mm(mm);
-	zap_zero_pages(mm);
+	walk_page_range(mm, 0, TASK_SIZE, &zap_zero_walk_ops, NULL);
 	up_write(&mm->mmap_sem);
 	return 0;
 }
@@ -2589,12 +2585,13 @@ static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr,
 	return 0;
 }
 
+static const struct mm_walk_ops enable_skey_walk_ops = {
+	.hugetlb_entry = __s390_enable_skey_hugetlb,
+	.pte_entry = __s390_enable_skey_pte,
+};
+
 int s390_enable_skey(void)
 {
-	struct mm_walk walk = {
-		.hugetlb_entry = __s390_enable_skey_hugetlb,
-		.pte_entry = __s390_enable_skey_pte,
-	};
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma;
 	int rc = 0;
@@ -2614,8 +2611,7 @@ int s390_enable_skey(void)
 	}
 	mm->def_flags &= ~VM_MERGEABLE;
 
-	walk.mm = mm;
-	walk_page_range(0, TASK_SIZE, &walk);
+	walk_page_range(mm, 0, TASK_SIZE, &enable_skey_walk_ops, NULL);
 
 out_up:
 	up_write(&mm->mmap_sem);
@@ -2633,13 +2629,14 @@ static int __s390_reset_cmma(pte_t *pte, unsigned long addr,
 	return 0;
 }
 
+static const struct mm_walk_ops reset_cmma_walk_ops = {
+	.pte_entry = __s390_reset_cmma,
+};
+
 void s390_reset_cmma(struct mm_struct *mm)
 {
-	struct mm_walk walk = { .pte_entry = __s390_reset_cmma };
-
 	down_write(&mm->mmap_sem);
-	walk.mm = mm;
-	walk_page_range(0, TASK_SIZE, &walk);
+	walk_page_range(mm, 0, TASK_SIZE, &reset_cmma_walk_ops, NULL);
 	up_write(&mm->mmap_sem);
 }
 EXPORT_SYMBOL_GPL(s390_reset_cmma);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 8857da830b86..bf43d1d60059 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -513,7 +513,9 @@ static int smaps_pte_hole(unsigned long addr, unsigned long end,
 
 	return 0;
 }
-#endif
+#else
+#define smaps_pte_hole NULL
+#endif /* CONFIG_SHMEM */
 
 static void smaps_pte_entry(pte_t *pte, unsigned long addr,
 		struct mm_walk *walk)
@@ -729,21 +731,24 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
 	}
 	return 0;
 }
+#else
+#define smaps_hugetlb_range NULL
 #endif /* HUGETLB_PAGE */
 
+static const struct mm_walk_ops smaps_walk_ops = {
+	.pmd_entry = smaps_pte_range,
+	.hugetlb_entry = smaps_hugetlb_range,
+};
+
+static const struct mm_walk_ops smaps_shmem_walk_ops = {
+	.pmd_entry = smaps_pte_range,
+	.hugetlb_entry = smaps_hugetlb_range,
+	.pte_hole = smaps_pte_hole,
+};
+
 static void smap_gather_stats(struct vm_area_struct *vma,
 		struct mem_size_stats *mss)
 {
-	struct mm_walk smaps_walk = {
-		.pmd_entry = smaps_pte_range,
-#ifdef CONFIG_HUGETLB_PAGE
-		.hugetlb_entry = smaps_hugetlb_range,
-#endif
-		.mm = vma->vm_mm,
-	};
-
-	smaps_walk.private = mss;
-
 #ifdef CONFIG_SHMEM
 	/* In case of smaps_rollup, reset the value from previous vma */
 	mss->check_shmem_swap = false;
@@ -765,12 +770,13 @@ static void smap_gather_stats(struct vm_area_struct *vma,
 			mss->swap += shmem_swapped;
 		} else {
 			mss->check_shmem_swap = true;
-			smaps_walk.pte_hole = smaps_pte_hole;
+			walk_page_vma(vma, &smaps_shmem_walk_ops, mss);
+			return;
 		}
 	}
 #endif
 	/* mmap_sem is held in m_start */
-	walk_page_vma(vma, &smaps_walk);
+	walk_page_vma(vma, &smaps_walk_ops, mss);
 }
 
 #define SEQ_PUT_DEC(str, val) \
@@ -1118,6 +1124,11 @@ static int clear_refs_test_walk(unsigned long start, unsigned long end,
 	return 0;
 }
 
+static const struct mm_walk_ops clear_refs_walk_ops = {
+	.pmd_entry = clear_refs_pte_range,
+	.test_walk = clear_refs_test_walk,
+};
+
 static ssize_t clear_refs_write(struct file *file, const char __user *buf,
 				size_t count, loff_t *ppos)
 {
@@ -1151,12 +1162,6 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
 		struct clear_refs_private cp = {
 			.type = type,
 		};
-		struct mm_walk clear_refs_walk = {
-			.pmd_entry = clear_refs_pte_range,
-			.test_walk = clear_refs_test_walk,
-			.mm = mm,
-			.private = &cp,
-		};
 
 		if (type == CLEAR_REFS_MM_HIWATER_RSS) {
 			if (down_write_killable(&mm->mmap_sem)) {
@@ -1217,7 +1222,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
 					0, NULL, mm, 0, -1UL);
 			mmu_notifier_invalidate_range_start(&range);
 		}
-		walk_page_range(0, mm->highest_vm_end, &clear_refs_walk);
+		walk_page_range(mm, 0, mm->highest_vm_end, &clear_refs_walk_ops,
+				&cp);
 		if (type == CLEAR_REFS_SOFT_DIRTY)
 			mmu_notifier_invalidate_range_end(&range);
 		tlb_finish_mmu(&tlb, 0, -1);
@@ -1489,8 +1495,16 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask,
 
 	return err;
 }
+#else
+#define pagemap_hugetlb_range NULL
 #endif /* HUGETLB_PAGE */
 
+static const struct mm_walk_ops pagemap_ops = {
+	.pmd_entry = pagemap_pmd_range,
+	.pte_hole = pagemap_pte_hole,
+	.hugetlb_entry = pagemap_hugetlb_range,
+};
+
 /*
  * /proc/pid/pagemap - an array mapping virtual pages to pfns
  *
@@ -1522,7 +1536,6 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
 {
 	struct mm_struct *mm = file->private_data;
 	struct pagemapread pm;
-	struct mm_walk pagemap_walk = {};
 	unsigned long src;
 	unsigned long svpfn;
 	unsigned long start_vaddr;
@@ -1550,14 +1563,6 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
 	if (!pm.buffer)
 		goto out_mm;
 
-	pagemap_walk.pmd_entry = pagemap_pmd_range;
-	pagemap_walk.pte_hole = pagemap_pte_hole;
-#ifdef CONFIG_HUGETLB_PAGE
-	pagemap_walk.hugetlb_entry = pagemap_hugetlb_range;
-#endif
-	pagemap_walk.mm = mm;
-	pagemap_walk.private = &pm;
-
 	src = *ppos;
 	svpfn = src / PM_ENTRY_BYTES;
 	start_vaddr = svpfn << PAGE_SHIFT;
@@ -1586,7 +1591,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
 		ret = down_read_killable(&mm->mmap_sem);
 		if (ret)
 			goto out_free;
-		ret = walk_page_range(start_vaddr, end, &pagemap_walk);
+		ret = walk_page_range(mm, start_vaddr, end, &pagemap_ops, &pm);
 		up_read(&mm->mmap_sem);
 		start_vaddr = end;
 
@@ -1798,6 +1803,11 @@ static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
 }
 #endif
 
+static const struct mm_walk_ops show_numa_ops = {
+	.hugetlb_entry = gather_hugetlb_stats,
+	.pmd_entry = gather_pte_stats,
+};
+
 /*
  * Display pages allocated per node and memory policy via /proc.
 */
@@ -1809,12 +1819,6 @@ static int show_numa_map(struct seq_file *m, void *v)
 	struct numa_maps *md = &numa_priv->md;
 	struct file *file = vma->vm_file;
 	struct mm_struct *mm = vma->vm_mm;
-	struct mm_walk walk = {
-		.hugetlb_entry = gather_hugetlb_stats,
-		.pmd_entry = gather_pte_stats,
-		.private = md,
-		.mm = mm,
-	};
 	struct mempolicy *pol;
 	char buffer[64];
 	int nid;
@@ -1848,7 +1852,7 @@ static int show_numa_map(struct seq_file *m, void *v)
 		seq_puts(m, " huge");
 
 	/* mmap_sem is held by m_start */
-	walk_page_vma(vma, &walk);
+	walk_page_vma(vma, &show_numa_ops, md);
 
 	if (!md->pages)
 		goto out;
diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h
index df278a94086d..bddd9759bab9 100644
--- a/include/linux/pagewalk.h
+++ b/include/linux/pagewalk.h
@@ -4,31 +4,28 @@
 
 #include <linux/mm.h>
 
+struct mm_walk;
+
 /**
- * mm_walk - callbacks for walk_page_range
+ * mm_walk_ops - callbacks for walk_page_range
  * @pud_entry:		if set, called for each non-empty PUD (2nd-level) entry
  *			this handler should only handle pud_trans_huge() puds.
  *			the pmd_entry or pte_entry callbacks will be used for
  *			regular PUDs.
  * @pmd_entry:		if set, called for each non-empty PMD (3rd-level) entry
  *			this handler is required to be able to handle
  *			pmd_trans_huge() pmds. They may simply choose to
  *			split_huge_page() instead of handling it explicitly.
  * @pte_entry:		if set, called for each non-empty PTE (4th-level) entry
  * @pte_hole:		if set, called for each hole at all levels
  * @hugetlb_entry:	if set, called for each hugetlb entry
  * @test_walk:		caller specific callback function to determine whether
- *			we walk over the current vma or not. Returning 0
- *			value means "do page table walk over the current vma,"
- *			and a negative one means "abort current page table walk
- *			right now." 1 means "skip the current vma."
- * @mm:		mm_struct representing the target process of page table walk
- * @vma:	vma currently walked (NULL if walking outside vmas)
- * @private:	private data for callbacks' usage
- *
- * (see the comment on walk_page_range() for more details)
+ *			we walk over the current vma or not. Returning 0 means
+ *			"do page table walk over the current vma", returning
+ *			a negative value means "abort current page table walk
+ *			right now" and returning 1 means "skip the current vma"
  */
-struct mm_walk {
+struct mm_walk_ops {
 	int (*pud_entry)(pud_t *pud, unsigned long addr,
 			 unsigned long next, struct mm_walk *walk);
 	int (*pmd_entry)(pmd_t *pmd, unsigned long addr,
@@ -42,13 +39,28 @@ struct mm_walk {
 			 struct mm_walk *walk);
 	int (*test_walk)(unsigned long addr, unsigned long next,
 			struct mm_walk *walk);
+};
+
+/**
+ * mm_walk - walk_page_range data
+ * @ops:	operation to call during the walk
+ * @mm:		mm_struct representing the target process of page table walk
+ * @vma:	vma currently walked (NULL if walking outside vmas)
+ * @private:	private data for callbacks' usage
+ *
+ * (see the comment on walk_page_range() for more details)
+ */
+struct mm_walk {
+	const struct mm_walk_ops *ops;
 	struct mm_struct *mm;
 	struct vm_area_struct *vma;
 	void *private;
 };
 
-int walk_page_range(unsigned long addr, unsigned long end,
-		struct mm_walk *walk);
-int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk);
+int walk_page_range(struct mm_struct *mm, unsigned long start,
+		unsigned long end, const struct mm_walk_ops *ops,
+		void *private);
+int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
+		void *private);
 
 #endif /* _LINUX_PAGEWALK_H */
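[Editor's note] With the header above, any mm_walk_ops member left NULL is simply skipped by the walker, and test_walk steers the walk per vma (0 = walk it, 1 = skip it, negative = abort). A hedged sketch of a walk_page_vma() user under the new API; the anon_only_test and count_present_pmd callbacks are illustrative assumptions, not part of this patch, and the caller must hold mmap_sem as the kernel-doc in mm/pagewalk.c below requires:

#include <linux/mm.h>
#include <linux/pagewalk.h>

/* Only walk anonymous vmas: 0 = walk, 1 = skip, negative = abort the walk. */
static int anon_only_test(unsigned long start, unsigned long end,
			  struct mm_walk *walk)
{
	return vma_is_anonymous(walk->vma) ? 0 : 1;
}

/* Called for each non-empty pmd; must tolerate pmd_trans_huge() entries. */
static int count_present_pmd(pmd_t *pmd, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	unsigned long *count = walk->private;

	if (pmd_present(*pmd))
		(*count)++;
	return 0;	/* a nonzero return would stop the walk */
}

static const struct mm_walk_ops anon_pmd_walk_ops = {
	.pmd_entry = count_present_pmd,
	.test_walk = anon_only_test,
	/* pud_entry, pte_entry, pte_hole, hugetlb_entry left NULL: not called */
};

static unsigned long count_anon_pmds(struct vm_area_struct *vma)
{
	unsigned long count = 0;

	walk_page_vma(vma, &anon_pmd_walk_ops, &count);
	return count;
}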
diff --git a/mm/hmm.c b/mm/hmm.c
index 26916ff6c8df..902f5fa6bf93 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -852,6 +852,13 @@ void hmm_range_unregister(struct hmm_range *range)
 }
 EXPORT_SYMBOL(hmm_range_unregister);
 
+static const struct mm_walk_ops hmm_walk_ops = {
+	.pud_entry = hmm_vma_walk_pud,
+	.pmd_entry = hmm_vma_walk_pmd,
+	.pte_hole = hmm_vma_walk_hole,
+	.hugetlb_entry = hmm_vma_walk_hugetlb_entry,
+};
+
 /**
  * hmm_range_fault - try to fault some address in a virtual address range
  * @range: range being faulted
@@ -887,7 +894,6 @@ long hmm_range_fault(struct hmm_range *range, unsigned int flags)
 	struct hmm_vma_walk hmm_vma_walk;
 	struct hmm *hmm = range->hmm;
 	struct vm_area_struct *vma;
-	struct mm_walk mm_walk;
 	int ret;
 
 	lockdep_assert_held(&hmm->mmu_notifier.mm->mmap_sem);
@@ -916,21 +922,14 @@ long hmm_range_fault(struct hmm_range *range, unsigned int flags)
 		hmm_vma_walk.last = start;
 		hmm_vma_walk.flags = flags;
 		hmm_vma_walk.range = range;
-		mm_walk.private = &hmm_vma_walk;
 		end = min(range->end, vma->vm_end);
 
-		mm_walk.vma = vma;
-		mm_walk.mm = vma->vm_mm;
-		mm_walk.pte_entry = NULL;
-		mm_walk.test_walk = NULL;
-		mm_walk.hugetlb_entry = NULL;
-		mm_walk.pud_entry = hmm_vma_walk_pud;
-		mm_walk.pmd_entry = hmm_vma_walk_pmd;
-		mm_walk.pte_hole = hmm_vma_walk_hole;
-		mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry;
+		walk_page_range(vma->vm_mm, start, end, &hmm_walk_ops,
+				&hmm_vma_walk);
 
 		do {
-			ret = walk_page_range(start, end, &mm_walk);
+			ret = walk_page_range(vma->vm_mm, start, end,
+					&hmm_walk_ops, &hmm_vma_walk);
 			start = hmm_vma_walk.last;
 
 			/* Keep trying while the range is valid. */
diff --git a/mm/madvise.c b/mm/madvise.c
index 80a78bb16782..afe2b015ea58 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -226,19 +226,9 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
 	return 0;
 }
 
-static void force_swapin_readahead(struct vm_area_struct *vma,
-		unsigned long start, unsigned long end)
-{
-	struct mm_walk walk = {
-		.mm = vma->vm_mm,
-		.pmd_entry = swapin_walk_pmd_entry,
-		.private = vma,
-	};
-
-	walk_page_range(start, end, &walk);
-
-	lru_add_drain();	/* Push any new pages onto the LRU now */
-}
+static const struct mm_walk_ops swapin_walk_ops = {
+	.pmd_entry = swapin_walk_pmd_entry,
+};
 
 static void force_shm_swapin_readahead(struct vm_area_struct *vma,
 		unsigned long start, unsigned long end,
@@ -280,7 +270,8 @@ static long madvise_willneed(struct vm_area_struct *vma,
 	*prev = vma;
 #ifdef CONFIG_SWAP
 	if (!file) {
-		force_swapin_readahead(vma, start, end);
+		walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
+		lru_add_drain(); /* Push any new pages onto the LRU now */
 		return 0;
 	}
 
@@ -441,20 +432,9 @@ next:
 	return 0;
 }
 
-static void madvise_free_page_range(struct mmu_gather *tlb,
-			     struct vm_area_struct *vma,
-			     unsigned long addr, unsigned long end)
-{
-	struct mm_walk free_walk = {
-		.pmd_entry = madvise_free_pte_range,
-		.mm = vma->vm_mm,
-		.private = tlb,
-	};
-
-	tlb_start_vma(tlb, vma);
-	walk_page_range(addr, end, &free_walk);
-	tlb_end_vma(tlb, vma);
-}
+static const struct mm_walk_ops madvise_free_walk_ops = {
+	.pmd_entry = madvise_free_pte_range,
+};
 
 static int madvise_free_single_vma(struct vm_area_struct *vma,
 			unsigned long start_addr, unsigned long end_addr)
@@ -481,7 +461,10 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
 	update_hiwater_rss(mm);
 
 	mmu_notifier_invalidate_range_start(&range);
-	madvise_free_page_range(&tlb, vma, range.start, range.end);
+	tlb_start_vma(&tlb, vma);
+	walk_page_range(vma->vm_mm, range.start, range.end,
+			&madvise_free_walk_ops, &tlb);
+	tlb_end_vma(&tlb, vma);
 	mmu_notifier_invalidate_range_end(&range);
 	tlb_finish_mmu(&tlb, range.start, range.end);
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4c3af5d71ab1..9b2516a76be2 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5283,17 +5283,16 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
 	return 0;
 }
 
+static const struct mm_walk_ops precharge_walk_ops = {
+	.pmd_entry = mem_cgroup_count_precharge_pte_range,
+};
+
 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
 {
 	unsigned long precharge;
 
-	struct mm_walk mem_cgroup_count_precharge_walk = {
-		.pmd_entry = mem_cgroup_count_precharge_pte_range,
-		.mm = mm,
-	};
 	down_read(&mm->mmap_sem);
-	walk_page_range(0, mm->highest_vm_end,
-			&mem_cgroup_count_precharge_walk);
+	walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL);
 	up_read(&mm->mmap_sem);
 
 	precharge = mc.precharge;
@@ -5562,13 +5561,12 @@ put:			/* get_mctgt_type() gets the page */
 	return ret;
 }
 
+static const struct mm_walk_ops charge_walk_ops = {
+	.pmd_entry = mem_cgroup_move_charge_pte_range,
+};
+
 static void mem_cgroup_move_charge(void)
 {
-	struct mm_walk mem_cgroup_move_charge_walk = {
-		.pmd_entry = mem_cgroup_move_charge_pte_range,
-		.mm = mc.mm,
-	};
-
 	lru_add_drain_all();
 	/*
 	 * Signal lock_page_memcg() to take the memcg's move_lock
@@ -5594,7 +5592,8 @@ retry:
 	 * When we have consumed all precharges and failed in doing
 	 * additional charge, the page walk just aborts.
 	 */
-	walk_page_range(0, mc.mm->highest_vm_end, &mem_cgroup_move_charge_walk);
+	walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops,
+			NULL);
 
 	up_read(&mc.mm->mmap_sem);
 	atomic_dec(&mc.from->moving_account);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 3a96def1e796..f000771558d8 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -655,6 +655,12 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
 	return 1;
 }
 
+static const struct mm_walk_ops queue_pages_walk_ops = {
+	.hugetlb_entry = queue_pages_hugetlb,
+	.pmd_entry = queue_pages_pte_range,
+	.test_walk = queue_pages_test_walk,
+};
+
 /*
  * Walk through page tables and collect pages to be migrated.
  *
@@ -679,15 +685,8 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 		.nmask = nodes,
 		.prev = NULL,
 	};
-	struct mm_walk queue_pages_walk = {
-		.hugetlb_entry = queue_pages_hugetlb,
-		.pmd_entry = queue_pages_pte_range,
-		.test_walk = queue_pages_test_walk,
-		.mm = mm,
-		.private = &qp,
-	};
 
-	return walk_page_range(start, end, &queue_pages_walk);
+	return walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
 }
 
 /*
diff --git a/mm/migrate.c b/mm/migrate.c
index c9c73a35aca7..9f4ed4e985c1 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2320,6 +2320,11 @@ next:
 	return 0;
 }
 
+static const struct mm_walk_ops migrate_vma_walk_ops = {
+	.pmd_entry = migrate_vma_collect_pmd,
+	.pte_hole = migrate_vma_collect_hole,
+};
+
 /*
  * migrate_vma_collect() - collect pages over a range of virtual addresses
  * @migrate: migrate struct containing all migration information
@@ -2331,21 +2336,15 @@ next:
 static void migrate_vma_collect(struct migrate_vma *migrate)
 {
 	struct mmu_notifier_range range;
-	struct mm_walk mm_walk = {
-		.pmd_entry = migrate_vma_collect_pmd,
-		.pte_hole = migrate_vma_collect_hole,
-		.vma = migrate->vma,
-		.mm = migrate->vma->vm_mm,
-		.private = migrate,
-	};
 
-	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm_walk.mm,
-			migrate->start,
-			migrate->end);
+	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL,
+			migrate->vma->vm_mm, migrate->start, migrate->end);
 	mmu_notifier_invalidate_range_start(&range);
-	walk_page_range(migrate->start, migrate->end, &mm_walk);
-	mmu_notifier_invalidate_range_end(&range);
 
+	walk_page_range(migrate->vma->vm_mm, migrate->start, migrate->end,
+			&migrate_vma_walk_ops, migrate);
+
+	mmu_notifier_invalidate_range_end(&range);
 	migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT);
 }
 
diff --git a/mm/mincore.c b/mm/mincore.c
index 3b051b6ab3fe..f9a9dbe8cd33 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -193,6 +193,12 @@ static inline bool can_do_mincore(struct vm_area_struct *vma)
 		inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0;
 }
 
+static const struct mm_walk_ops mincore_walk_ops = {
+	.pmd_entry = mincore_pte_range,
+	.pte_hole = mincore_unmapped_range,
+	.hugetlb_entry = mincore_hugetlb,
+};
+
 /*
  * Do a chunk of "sys_mincore()". We've already checked
  * all the arguments, we hold the mmap semaphore: we should
@@ -203,12 +209,6 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v
 	struct vm_area_struct *vma;
 	unsigned long end;
 	int err;
-	struct mm_walk mincore_walk = {
-		.pmd_entry = mincore_pte_range,
-		.pte_hole = mincore_unmapped_range,
-		.hugetlb_entry = mincore_hugetlb,
-		.private = vec,
-	};
 
 	vma = find_vma(current->mm, addr);
 	if (!vma || addr < vma->vm_start)
@@ -219,8 +219,7 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v
 		memset(vec, 1, pages);
 		return pages;
 	}
-	mincore_walk.mm = vma->vm_mm;
-	err = walk_page_range(addr, end, &mincore_walk);
+	err = walk_page_range(vma->vm_mm, addr, end, &mincore_walk_ops, vec);
 	if (err < 0)
 		return err;
 	return (end - addr) >> PAGE_SHIFT;
diff --git a/mm/mprotect.c b/mm/mprotect.c
index cc73318dbc25..675e5d34a507 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -329,20 +329,11 @@ static int prot_none_test(unsigned long addr, unsigned long next,
 	return 0;
 }
 
-static int prot_none_walk(struct vm_area_struct *vma, unsigned long start,
-			   unsigned long end, unsigned long newflags)
-{
-	pgprot_t new_pgprot = vm_get_page_prot(newflags);
-	struct mm_walk prot_none_walk = {
-		.pte_entry = prot_none_pte_entry,
-		.hugetlb_entry = prot_none_hugetlb_entry,
-		.test_walk = prot_none_test,
-		.mm = current->mm,
-		.private = &new_pgprot,
-	};
-
-	return walk_page_range(start, end, &prot_none_walk);
-}
+static const struct mm_walk_ops prot_none_walk_ops = {
+	.pte_entry = prot_none_pte_entry,
+	.hugetlb_entry = prot_none_hugetlb_entry,
+	.test_walk = prot_none_test,
+};
 
 int
 mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
@@ -369,7 +360,10 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
 	if (arch_has_pfn_modify_check() &&
 	    (vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
 	    (newflags & (VM_READ|VM_WRITE|VM_EXEC)) == 0) {
-		error = prot_none_walk(vma, start, end, newflags);
+		pgprot_t new_pgprot = vm_get_page_prot(newflags);
+
+		error = walk_page_range(current->mm, start, end,
+				&prot_none_walk_ops, &new_pgprot);
 		if (error)
 			return error;
 	}
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 8a92a961a2ee..b8762b673a3d 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -9,10 +9,11 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 {
 	pte_t *pte;
 	int err = 0;
+	const struct mm_walk_ops *ops = walk->ops;
 
 	pte = pte_offset_map(pmd, addr);
 	for (;;) {
-		err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
+		err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
 		if (err)
 			break;
 		addr += PAGE_SIZE;
@@ -30,6 +31,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
 {
 	pmd_t *pmd;
 	unsigned long next;
+	const struct mm_walk_ops *ops = walk->ops;
 	int err = 0;
 
 	pmd = pmd_offset(pud, addr);
@@ -37,8 +39,8 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
 again:
 		next = pmd_addr_end(addr, end);
 		if (pmd_none(*pmd) || !walk->vma) {
-			if (walk->pte_hole)
-				err = walk->pte_hole(addr, next, walk);
+			if (ops->pte_hole)
+				err = ops->pte_hole(addr, next, walk);
 			if (err)
 				break;
 			continue;
@@ -47,8 +49,8 @@ again:
 		 * This implies that each ->pmd_entry() handler
 		 * needs to know about pmd_trans_huge() pmds
 		 */
-		if (walk->pmd_entry)
-			err = walk->pmd_entry(pmd, addr, next, walk);
+		if (ops->pmd_entry)
+			err = ops->pmd_entry(pmd, addr, next, walk);
 		if (err)
 			break;
 
@@ -56,7 +58,7 @@ again:
 		 * Check this here so we only break down trans_huge
 		 * pages when we _need_ to
 		 */
-		if (!walk->pte_entry)
+		if (!ops->pte_entry)
 			continue;
 
 		split_huge_pmd(walk->vma, pmd, addr);
@@ -75,6 +77,7 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
 {
 	pud_t *pud;
 	unsigned long next;
+	const struct mm_walk_ops *ops = walk->ops;
 	int err = 0;
 
 	pud = pud_offset(p4d, addr);
@@ -82,18 +85,18 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
 again:
 		next = pud_addr_end(addr, end);
 		if (pud_none(*pud) || !walk->vma) {
-			if (walk->pte_hole)
-				err = walk->pte_hole(addr, next, walk);
+			if (ops->pte_hole)
+				err = ops->pte_hole(addr, next, walk);
 			if (err)
 				break;
 			continue;
 		}
 
-		if (walk->pud_entry) {
+		if (ops->pud_entry) {
 			spinlock_t *ptl = pud_trans_huge_lock(pud, walk->vma);
 
 			if (ptl) {
-				err = walk->pud_entry(pud, addr, next, walk);
+				err = ops->pud_entry(pud, addr, next, walk);
 				spin_unlock(ptl);
 				if (err)
 					break;
@@ -105,7 +108,7 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
 		if (pud_none(*pud))
 			goto again;
 
-		if (walk->pmd_entry || walk->pte_entry)
+		if (ops->pmd_entry || ops->pte_entry)
 			err = walk_pmd_range(pud, addr, next, walk);
 		if (err)
 			break;
@@ -119,19 +122,20 @@ static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
 {
 	p4d_t *p4d;
 	unsigned long next;
+	const struct mm_walk_ops *ops = walk->ops;
 	int err = 0;
 
 	p4d = p4d_offset(pgd, addr);
 	do {
 		next = p4d_addr_end(addr, end);
 		if (p4d_none_or_clear_bad(p4d)) {
-			if (walk->pte_hole)
-				err = walk->pte_hole(addr, next, walk);
+			if (ops->pte_hole)
+				err = ops->pte_hole(addr, next, walk);
 			if (err)
 				break;
 			continue;
 		}
-		if (walk->pmd_entry || walk->pte_entry)
+		if (ops->pmd_entry || ops->pte_entry)
 			err = walk_pud_range(p4d, addr, next, walk);
 		if (err)
 			break;
@@ -145,19 +149,20 @@ static int walk_pgd_range(unsigned long addr, unsigned long end,
 {
 	pgd_t *pgd;
 	unsigned long next;
+	const struct mm_walk_ops *ops = walk->ops;
 	int err = 0;
 
 	pgd = pgd_offset(walk->mm, addr);
 	do {
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd)) {
-			if (walk->pte_hole)
-				err = walk->pte_hole(addr, next, walk);
+			if (ops->pte_hole)
+				err = ops->pte_hole(addr, next, walk);
 			if (err)
 				break;
 			continue;
 		}
-		if (walk->pmd_entry || walk->pte_entry)
+		if (ops->pmd_entry || ops->pte_entry)
 			err = walk_p4d_range(pgd, addr, next, walk);
 		if (err)
 			break;
@@ -183,6 +188,7 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end,
 	unsigned long hmask = huge_page_mask(h);
 	unsigned long sz = huge_page_size(h);
 	pte_t *pte;
+	const struct mm_walk_ops *ops = walk->ops;
 	int err = 0;
 
 	do {
@@ -190,9 +196,9 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end,
 		pte = huge_pte_offset(walk->mm, addr & hmask, sz);
 
 		if (pte)
-			err = walk->hugetlb_entry(pte, hmask, addr, next, walk);
-		else if (walk->pte_hole)
-			err = walk->pte_hole(addr, next, walk);
+			err = ops->hugetlb_entry(pte, hmask, addr, next, walk);
+		else if (ops->pte_hole)
+			err = ops->pte_hole(addr, next, walk);
 
 		if (err)
 			break;
@@ -220,9 +226,10 @@ static int walk_page_test(unsigned long start, unsigned long end,
 			struct mm_walk *walk)
 {
 	struct vm_area_struct *vma = walk->vma;
+	const struct mm_walk_ops *ops = walk->ops;
 
-	if (walk->test_walk)
-		return walk->test_walk(start, end, walk);
+	if (ops->test_walk)
+		return ops->test_walk(start, end, walk);
 
 	/*
 	 * vma(VM_PFNMAP) doesn't have any valid struct pages behind VM_PFNMAP
@@ -234,8 +241,8 @@ static int walk_page_test(unsigned long start, unsigned long end,
 	 */
 	if (vma->vm_flags & VM_PFNMAP) {
 		int err = 1;
-		if (walk->pte_hole)
-			err = walk->pte_hole(start, end, walk);
+		if (ops->pte_hole)
+			err = ops->pte_hole(start, end, walk);
 		return err ? err : 1;
 	}
 	return 0;
@@ -248,7 +255,7 @@ static int __walk_page_range(unsigned long start, unsigned long end,
 	struct vm_area_struct *vma = walk->vma;
 
 	if (vma && is_vm_hugetlb_page(vma)) {
-		if (walk->hugetlb_entry)
+		if (walk->ops->hugetlb_entry)
 			err = walk_hugetlb_range(start, end, walk);
 	} else
 		err = walk_pgd_range(start, end, walk);
@@ -258,11 +265,13 @@ static int __walk_page_range(unsigned long start, unsigned long end,
 
 /**
  * walk_page_range - walk page table with caller specific callbacks
- * @start: start address of the virtual address range
- * @end: end address of the virtual address range
- * @walk: mm_walk structure defining the callbacks and the target address space
+ * @mm:		mm_struct representing the target process of page table walk
+ * @start:	start address of the virtual address range
+ * @end:	end address of the virtual address range
+ * @ops:	operation to call during the walk
+ * @private:	private data for callbacks' usage
  *
- * Recursively walk the page table tree of the process represented by @walk->mm
+ * Recursively walk the page table tree of the process represented by @mm
 * within the virtual address range [@start, @end). During walking, we can do
 * some caller-specific works for each entry, by setting up pmd_entry(),
 * pte_entry(), and/or hugetlb_entry(). If you don't set up for some of these
@@ -278,47 +287,52 @@ static int __walk_page_range(unsigned long start, unsigned long end,
 *
 * Before starting to walk page table, some callers want to check whether
 * they really want to walk over the current vma, typically by checking
- * its vm_flags. walk_page_test() and @walk->test_walk() are used for this
+ * its vm_flags. walk_page_test() and @ops->test_walk() are used for this
 * purpose.
 *
 * struct mm_walk keeps current values of some common data like vma and pmd,
 * which are useful for the access from callbacks. If you want to pass some
- * caller-specific data to callbacks, @walk->private should be helpful.
+ * caller-specific data to callbacks, @private should be helpful.
 *
 * Locking:
- *   Callers of walk_page_range() and walk_page_vma() should hold
- *   @walk->mm->mmap_sem, because these function traverse vma list and/or
- *   access to vma's data.
+ *   Callers of walk_page_range() and walk_page_vma() should hold @mm->mmap_sem,
+ *   because these function traverse vma list and/or access to vma's data.
 */
-int walk_page_range(unsigned long start, unsigned long end,
-		struct mm_walk *walk)
+int walk_page_range(struct mm_struct *mm, unsigned long start,
+		unsigned long end, const struct mm_walk_ops *ops,
+		void *private)
 {
 	int err = 0;
 	unsigned long next;
 	struct vm_area_struct *vma;
+	struct mm_walk walk = {
+		.ops = ops,
+		.mm = mm,
+		.private = private,
+	};
 
 	if (start >= end)
 		return -EINVAL;
 
-	if (!walk->mm)
+	if (!walk.mm)
 		return -EINVAL;
 
-	VM_BUG_ON_MM(!rwsem_is_locked(&walk->mm->mmap_sem), walk->mm);
+	VM_BUG_ON_MM(!rwsem_is_locked(&walk.mm->mmap_sem), walk.mm);
 
-	vma = find_vma(walk->mm, start);
+	vma = find_vma(walk.mm, start);
 	do {
 		if (!vma) { /* after the last vma */
-			walk->vma = NULL;
+			walk.vma = NULL;
 			next = end;
 		} else if (start < vma->vm_start) { /* outside vma */
-			walk->vma = NULL;
+			walk.vma = NULL;
 			next = min(end, vma->vm_start);
 		} else { /* inside vma */
-			walk->vma = vma;
+			walk.vma = vma;
 			next = min(end, vma->vm_end);
 			vma = vma->vm_next;
 
-			err = walk_page_test(start, next, walk);
+			err = walk_page_test(start, next, &walk);
 			if (err > 0) {
 				/*
 				 * positive return values are purely for
@@ -331,28 +345,34 @@ int walk_page_range(unsigned long start, unsigned long end,
 			if (err < 0)
 				break;
 		}
-		if (walk->vma || walk->pte_hole)
-			err = __walk_page_range(start, next, walk);
+		if (walk.vma || walk.ops->pte_hole)
+			err = __walk_page_range(start, next, &walk);
 		if (err)
 			break;
 	} while (start = next, start < end);
 	return err;
 }
 
-int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk)
+int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
+		void *private)
 {
+	struct mm_walk walk = {
+		.ops = ops,
+		.mm = vma->vm_mm,
+		.vma = vma,
+		.private = private,
+	};
 	int err;
 
-	if (!walk->mm)
+	if (!walk.mm)
 		return -EINVAL;
 
-	VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem));
-	VM_BUG_ON(!vma);
-	walk->vma = vma;
-	err = walk_page_test(vma->vm_start, vma->vm_end, walk);
+	VM_BUG_ON(!rwsem_is_locked(&vma->vm_mm->mmap_sem));
+
+	err = walk_page_test(vma->vm_start, vma->vm_end, &walk);
 	if (err > 0)
 		return 0;
 	if (err < 0)
 		return err;
-	return __walk_page_range(vma->vm_start, vma->vm_end, walk);
+	return __walk_page_range(vma->vm_start, vma->vm_end, &walk);
 }