author	Hugh Dickins <hugh.dickins@tiscali.co.uk>	2009-12-14 20:59:31 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2009-12-15 11:53:20 -0500
commit	e9995ef978a7d5296fe04a9a2c5ca6e66d8bb4e5 (patch)
tree	df4324273856e06b8277b7e4a0fa9289eb8e6385
parent	407f9c8b0889ced1dbe2f9157e4e60c61329d5c9 (diff)
ksm: rmap_walk to remove_migration_ptes
A side-effect of making ksm pages swappable is that they have to be placed
on the LRUs: which then exposes them to isolate_lru_page() and hence to
page migration.

Add rmap_walk() for remove_migration_ptes() to use: rmap_walk_anon() and
rmap_walk_file() in rmap.c, but rmap_walk_ksm() in ksm.c.  Perhaps some
consolidation with existing code is possible, but don't attempt that yet
(try_to_unmap needs to handle nonlinears, but migration pte removal does
not).

rmap_walk() is sadly less general than it appears: rmap_walk_anon(), like
remove_anon_migration_ptes() which it replaces, avoids calling
page_lock_anon_vma(), because that includes a page_mapped() test which
fails when all migration ptes are in place.  That was valid when NUMA page
migration was introduced (holding mmap_sem provided the missing guarantee
that anon_vma's slab had not already been destroyed), but I believe not
valid in the memory hotremove case added since.

For now do the same as before, and consider the best way to fix that
unlikely race later on.  When fixed, we can probably use rmap_walk() on
hwpoisoned ksm pages too: for now, they remain among hwpoison's various
exceptions (its PageKsm test comes before the page is locked, but its
page_lock_anon_vma fails safely if an anon gets upgraded).

Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: Izik Eidus <ieidus@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Chris Wright <chrisw@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
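[Editorial illustration, not part of the commit] To make the shape of the new interface easier to see before reading the diff, here is a minimal, self-contained userspace sketch of the dispatch rmap_walk() performs. All types, flags and stub walkers below are invented stand-ins for illustration only; the real walkers and the real struct page live in the diff that follows.

/*
 * Toy model of rmap_walk()'s dispatch (illustration only).
 * The real code is in mm/rmap.c, mm/ksm.c and mm/migrate.c below.
 */
#include <stdio.h>

#define SWAP_AGAIN 1	/* a callback returns this to keep the walk going */

struct page { int is_ksm; int is_anon; };	/* stand-in page flags */
struct vm_area_struct;				/* opaque in this sketch */

typedef int (*rmap_one_t)(struct page *, struct vm_area_struct *,
			  unsigned long, void *);

/* Stubs: the real walkers visit every mapping and call rmap_one on each. */
static int rmap_walk_ksm(struct page *p, rmap_one_t one, void *arg)
{ (void)p; (void)one; (void)arg; puts("walk ksm stable node"); return SWAP_AGAIN; }
static int rmap_walk_anon(struct page *p, rmap_one_t one, void *arg)
{ (void)p; (void)one; (void)arg; puts("walk anon_vma list"); return SWAP_AGAIN; }
static int rmap_walk_file(struct page *p, rmap_one_t one, void *arg)
{ (void)p; (void)one; (void)arg; puts("walk file prio tree"); return SWAP_AGAIN; }

/* KSM pages are tested first, mirroring the unlikely(PageKsm) check the
 * patch adds ahead of the PageAnon test. */
int rmap_walk(struct page *page, rmap_one_t rmap_one, void *arg)
{
	if (page->is_ksm)
		return rmap_walk_ksm(page, rmap_one, arg);
	else if (page->is_anon)
		return rmap_walk_anon(page, rmap_one, arg);
	else
		return rmap_walk_file(page, rmap_one, arg);
}

int main(void)
{
	struct page ksm_page = { .is_ksm = 1, .is_anon = 1 };
	struct page file_page = { 0 };

	rmap_walk(&ksm_page, NULL, NULL);	/* -> ksm walker */
	rmap_walk(&file_page, NULL, NULL);	/* -> file walker */
	return 0;
}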
-rw-r--r--	include/linux/ksm.h	13
-rw-r--r--	include/linux/rmap.h	6
-rw-r--r--	mm/ksm.c	65
-rw-r--r--	mm/migrate.c	85
-rw-r--r--	mm/rmap.c	79
5 files changed, 181 insertions(+), 67 deletions(-)
diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index 157d83dbaef8..bed5f16ba827 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -88,6 +88,9 @@ static inline struct page *ksm_might_need_to_copy(struct page *page,
 int page_referenced_ksm(struct page *page,
 			struct mem_cgroup *memcg, unsigned long *vm_flags);
 int try_to_unmap_ksm(struct page *page, enum ttu_flags flags);
+int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *,
+		struct vm_area_struct *, unsigned long, void *), void *arg);
+void ksm_migrate_page(struct page *newpage, struct page *oldpage);
 
 #else  /* !CONFIG_KSM */
 
@@ -127,6 +130,16 @@ static inline int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
 {
 	return 0;
 }
+
+static inline int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page*,
+		struct vm_area_struct *, unsigned long, void *), void *arg)
+{
+	return 0;
+}
+
+static inline void ksm_migrate_page(struct page *newpage, struct page *oldpage)
+{
+}
 #endif /* !CONFIG_KSM */
 
 #endif /* __LINUX_KSM_H */
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 980094a527ee..b019ae64e2ab 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -164,6 +164,12 @@ struct anon_vma *page_lock_anon_vma(struct page *page);
 void page_unlock_anon_vma(struct anon_vma *anon_vma);
 int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);
 
+/*
+ * Called by migrate.c to remove migration ptes, but might be used more later.
+ */
+int rmap_walk(struct page *page, int (*rmap_one)(struct page *,
+		struct vm_area_struct *, unsigned long, void *), void *arg);
+
 #else	/* !CONFIG_MMU */
 
 #define anon_vma_init() do {} while (0)
diff --git a/mm/ksm.c b/mm/ksm.c
index 20f46a7b2799..dfdc292d3626 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1656,6 +1656,71 @@ out:
 	return ret;
 }
 
+#ifdef CONFIG_MIGRATION
+int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *,
+		struct vm_area_struct *, unsigned long, void *), void *arg)
+{
+	struct stable_node *stable_node;
+	struct hlist_node *hlist;
+	struct rmap_item *rmap_item;
+	int ret = SWAP_AGAIN;
+	int search_new_forks = 0;
+
+	VM_BUG_ON(!PageKsm(page));
+	VM_BUG_ON(!PageLocked(page));
+
+	stable_node = page_stable_node(page);
+	if (!stable_node)
+		return ret;
+again:
+	hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
+		struct anon_vma *anon_vma = rmap_item->anon_vma;
+		struct vm_area_struct *vma;
+
+		spin_lock(&anon_vma->lock);
+		list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
+			if (rmap_item->address < vma->vm_start ||
+			    rmap_item->address >= vma->vm_end)
+				continue;
+			/*
+			 * Initially we examine only the vma which covers this
+			 * rmap_item; but later, if there is still work to do,
+			 * we examine covering vmas in other mms: in case they
+			 * were forked from the original since ksmd passed.
+			 */
+			if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
+				continue;
+
+			ret = rmap_one(page, vma, rmap_item->address, arg);
+			if (ret != SWAP_AGAIN) {
+				spin_unlock(&anon_vma->lock);
+				goto out;
+			}
+		}
+		spin_unlock(&anon_vma->lock);
+	}
+	if (!search_new_forks++)
+		goto again;
+out:
+	return ret;
+}
+
+void ksm_migrate_page(struct page *newpage, struct page *oldpage)
+{
+	struct stable_node *stable_node;
+
+	VM_BUG_ON(!PageLocked(oldpage));
+	VM_BUG_ON(!PageLocked(newpage));
+	VM_BUG_ON(newpage->mapping != oldpage->mapping);
+
+	stable_node = page_stable_node(newpage);
+	if (stable_node) {
+		VM_BUG_ON(stable_node->page != oldpage);
+		stable_node->page = newpage;
+	}
+}
+#endif /* CONFIG_MIGRATION */
+
 #ifdef CONFIG_SYSFS
 /*
  * This all compiles without CONFIG_SYSFS, but is a waste of space.
diff --git a/mm/migrate.c b/mm/migrate.c
index 367272d04423..0b714747c028 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -21,6 +21,7 @@
 #include <linux/mm_inline.h>
 #include <linux/nsproxy.h>
 #include <linux/pagevec.h>
+#include <linux/ksm.h>
 #include <linux/rmap.h>
 #include <linux/topology.h>
 #include <linux/cpu.h>
@@ -78,8 +79,8 @@ int putback_lru_pages(struct list_head *l)
 /*
  * Restore a potential migration pte to a working pte entry
  */
-static void remove_migration_pte(struct vm_area_struct *vma,
-		struct page *old, struct page *new)
+static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
+		unsigned long addr, void *old)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	swp_entry_t entry;
@@ -88,40 +89,37 @@ static void remove_migration_pte(struct vm_area_struct *vma,
 	pmd_t *pmd;
 	pte_t *ptep, pte;
 	spinlock_t *ptl;
-	unsigned long addr = page_address_in_vma(new, vma);
-
-	if (addr == -EFAULT)
-		return;
 
 	pgd = pgd_offset(mm, addr);
 	if (!pgd_present(*pgd))
-		return;
+		goto out;
 
 	pud = pud_offset(pgd, addr);
 	if (!pud_present(*pud))
-		return;
+		goto out;
 
 	pmd = pmd_offset(pud, addr);
 	if (!pmd_present(*pmd))
-		return;
+		goto out;
 
 	ptep = pte_offset_map(pmd, addr);
 
 	if (!is_swap_pte(*ptep)) {
 		pte_unmap(ptep);
-		return;
+		goto out;
 	}
 
 	ptl = pte_lockptr(mm, pmd);
 	spin_lock(ptl);
 	pte = *ptep;
 	if (!is_swap_pte(pte))
-		goto out;
+		goto unlock;
 
 	entry = pte_to_swp_entry(pte);
 
-	if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old)
-		goto out;
+	if (!is_migration_entry(entry) ||
+	    migration_entry_to_page(entry) != old)
+		goto unlock;
 
 	get_page(new);
 	pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
@@ -137,55 +135,10 @@ static void remove_migration_pte(struct vm_area_struct *vma,
 
 	/* No need to invalidate - it was non-present before */
 	update_mmu_cache(vma, addr, pte);
-
-out:
+unlock:
 	pte_unmap_unlock(ptep, ptl);
-}
-
-/*
- * Note that remove_file_migration_ptes will only work on regular mappings,
- * Nonlinear mappings do not use migration entries.
- */
-static void remove_file_migration_ptes(struct page *old, struct page *new)
-{
-	struct vm_area_struct *vma;
-	struct address_space *mapping = new->mapping;
-	struct prio_tree_iter iter;
-	pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
-
-	if (!mapping)
-		return;
-
-	spin_lock(&mapping->i_mmap_lock);
-
-	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff)
-		remove_migration_pte(vma, old, new);
-
-	spin_unlock(&mapping->i_mmap_lock);
-}
-
-/*
- * Must hold mmap_sem lock on at least one of the vmas containing
- * the page so that the anon_vma cannot vanish.
- */
-static void remove_anon_migration_ptes(struct page *old, struct page *new)
-{
-	struct anon_vma *anon_vma;
-	struct vm_area_struct *vma;
-
-	/*
-	 * We hold the mmap_sem lock. So no need to call page_lock_anon_vma.
-	 */
-	anon_vma = page_anon_vma(new);
-	if (!anon_vma)
-		return;
-
-	spin_lock(&anon_vma->lock);
-
-	list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
-		remove_migration_pte(vma, old, new);
-
-	spin_unlock(&anon_vma->lock);
+out:
+	return SWAP_AGAIN;
 }
 
 /*
@@ -194,10 +147,7 @@ static void remove_anon_migration_ptes(struct page *old, struct page *new)
  */
 static void remove_migration_ptes(struct page *old, struct page *new)
 {
-	if (PageAnon(new))
-		remove_anon_migration_ptes(old, new);
-	else
-		remove_file_migration_ptes(old, new);
+	rmap_walk(new, remove_migration_pte, old);
 }
 
 /*
@@ -358,6 +308,7 @@ static void migrate_page_copy(struct page *newpage, struct page *page)
 	}
 
 	mlock_migrate_page(newpage, page);
+	ksm_migrate_page(newpage, page);
 
 	ClearPageSwapCache(page);
 	ClearPagePrivate(page);
@@ -577,9 +528,9 @@ static int move_to_new_page(struct page *newpage, struct page *page)
 	else
 		rc = fallback_migrate_page(mapping, newpage, page);
 
-	if (!rc) {
+	if (!rc)
 		remove_migration_ptes(page, newpage);
-	} else
+	else
 		newpage->mapping = NULL;
 
 	unlock_page(newpage);
diff --git a/mm/rmap.c b/mm/rmap.c
index 2e38e9048327..c81bedd7d527 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1203,3 +1203,82 @@ int try_to_munlock(struct page *page)
 	else
 		return try_to_unmap_file(page, TTU_MUNLOCK);
 }
+
+#ifdef CONFIG_MIGRATION
+/*
+ * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file():
+ * Called by migrate.c to remove migration ptes, but might be used more later.
+ */
+static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
+		struct vm_area_struct *, unsigned long, void *), void *arg)
+{
+	struct anon_vma *anon_vma;
+	struct vm_area_struct *vma;
+	int ret = SWAP_AGAIN;
+
+	/*
+	 * Note: remove_migration_ptes() cannot use page_lock_anon_vma()
+	 * because that depends on page_mapped(); but not all its usages
+	 * are holding mmap_sem, which also gave the necessary guarantee
+	 * (that this anon_vma's slab has not already been destroyed).
+	 * This needs to be reviewed later: avoiding page_lock_anon_vma()
+	 * is risky, and currently limits the usefulness of rmap_walk().
+	 */
+	anon_vma = page_anon_vma(page);
+	if (!anon_vma)
+		return ret;
+	spin_lock(&anon_vma->lock);
+	list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
+		unsigned long address = vma_address(page, vma);
+		if (address == -EFAULT)
+			continue;
+		ret = rmap_one(page, vma, address, arg);
+		if (ret != SWAP_AGAIN)
+			break;
+	}
+	spin_unlock(&anon_vma->lock);
+	return ret;
+}
+
+static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *,
+		struct vm_area_struct *, unsigned long, void *), void *arg)
+{
+	struct address_space *mapping = page->mapping;
+	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+	struct vm_area_struct *vma;
+	struct prio_tree_iter iter;
+	int ret = SWAP_AGAIN;
+
+	if (!mapping)
+		return ret;
+	spin_lock(&mapping->i_mmap_lock);
+	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+		unsigned long address = vma_address(page, vma);
+		if (address == -EFAULT)
+			continue;
+		ret = rmap_one(page, vma, address, arg);
+		if (ret != SWAP_AGAIN)
+			break;
+	}
+	/*
+	 * No nonlinear handling: being always shared, nonlinear vmas
+	 * never contain migration ptes.  Decide what to do about this
+	 * limitation to linear when we need rmap_walk() on nonlinear.
+	 */
+	spin_unlock(&mapping->i_mmap_lock);
+	return ret;
+}
+
+int rmap_walk(struct page *page, int (*rmap_one)(struct page *,
+		struct vm_area_struct *, unsigned long, void *), void *arg)
+{
+	VM_BUG_ON(!PageLocked(page));
+
+	if (unlikely(PageKsm(page)))
+		return rmap_walk_ksm(page, rmap_one, arg);
+	else if (PageAnon(page))
+		return rmap_walk_anon(page, rmap_one, arg);
+	else
+		return rmap_walk_file(page, rmap_one, arg);
+}
+#endif /* CONFIG_MIGRATION */
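
[Editorial illustration, not part of the commit] The contract the three walkers share can be modelled in a few lines of ordinary C: the walker hands every mapping it finds to the caller's rmap_one() callback and stops early only if the callback returns something other than SWAP_AGAIN. Since remove_migration_pte() in the patch always returns SWAP_AGAIN, every migration pte gets visited. The structures below are invented stand-ins; in the kernel the mappings come from an anon_vma list, a file prio tree or a KSM stable node, as shown in the diff above.

/* Toy model of the rmap_one()/SWAP_AGAIN walk contract (illustration only). */
#include <stdio.h>

#define SWAP_AGAIN 1

struct vm_area_struct { unsigned long vm_start, vm_end; };
struct page { unsigned long addr; };	/* toy: one address per mapping */

/* Hand each covering "vma" to the callback; stop if it asks us to. */
static int walk(struct page *page, struct vm_area_struct *vmas, int n,
		int (*rmap_one)(struct page *, struct vm_area_struct *,
				unsigned long, void *), void *arg)
{
	int ret = SWAP_AGAIN;

	for (int i = 0; i < n; i++) {
		if (page->addr < vmas[i].vm_start || page->addr >= vmas[i].vm_end)
			continue;	/* page not mapped in this vma */
		ret = rmap_one(page, &vmas[i], page->addr, arg);
		if (ret != SWAP_AGAIN)
			break;		/* callback asked the walker to stop */
	}
	return ret;
}

/* A remove_migration_pte()-style callback: do the work, keep walking. */
static int restore_pte(struct page *page, struct vm_area_struct *vma,
		       unsigned long addr, void *old)
{
	(void)page;
	printf("restore pte (old token %p) at %#lx in [%#lx, %#lx)\n",
	       old, addr, vma->vm_start, vma->vm_end);
	return SWAP_AGAIN;
}

int main(void)
{
	struct vm_area_struct vmas[] = { { 0x1000, 0x3000 }, { 0x4000, 0x8000 } };
	struct page newpage = { .addr = 0x5000 };
	int old_token = 42;	/* stands in for the old page pointer */

	walk(&newpage, vmas, 2, restore_pte, &old_token);
	return 0;
}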