Diffstat (limited to 'mm/rmap.c')
-rw-r--r--  mm/rmap.c  203
1 files changed, 177 insertions, 26 deletions
diff --git a/mm/rmap.c b/mm/rmap.c
index 38a336e2eea1..f6f0d2dda2ea 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -56,6 +56,7 @@
 #include <linux/memcontrol.h>
 #include <linux/mmu_notifier.h>
 #include <linux/migrate.h>
+#include <linux/hugetlb.h>
 
 #include <asm/tlbflush.h>
 
@@ -132,9 +133,14 @@ int anon_vma_prepare(struct vm_area_struct *vma)
 			if (unlikely(!anon_vma))
 				goto out_enomem_free_avc;
 			allocated = anon_vma;
+			/*
+			 * This VMA had no anon_vma yet.  This anon_vma is
+			 * the root of any anon_vma tree that might form.
+			 */
+			anon_vma->root = anon_vma;
 		}
 
-		spin_lock(&anon_vma->lock);
+		anon_vma_lock(anon_vma);
 		/* page_table_lock to protect against threads */
 		spin_lock(&mm->page_table_lock);
 		if (likely(!vma->anon_vma)) {
@@ -142,12 +148,12 @@ int anon_vma_prepare(struct vm_area_struct *vma)
 			avc->anon_vma = anon_vma;
 			avc->vma = vma;
 			list_add(&avc->same_vma, &vma->anon_vma_chain);
-			list_add(&avc->same_anon_vma, &anon_vma->head);
+			list_add_tail(&avc->same_anon_vma, &anon_vma->head);
 			allocated = NULL;
 			avc = NULL;
 		}
 		spin_unlock(&mm->page_table_lock);
-		spin_unlock(&anon_vma->lock);
+		anon_vma_unlock(anon_vma);
 
 		if (unlikely(allocated))
 			anon_vma_free(allocated);
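Note: anon_vma_lock()/anon_vma_unlock() are introduced by the companion include/linux/rmap.h change, not in this file. Assuming that header, they are expected to reduce to taking the root anon_vma's spinlock, so every anon_vma in a tree serializes on a single lock; roughly:

	static inline void anon_vma_lock(struct anon_vma *anon_vma)
	{
		spin_lock(&anon_vma->root->lock);
	}

	static inline void anon_vma_unlock(struct anon_vma *anon_vma)
	{
		spin_unlock(&anon_vma->root->lock);
	}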
@@ -170,9 +176,9 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
 	avc->anon_vma = anon_vma;
 	list_add(&avc->same_vma, &vma->anon_vma_chain);
 
-	spin_lock(&anon_vma->lock);
+	anon_vma_lock(anon_vma);
 	list_add_tail(&avc->same_anon_vma, &anon_vma->head);
-	spin_unlock(&anon_vma->lock);
+	anon_vma_unlock(anon_vma);
 }
 
 /*
@@ -224,9 +230,21 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
 	avc = anon_vma_chain_alloc();
 	if (!avc)
 		goto out_error_free_anon_vma;
-	anon_vma_chain_link(vma, avc, anon_vma);
+
+	/*
+	 * The root anon_vma's spinlock is the lock actually used when we
+	 * lock any of the anon_vmas in this anon_vma tree.
+	 */
+	anon_vma->root = pvma->anon_vma->root;
+	/*
+	 * With KSM refcounts, an anon_vma can stay around longer than the
+	 * process it belongs to.  The root anon_vma needs to be pinned
+	 * until this anon_vma is freed, because the lock lives in the root.
+	 */
+	get_anon_vma(anon_vma->root);
 	/* Mark this anon_vma as the one where our new (COWed) pages go. */
 	vma->anon_vma = anon_vma;
+	anon_vma_chain_link(vma, avc, anon_vma);
 
 	return 0;
 
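Similarly, get_anon_vma() comes from the header side of this series; given the external_refcount already tested via anonvma_external_refcount() below, it is presumably just an atomic increment of that counter, paired with the drop_anon_vma() added later in this patch:

	static inline void get_anon_vma(struct anon_vma *anon_vma)
	{
		atomic_inc(&anon_vma->external_refcount);
	}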
@@ -246,22 +264,29 @@ static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain)
 	if (!anon_vma)
 		return;
 
-	spin_lock(&anon_vma->lock);
+	anon_vma_lock(anon_vma);
 	list_del(&anon_vma_chain->same_anon_vma);
 
 	/* We must garbage collect the anon_vma if it's empty */
 	empty = list_empty(&anon_vma->head) && !anonvma_external_refcount(anon_vma);
-	spin_unlock(&anon_vma->lock);
+	anon_vma_unlock(anon_vma);
 
-	if (empty)
+	if (empty) {
+		/* We no longer need the root anon_vma */
+		if (anon_vma->root != anon_vma)
+			drop_anon_vma(anon_vma->root);
 		anon_vma_free(anon_vma);
+	}
 }
 
 void unlink_anon_vmas(struct vm_area_struct *vma)
 {
 	struct anon_vma_chain *avc, *next;
 
-	/* Unlink each anon_vma chained to the VMA. */
+	/*
+	 * Unlink each anon_vma chained to the VMA.  This list is ordered
+	 * from newest to oldest, ensuring the root anon_vma gets freed last.
+	 */
 	list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
 		anon_vma_unlink(avc);
 		list_del(&avc->same_vma);
@@ -291,7 +316,7 @@ void __init anon_vma_init(void)
  */
 struct anon_vma *page_lock_anon_vma(struct page *page)
 {
-	struct anon_vma *anon_vma;
+	struct anon_vma *anon_vma, *root_anon_vma;
 	unsigned long anon_mapping;
 
 	rcu_read_lock();
@@ -302,8 +327,21 @@ struct anon_vma *page_lock_anon_vma(struct page *page)
 		goto out;
 
 	anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
-	spin_lock(&anon_vma->lock);
-	return anon_vma;
+	root_anon_vma = ACCESS_ONCE(anon_vma->root);
+	spin_lock(&root_anon_vma->lock);
+
+	/*
+	 * If this page is still mapped, then its anon_vma cannot have been
+	 * freed.  But if it has been unmapped, we have no security against
+	 * the anon_vma structure being freed and reused (for another anon_vma:
+	 * SLAB_DESTROY_BY_RCU guarantees that - so the spin_lock above cannot
+	 * corrupt): with anon_vma_prepare() or anon_vma_fork() redirecting
+	 * anon_vma->root before page_unlock_anon_vma() is called to unlock.
+	 */
+	if (page_mapped(page))
+		return anon_vma;
+
+	spin_unlock(&root_anon_vma->lock);
 out:
 	rcu_read_unlock();
 	return NULL;
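For context, callers keep using the returned anon_vma under the (root) lock and release it with page_unlock_anon_vma(); a minimal usage sketch following the existing walkers in this file:

	struct anon_vma *anon_vma;
	struct anon_vma_chain *avc;

	anon_vma = page_lock_anon_vma(page);
	if (!anon_vma)
		return SWAP_AGAIN;
	list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
		/* ... process avc->vma while the root lock is held ... */
	}
	page_unlock_anon_vma(anon_vma);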
@@ -311,7 +349,7 @@ out:
 
 void page_unlock_anon_vma(struct anon_vma *anon_vma)
 {
-	spin_unlock(&anon_vma->lock);
+	anon_vma_unlock(anon_vma);
 	rcu_read_unlock();
 }
 
@@ -326,6 +364,8 @@ vma_address(struct page *page, struct vm_area_struct *vma)
 	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 	unsigned long address;
 
+	if (unlikely(is_vm_hugetlb_page(vma)))
+		pgoff = page->index << huge_page_order(page_hstate(page));
 	address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
 	if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
 		/* page should be within @vma mapping range */
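The hugetlb branch converts page->index, which is maintained in huge-page-sized units for hugetlb pages, into the small-page units that vma->vm_pgoff uses. For example, assuming x86-64 2MB hugepages (order 9), an index of 3 becomes pgoff 3 << 9 = 1536, i.e. a byte offset of 1536 << PAGE_SHIFT = 6MB into the mapping, before the usual address calculation runs.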
@@ -340,9 +380,10 @@ vma_address(struct page *page, struct vm_area_struct *vma)
  */
 unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
 {
-	if (PageAnon(page))
-		;
-	else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
+	if (PageAnon(page)) {
+		if (vma->anon_vma->root != page_anon_vma(page)->root)
+			return -EFAULT;
+	} else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
 		if (!vma->vm_file ||
 		    vma->vm_file->f_mapping != page->mapping)
 			return -EFAULT;
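Callers treat the -EFAULT return as "this page cannot be mapped at any address in this vma"; the new check extends that to anonymous pages whose anon_vma tree does not include vma->anon_vma. The calling pattern stays the usual one:

	address = page_address_in_vma(page, vma);
	if (address == -EFAULT)
		continue;	/* not mappable in this vma */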
@@ -369,6 +410,12 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
 	pte_t *pte;
 	spinlock_t *ptl;
 
+	if (unlikely(PageHuge(page))) {
+		pte = huge_pte_offset(mm, address);
+		ptl = &mm->page_table_lock;
+		goto check;
+	}
+
 	pgd = pgd_offset(mm, address);
 	if (!pgd_present(*pgd))
 		return NULL;
@@ -389,6 +436,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
 	}
 
 	ptl = pte_lockptr(mm, pmd);
+check:
 	spin_lock(ptl);
 	if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
 		*ptlp = ptl;
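On success page_check_address() still returns the pte with *ptlp locked; for the new hugetlb case that lock is simply mm->page_table_lock, since hugetlb page tables are not covered by the split pte locks that pte_lockptr() hands out. The existing calling convention is unchanged, roughly:

	pte_t *pte;
	spinlock_t *ptl;

	pte = page_check_address(page, mm, address, &ptl, 0);
	if (!pte)
		return SWAP_AGAIN;
	/* ... inspect or modify *pte under ptl ... */
	pte_unmap_unlock(pte, ptl);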
@@ -743,14 +791,20 @@ static void __page_set_anon_rmap(struct page *page,
 	 * If the page isn't exclusively mapped into this vma,
 	 * we must use the _oldest_ possible anon_vma for the
 	 * page mapping!
-	 *
-	 * So take the last AVC chain entry in the vma, which is
-	 * the deepest ancestor, and use the anon_vma from that.
 	 */
 	if (!exclusive) {
-		struct anon_vma_chain *avc;
-		avc = list_entry(vma->anon_vma_chain.prev, struct anon_vma_chain, same_vma);
-		anon_vma = avc->anon_vma;
+		if (PageAnon(page))
+			return;
+		anon_vma = anon_vma->root;
+	} else {
+		/*
+		 * In this case, swapped-out-but-not-discarded swap-cache
+		 * is being remapped, so there is no need to update
+		 * page->mapping here.  We are sure the anon_vma it points to
+		 * is not obsolete, because vma->anon_vma must be in its family.
+		 */
+		if (PageAnon(page))
+			return;
 	}
 
 	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
@@ -780,6 +834,7 @@ static void __page_check_anon_rmap(struct page *page,
 	 * are initially only visible via the pagetables, and the pte is locked
 	 * over the call to page_add_new_anon_rmap.
 	 */
+	BUG_ON(page_anon_vma(page)->root != vma->anon_vma->root);
 	BUG_ON(page->index != linear_page_index(vma, address));
 #endif
 }
@@ -798,6 +853,17 @@ static void __page_check_anon_rmap(struct page *page,
 void page_add_anon_rmap(struct page *page,
 	struct vm_area_struct *vma, unsigned long address)
 {
+	do_page_add_anon_rmap(page, vma, address, 0);
+}
+
+/*
+ * Special version of the above for do_swap_page, which often runs
+ * into pages that are exclusively owned by the current process.
+ * Everybody else should continue to use page_add_anon_rmap above.
+ */
+void do_page_add_anon_rmap(struct page *page,
+	struct vm_area_struct *vma, unsigned long address, int exclusive)
+{
 	int first = atomic_inc_and_test(&page->_mapcount);
 	if (first)
 		__inc_zone_page_state(page, NR_ANON_PAGES);
@@ -807,7 +873,7 @@ void page_add_anon_rmap(struct page *page,
 	VM_BUG_ON(!PageLocked(page));
 	VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
 	if (first)
-		__page_set_anon_rmap(page, vma, address, 0);
+		__page_set_anon_rmap(page, vma, address, exclusive);
 	else
 		__page_check_anon_rmap(page, vma, address);
 }
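do_page_add_anon_rmap() lets the swap-in path pass exclusive=1 when it knows the page belongs only to the current process, so __page_set_anon_rmap() can keep vma->anon_vma rather than falling back to the root. A hedged sketch of the intended do_swap_page() call site (the exact logic lives in the companion mm/memory.c change):

	int exclusive = 0;

	if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
		flags &= ~FAULT_FLAG_WRITE;
		exclusive = 1;
	}
	/* ... install the pte ... */
	do_page_add_anon_rmap(page, vma, address, exclusive);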
@@ -873,6 +939,12 @@ void page_remove_rmap(struct page *page)
 		page_clear_dirty(page);
 		set_page_dirty(page);
 	}
+	/*
+	 * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED
+	 * and not charged by memcg for now.
+	 */
+	if (unlikely(PageHuge(page)))
+		return;
 	if (PageAnon(page)) {
 		mem_cgroup_uncharge_page(page);
 		__dec_zone_page_state(page, NR_ANON_PAGES);
@@ -1368,6 +1440,42 @@ int try_to_munlock(struct page *page)
 	return try_to_unmap_file(page, TTU_MUNLOCK);
 }
 
+#if defined(CONFIG_KSM) || defined(CONFIG_MIGRATION)
+/*
+ * Drop an anon_vma refcount, freeing the anon_vma and anon_vma->root
+ * if necessary.  Be careful to do all the tests under the lock.  Once
+ * we know we are the last user, nobody else can get a reference and we
+ * can do the freeing without the lock.
+ */
+void drop_anon_vma(struct anon_vma *anon_vma)
+{
+	BUG_ON(atomic_read(&anon_vma->external_refcount) <= 0);
+	if (atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->root->lock)) {
+		struct anon_vma *root = anon_vma->root;
+		int empty = list_empty(&anon_vma->head);
+		int last_root_user = 0;
+		int root_empty = 0;
+
+		/*
+		 * The refcount on a non-root anon_vma got dropped.  Drop
+		 * the refcount on the root and check if we need to free it.
+		 */
+		if (empty && anon_vma != root) {
+			BUG_ON(atomic_read(&root->external_refcount) <= 0);
+			last_root_user = atomic_dec_and_test(&root->external_refcount);
+			root_empty = list_empty(&root->head);
+		}
+		anon_vma_unlock(anon_vma);
+
+		if (empty) {
+			anon_vma_free(anon_vma);
+			if (root_empty && last_root_user)
+				anon_vma_free(root);
+		}
+	}
+}
+#endif
+
 #ifdef CONFIG_MIGRATION
 /*
  * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file():
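The refcounting contract, for reference: atomic_dec_and_lock() takes the root lock and returns nonzero only when the count actually drops to zero, so all of the emptiness tests above run under the lock that protects the lists. It pairs with the get_anon_vma() call added to anon_vma_fork(); a hypothetical KSM/migration-style user (names and placement assumed, not taken from this diff) looks like:

	struct anon_vma *anon_vma = page_anon_vma(page);

	get_anon_vma(anon_vma);		/* keep it, and its root's lock, alive */
	/* ... unmap and remap pages under this anon_vma ... */
	drop_anon_vma(anon_vma);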
@@ -1389,7 +1497,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
 	anon_vma = page_anon_vma(page);
 	if (!anon_vma)
 		return ret;
-	spin_lock(&anon_vma->lock);
+	anon_vma_lock(anon_vma);
 	list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
 		struct vm_area_struct *vma = avc->vma;
 		unsigned long address = vma_address(page, vma);
@@ -1399,7 +1507,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
 		if (ret != SWAP_AGAIN)
 			break;
 	}
-	spin_unlock(&anon_vma->lock);
+	anon_vma_unlock(anon_vma);
 	return ret;
 }
 
@@ -1445,3 +1553,46 @@ int rmap_walk(struct page *page, int (*rmap_one)(struct page *,
 	return rmap_walk_file(page, rmap_one, arg);
 }
 #endif /* CONFIG_MIGRATION */
+
+#ifdef CONFIG_HUGETLB_PAGE
+/*
+ * The following three functions are for anonymous (private mapped) hugepages.
+ * Unlike common anonymous pages, anonymous hugepages have no accounting code
+ * and no lru code, because we handle hugepages differently from common pages.
+ */
+static void __hugepage_set_anon_rmap(struct page *page,
+	struct vm_area_struct *vma, unsigned long address, int exclusive)
+{
+	struct anon_vma *anon_vma = vma->anon_vma;
+	BUG_ON(!anon_vma);
+	if (!exclusive) {
+		struct anon_vma_chain *avc;
+		avc = list_entry(vma->anon_vma_chain.prev,
+				 struct anon_vma_chain, same_vma);
+		anon_vma = avc->anon_vma;
+	}
+	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
+	page->mapping = (struct address_space *) anon_vma;
+	page->index = linear_page_index(vma, address);
+}
+
+void hugepage_add_anon_rmap(struct page *page,
+			    struct vm_area_struct *vma, unsigned long address)
+{
+	struct anon_vma *anon_vma = vma->anon_vma;
+	int first;
+	BUG_ON(!anon_vma);
+	BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+	first = atomic_inc_and_test(&page->_mapcount);
+	if (first)
+		__hugepage_set_anon_rmap(page, vma, address, 0);
+}
+
+void hugepage_add_new_anon_rmap(struct page *page,
+			struct vm_area_struct *vma, unsigned long address)
+{
+	BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+	atomic_set(&page->_mapcount, 0);
+	__hugepage_set_anon_rmap(page, vma, address, 1);
+}
+#endif /* CONFIG_HUGETLB_PAGE */
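Usage note: as with page_add_anon_rmap(), these rely on _mapcount being initialised to -1, so only the first mapper (atomic_inc_and_test() returning true) installs page->mapping and page->index. The callers are expected to sit in the hugetlb fault/COW paths of the companion mm/hugetlb.c patch; sketched, with placement assumed rather than taken from this diff:

	/* brand new private hugepage, e.g. after a COW copy: */
	hugepage_add_new_anon_rmap(new_page, vma, address);

	/* an already-anonymous hugepage gaining one more mapping: */
	hugepage_add_anon_rmap(page, vma, address);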