Diffstat (limited to 'mm/rmap.c')
-rw-r--r--	mm/rmap.c	211
1 file changed, 170 insertions, 41 deletions
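The patch below stops linking VMAs directly into a single anon_vma list and instead links them through anon_vma_chain objects. The struct itself is declared in include/linux/rmap.h and is not part of this diff; as a rough sketch, inferred from the fields the code below uses, it looks like:

	struct anon_vma_chain {
		struct vm_area_struct *vma;	/* the VMA this chain entry belongs to */
		struct anon_vma *anon_vma;	/* the anon_vma this entry attaches the VMA to */
		struct list_head same_vma;	/* entry on vma->anon_vma_chain */
		struct list_head same_anon_vma;	/* entry on anon_vma->head */
	};

Each VMA keeps a list of chain entries (same_vma), one per anon_vma it is attached to, and each anon_vma keeps a list of chain entries (same_anon_vma), one per VMA that may hold its pages; the rmap walkers below (page_referenced_anon, try_to_unmap_anon, rmap_walk_anon) now iterate that second list.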
diff --git a/mm/rmap.c b/mm/rmap.c
index 278cd277bdec..07fc94758799 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -62,6 +62,7 @@
62#include "internal.h" 62#include "internal.h"
63 63
64static struct kmem_cache *anon_vma_cachep; 64static struct kmem_cache *anon_vma_cachep;
65static struct kmem_cache *anon_vma_chain_cachep;
65 66
66static inline struct anon_vma *anon_vma_alloc(void) 67static inline struct anon_vma *anon_vma_alloc(void)
67{ 68{
@@ -73,6 +74,16 @@ void anon_vma_free(struct anon_vma *anon_vma)
 	kmem_cache_free(anon_vma_cachep, anon_vma);
 }
 
+static inline struct anon_vma_chain *anon_vma_chain_alloc(void)
+{
+	return kmem_cache_alloc(anon_vma_chain_cachep, GFP_KERNEL);
+}
+
+void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
+{
+	kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
+}
+
 /**
  * anon_vma_prepare - attach an anon_vma to a memory region
  * @vma: the memory region in question
@@ -103,73 +114,140 @@ void anon_vma_free(struct anon_vma *anon_vma)
 int anon_vma_prepare(struct vm_area_struct *vma)
 {
 	struct anon_vma *anon_vma = vma->anon_vma;
+	struct anon_vma_chain *avc;
 
 	might_sleep();
 	if (unlikely(!anon_vma)) {
 		struct mm_struct *mm = vma->vm_mm;
 		struct anon_vma *allocated;
 
+		avc = anon_vma_chain_alloc();
+		if (!avc)
+			goto out_enomem;
+
 		anon_vma = find_mergeable_anon_vma(vma);
 		allocated = NULL;
 		if (!anon_vma) {
 			anon_vma = anon_vma_alloc();
 			if (unlikely(!anon_vma))
-				return -ENOMEM;
+				goto out_enomem_free_avc;
 			allocated = anon_vma;
 		}
-		spin_lock(&anon_vma->lock);
 
+		spin_lock(&anon_vma->lock);
 		/* page_table_lock to protect against threads */
 		spin_lock(&mm->page_table_lock);
 		if (likely(!vma->anon_vma)) {
 			vma->anon_vma = anon_vma;
-			list_add_tail(&vma->anon_vma_node, &anon_vma->head);
+			avc->anon_vma = anon_vma;
+			avc->vma = vma;
+			list_add(&avc->same_vma, &vma->anon_vma_chain);
+			list_add(&avc->same_anon_vma, &anon_vma->head);
 			allocated = NULL;
+			avc = NULL;
 		}
 		spin_unlock(&mm->page_table_lock);
-
 		spin_unlock(&anon_vma->lock);
+
 		if (unlikely(allocated))
 			anon_vma_free(allocated);
+		if (unlikely(avc))
+			anon_vma_chain_free(avc);
 	}
 	return 0;
+
+ out_enomem_free_avc:
+	anon_vma_chain_free(avc);
+ out_enomem:
+	return -ENOMEM;
 }
 
-void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next)
+static void anon_vma_chain_link(struct vm_area_struct *vma,
+				struct anon_vma_chain *avc,
+				struct anon_vma *anon_vma)
 {
-	BUG_ON(vma->anon_vma != next->anon_vma);
-	list_del(&next->anon_vma_node);
+	avc->vma = vma;
+	avc->anon_vma = anon_vma;
+	list_add(&avc->same_vma, &vma->anon_vma_chain);
+
+	spin_lock(&anon_vma->lock);
+	list_add_tail(&avc->same_anon_vma, &anon_vma->head);
+	spin_unlock(&anon_vma->lock);
 }
 
-void __anon_vma_link(struct vm_area_struct *vma)
+/*
+ * Attach the anon_vmas from src to dst.
+ * Returns 0 on success, -ENOMEM on failure.
+ */
+int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
 {
-	struct anon_vma *anon_vma = vma->anon_vma;
+	struct anon_vma_chain *avc, *pavc;
 
-	if (anon_vma)
-		list_add_tail(&vma->anon_vma_node, &anon_vma->head);
+	list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
+		avc = anon_vma_chain_alloc();
+		if (!avc)
+			goto enomem_failure;
+		anon_vma_chain_link(dst, avc, pavc->anon_vma);
+	}
+	return 0;
+
+ enomem_failure:
+	unlink_anon_vmas(dst);
+	return -ENOMEM;
 }
 
-void anon_vma_link(struct vm_area_struct *vma)
+/*
+ * Attach vma to its own anon_vma, as well as to the anon_vmas that
+ * the corresponding VMA in the parent process is attached to.
+ * Returns 0 on success, non-zero on failure.
+ */
+int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
 {
-	struct anon_vma *anon_vma = vma->anon_vma;
+	struct anon_vma_chain *avc;
+	struct anon_vma *anon_vma;
 
-	if (anon_vma) {
-		spin_lock(&anon_vma->lock);
-		list_add_tail(&vma->anon_vma_node, &anon_vma->head);
-		spin_unlock(&anon_vma->lock);
-	}
+	/* Don't bother if the parent process has no anon_vma here. */
+	if (!pvma->anon_vma)
+		return 0;
+
+	/*
+	 * First, attach the new VMA to the parent VMA's anon_vmas,
+	 * so rmap can find non-COWed pages in child processes.
+	 */
+	if (anon_vma_clone(vma, pvma))
+		return -ENOMEM;
+
+	/* Then add our own anon_vma. */
+	anon_vma = anon_vma_alloc();
+	if (!anon_vma)
+		goto out_error;
+	avc = anon_vma_chain_alloc();
+	if (!avc)
+		goto out_error_free_anon_vma;
+	anon_vma_chain_link(vma, avc, anon_vma);
+	/* Mark this anon_vma as the one where our new (COWed) pages go. */
+	vma->anon_vma = anon_vma;
+
+	return 0;
+
+ out_error_free_anon_vma:
+	anon_vma_free(anon_vma);
+ out_error:
+	unlink_anon_vmas(vma);
+	return -ENOMEM;
 }
 
-void anon_vma_unlink(struct vm_area_struct *vma)
+static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain)
 {
-	struct anon_vma *anon_vma = vma->anon_vma;
+	struct anon_vma *anon_vma = anon_vma_chain->anon_vma;
 	int empty;
 
+	/* If anon_vma_fork fails, we can get an empty anon_vma_chain. */
 	if (!anon_vma)
 		return;
 
 	spin_lock(&anon_vma->lock);
-	list_del(&vma->anon_vma_node);
+	list_del(&anon_vma_chain->same_anon_vma);
 
 	/* We must garbage collect the anon_vma if it's empty */
 	empty = list_empty(&anon_vma->head) && !ksm_refcount(anon_vma);
@@ -179,6 +257,18 @@ void anon_vma_unlink(struct vm_area_struct *vma)
 		anon_vma_free(anon_vma);
 }
 
+void unlink_anon_vmas(struct vm_area_struct *vma)
+{
+	struct anon_vma_chain *avc, *next;
+
+	/* Unlink each anon_vma chained to the VMA. */
+	list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
+		anon_vma_unlink(avc);
+		list_del(&avc->same_vma);
+		anon_vma_chain_free(avc);
+	}
+}
+
 static void anon_vma_ctor(void *data)
 {
 	struct anon_vma *anon_vma = data;
@@ -192,6 +282,7 @@ void __init anon_vma_init(void)
 {
 	anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
 			0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor);
+	anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, SLAB_PANIC);
 }
 
 /*
@@ -396,7 +487,7 @@ static int page_referenced_anon(struct page *page,
 {
 	unsigned int mapcount;
 	struct anon_vma *anon_vma;
-	struct vm_area_struct *vma;
+	struct anon_vma_chain *avc;
 	int referenced = 0;
 
 	anon_vma = page_lock_anon_vma(page);
@@ -404,7 +495,8 @@ static int page_referenced_anon(struct page *page,
 		return referenced;
 
 	mapcount = page_mapcount(page);
-	list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
+	list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
+		struct vm_area_struct *vma = avc->vma;
 		unsigned long address = vma_address(page, vma);
 		if (address == -EFAULT)
 			continue;
@@ -511,9 +603,6 @@ int page_referenced(struct page *page,
 	int referenced = 0;
 	int we_locked = 0;
 
-	if (TestClearPageReferenced(page))
-		referenced++;
-
 	*vm_flags = 0;
 	if (page_mapped(page) && page_rmapping(page)) {
 		if (!is_locked && (!PageAnon(page) || PageKsm(page))) {
@@ -614,17 +703,57 @@ int page_mkclean(struct page *page)
 EXPORT_SYMBOL_GPL(page_mkclean);
 
 /**
+ * page_move_anon_rmap - move a page to our anon_vma
+ * @page:	the page to move to our anon_vma
+ * @vma:	the vma the page belongs to
+ * @address:	the user virtual address mapped
+ *
+ * When a page belongs exclusively to one process after a COW event,
+ * that page can be moved into the anon_vma that belongs to just that
+ * process, so the rmap code will not search the parent or sibling
+ * processes.
+ */
+void page_move_anon_rmap(struct page *page,
+	struct vm_area_struct *vma, unsigned long address)
+{
+	struct anon_vma *anon_vma = vma->anon_vma;
+
+	VM_BUG_ON(!PageLocked(page));
+	VM_BUG_ON(!anon_vma);
+	VM_BUG_ON(page->index != linear_page_index(vma, address));
+
+	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
+	page->mapping = (struct address_space *) anon_vma;
+}
+
+/**
  * __page_set_anon_rmap - setup new anonymous rmap
  * @page:	the page to add the mapping to
  * @vma:	the vm area in which the mapping is added
  * @address:	the user virtual address mapped
+ * @exclusive:	the page is exclusively owned by the current process
  */
 static void __page_set_anon_rmap(struct page *page,
-	struct vm_area_struct *vma, unsigned long address)
+	struct vm_area_struct *vma, unsigned long address, int exclusive)
 {
 	struct anon_vma *anon_vma = vma->anon_vma;
 
 	BUG_ON(!anon_vma);
+
+	/*
+	 * If the page isn't exclusively mapped into this vma,
+	 * we must use the _oldest_ possible anon_vma for the
+	 * page mapping!
+	 *
+	 * So take the last AVC chain entry in the vma, which is
+	 * the deepest ancestor, and use the anon_vma from that.
+	 */
+	if (!exclusive) {
+		struct anon_vma_chain *avc;
+		avc = list_entry(vma->anon_vma_chain.prev, struct anon_vma_chain, same_vma);
+		anon_vma = avc->anon_vma;
+	}
+
 	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
 	page->mapping = (struct address_space *) anon_vma;
 	page->index = linear_page_index(vma, address);
@@ -652,9 +781,6 @@ static void __page_check_anon_rmap(struct page *page,
 	 * are initially only visible via the pagetables, and the pte is locked
 	 * over the call to page_add_new_anon_rmap.
 	 */
-	struct anon_vma *anon_vma = vma->anon_vma;
-	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
-	BUG_ON(page->mapping != (struct address_space *)anon_vma);
 	BUG_ON(page->index != linear_page_index(vma, address));
 #endif
 }
@@ -682,7 +808,7 @@ void page_add_anon_rmap(struct page *page,
 	VM_BUG_ON(!PageLocked(page));
 	VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
 	if (first)
-		__page_set_anon_rmap(page, vma, address);
+		__page_set_anon_rmap(page, vma, address, 0);
 	else
 		__page_check_anon_rmap(page, vma, address);
 }
@@ -704,7 +830,7 @@ void page_add_new_anon_rmap(struct page *page,
 	SetPageSwapBacked(page);
 	atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
 	__inc_zone_page_state(page, NR_ANON_PAGES);
-	__page_set_anon_rmap(page, vma, address);
+	__page_set_anon_rmap(page, vma, address, 1);
 	if (page_evictable(page, vma))
 		lru_cache_add_lru(page, LRU_ACTIVE_ANON);
 	else
@@ -815,9 +941,9 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 
 	if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
 		if (PageAnon(page))
-			dec_mm_counter(mm, anon_rss);
+			dec_mm_counter(mm, MM_ANONPAGES);
 		else
-			dec_mm_counter(mm, file_rss);
+			dec_mm_counter(mm, MM_FILEPAGES);
 		set_pte_at(mm, address, pte,
 			   swp_entry_to_pte(make_hwpoison_entry(page)));
 	} else if (PageAnon(page)) {
@@ -839,7 +965,8 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 					list_add(&mm->mmlist, &init_mm.mmlist);
 				spin_unlock(&mmlist_lock);
 			}
-			dec_mm_counter(mm, anon_rss);
+			dec_mm_counter(mm, MM_ANONPAGES);
+			inc_mm_counter(mm, MM_SWAPENTS);
 		} else if (PAGE_MIGRATION) {
 			/*
 			 * Store the pfn of the page in a special migration
@@ -857,7 +984,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 		entry = make_migration_entry(page, pte_write(pteval));
 		set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
 	} else
-		dec_mm_counter(mm, file_rss);
+		dec_mm_counter(mm, MM_FILEPAGES);
 
 	page_remove_rmap(page);
 	page_cache_release(page);
@@ -996,7 +1123,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
 
 		page_remove_rmap(page);
 		page_cache_release(page);
-		dec_mm_counter(mm, file_rss);
+		dec_mm_counter(mm, MM_FILEPAGES);
 		(*mapcount)--;
 	}
 	pte_unmap_unlock(pte - 1, ptl);
@@ -1024,14 +1151,15 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
 static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
 {
 	struct anon_vma *anon_vma;
-	struct vm_area_struct *vma;
+	struct anon_vma_chain *avc;
 	int ret = SWAP_AGAIN;
 
 	anon_vma = page_lock_anon_vma(page);
 	if (!anon_vma)
 		return ret;
 
-	list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
+	list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
+		struct vm_area_struct *vma = avc->vma;
 		unsigned long address = vma_address(page, vma);
 		if (address == -EFAULT)
 			continue;
@@ -1222,7 +1350,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
 		struct vm_area_struct *, unsigned long, void *), void *arg)
 {
 	struct anon_vma *anon_vma;
-	struct vm_area_struct *vma;
+	struct anon_vma_chain *avc;
 	int ret = SWAP_AGAIN;
 
 	/*
@@ -1237,7 +1365,8 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
 	if (!anon_vma)
 		return ret;
 	spin_lock(&anon_vma->lock);
-	list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
+	list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
+		struct vm_area_struct *vma = avc->vma;
 		unsigned long address = vma_address(page, vma);
 		if (address == -EFAULT)
 			continue;
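The new entry points added above (anon_vma_clone, anon_vma_fork, unlink_anon_vmas) replace the old __anon_vma_merge/__anon_vma_link/anon_vma_link/anon_vma_unlink interface and are intended for the VMA copy and teardown paths, which live outside this file and are not shown in this diff. The sketch below is illustrative only; copy_vma_rmap() and free_vma_rmap() are hypothetical wrappers standing in for the real call sites:

	/* Illustrative only: hypothetical callers of the API added above. */
	static int copy_vma_rmap(struct vm_area_struct *new, struct vm_area_struct *old)
	{
		/* Child VMA gets its own anon_vma plus chain links to all of the parent's. */
		return anon_vma_fork(new, old);
	}

	static void free_vma_rmap(struct vm_area_struct *vma)
	{
		/* Drop every anon_vma_chain entry hanging off this VMA. */
		unlink_anon_vmas(vma);
	}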