Diffstat (limited to 'mm/swap.c')
-rw-r--r--	mm/swap.c	319
1 file changed, 60 insertions(+), 259 deletions(-)
@@ -24,6 +24,7 @@
 #include <linux/export.h>
 #include <linux/mm_inline.h>
 #include <linux/percpu_counter.h>
+#include <linux/memremap.h>
 #include <linux/percpu.h>
 #include <linux/cpu.h>
 #include <linux/notifier.h>
@@ -45,6 +46,7 @@ int page_cluster;
 static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
 static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
 static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs);
+static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
 
 /*
  * This path almost never happens for VM activity - pages are normally
@@ -89,260 +91,14 @@ static void __put_compound_page(struct page *page)
 	(*dtor)(page);
 }
 
-/**
- * Two special cases here: we could avoid taking compound_lock_irqsave
- * and could skip the tail refcounting (in _mapcount).
- *
- * 1. Hugetlbfs page:
- *
- *    PageHeadHuge will remain true until the compound page
- *    is released and enters the buddy allocator, and it could
- *    not be split by __split_huge_page_refcount().
- *
- *    So if we see PageHeadHuge set, and we have the tail page pin,
- *    then we can safely put the head page.
- *
- * 2. Slab THP page:
- *
- *    PG_slab is cleared before the slab frees the head page, and the
- *    tail pin cannot be the last reference left on the head page,
- *    because the slab code is free to reuse the compound page
- *    after a kfree/kmem_cache_free without having to check if
- *    there's any tail pin left.  In turn all tail pins must always be
- *    released while the head is still pinned by the slab code,
- *    and so we know PG_slab will still be set too.
- *
- *    So if we see PageSlab set, and we have the tail page pin,
- *    then we can safely put the head page.
- */
-static __always_inline
-void put_unrefcounted_compound_page(struct page *page_head, struct page *page)
-{
-	/*
-	 * If @page is a THP tail, we must read the tail page flags
-	 * after the head page flags.  The __split_huge_page_refcount
-	 * side enforces write memory barriers between clearing
-	 * PageTail and the point where the head page can be freed
-	 * and reallocated.
-	 */
-	smp_rmb();
-	if (likely(PageTail(page))) {
-		/*
-		 * __split_huge_page_refcount cannot race
-		 * here, see the comment above this function.
-		 */
-		VM_BUG_ON_PAGE(!PageHead(page_head), page_head);
-		if (put_page_testzero(page_head)) {
-			/*
-			 * If this is the tail of a slab THP page,
-			 * the tail pin must not be the last reference
-			 * held on the page, because PG_slab cannot
-			 * be cleared before all tail pins (which skip
-			 * the _mapcount tail refcounting) have been
-			 * released.
-			 *
-			 * If this is the tail of a hugetlbfs page,
-			 * the tail pin may be the last reference on
-			 * the page instead, because PageHeadHuge will
-			 * not go away until the compound page enters
-			 * the buddy allocator.
-			 */
-			VM_BUG_ON_PAGE(PageSlab(page_head), page_head);
-			__put_compound_page(page_head);
-		}
-	} else
-		/*
-		 * __split_huge_page_refcount ran before us,
-		 * @page was a THP tail.  The split @page_head
-		 * has been freed and reallocated as a slab or
-		 * hugetlbfs page of smaller order (only
-		 * possible if reallocated as slab on x86).
-		 */
-		if (put_page_testzero(page))
-			__put_single_page(page);
-}
-
-static __always_inline
-void put_refcounted_compound_page(struct page *page_head, struct page *page)
-{
-	if (likely(page != page_head && get_page_unless_zero(page_head))) {
-		unsigned long flags;
-
-		/*
-		 * @page_head wasn't a dangling pointer, but it may not
-		 * be a head page anymore by the time we obtain the
-		 * lock.  That is ok as long as it can't be freed from
-		 * under us.
-		 */
-		flags = compound_lock_irqsave(page_head);
-		if (unlikely(!PageTail(page))) {
-			/* __split_huge_page_refcount ran before us */
-			compound_unlock_irqrestore(page_head, flags);
-			if (put_page_testzero(page_head)) {
-				/*
-				 * The @page_head may have been freed
-				 * and reallocated as a compound page
-				 * of smaller order and then freed
-				 * again.  All we know is that it
-				 * cannot have become: a THP page, a
-				 * compound page of higher order, or a
-				 * tail page.  That is because we
-				 * still hold the refcount of the
-				 * split THP tail and page_head was
-				 * the THP head before the split.
-				 */
-				if (PageHead(page_head))
-					__put_compound_page(page_head);
-				else
-					__put_single_page(page_head);
-			}
-out_put_single:
-			if (put_page_testzero(page))
-				__put_single_page(page);
-			return;
-		}
-		VM_BUG_ON_PAGE(page_head != compound_head(page), page);
-		/*
-		 * We can release the refcount taken by
-		 * get_page_unless_zero() now that
-		 * __split_huge_page_refcount() is blocked on the
-		 * compound_lock.
-		 */
-		if (put_page_testzero(page_head))
-			VM_BUG_ON_PAGE(1, page_head);
-		/* __split_huge_page_refcount will wait now */
-		VM_BUG_ON_PAGE(page_mapcount(page) <= 0, page);
-		atomic_dec(&page->_mapcount);
-		VM_BUG_ON_PAGE(atomic_read(&page_head->_count) <= 0, page_head);
-		VM_BUG_ON_PAGE(atomic_read(&page->_count) != 0, page);
-		compound_unlock_irqrestore(page_head, flags);
-
-		if (put_page_testzero(page_head)) {
-			if (PageHead(page_head))
-				__put_compound_page(page_head);
-			else
-				__put_single_page(page_head);
-		}
-	} else {
-		/* @page_head is a dangling pointer */
-		VM_BUG_ON_PAGE(PageTail(page), page);
-		goto out_put_single;
-	}
-}
-
-static void put_compound_page(struct page *page)
-{
-	struct page *page_head;
-
-	/*
-	 * We see PageCompound set and PageTail not set, so @page may be:
-	 * 1. a hugetlbfs head page, or
-	 * 2. a THP head page.
-	 */
-	if (likely(!PageTail(page))) {
-		if (put_page_testzero(page)) {
-			/*
-			 * By the time all refcounts have been released,
-			 * split_huge_page cannot run anymore from under us.
-			 */
-			if (PageHead(page))
-				__put_compound_page(page);
-			else
-				__put_single_page(page);
-		}
-		return;
-	}
-
-	/*
-	 * We see PageCompound set and PageTail set, so @page may be:
-	 * 1. a tail hugetlbfs page, or
-	 * 2. a tail THP page, or
-	 * 3. a split THP page.
-	 *
-	 * Case 3 is possible, as we may race with
-	 * __split_huge_page_refcount tearing down a THP page.
-	 */
-	page_head = compound_head(page);
-	if (!__compound_tail_refcounted(page_head))
-		put_unrefcounted_compound_page(page_head, page);
-	else
-		put_refcounted_compound_page(page_head, page);
-}
-
-void put_page(struct page *page)
+void __put_page(struct page *page)
 {
 	if (unlikely(PageCompound(page)))
-		put_compound_page(page);
-	else if (put_page_testzero(page))
+		__put_compound_page(page);
+	else
 		__put_single_page(page);
 }
-EXPORT_SYMBOL(put_page);
-
-/*
- * This function is exported but must not be called by anything other
- * than get_page().  It implements the slow path of get_page().
- */
-bool __get_page_tail(struct page *page)
-{
-	/*
-	 * This takes care of get_page() if run on a tail page
-	 * returned by one of the get_user_pages/follow_page variants.
-	 * get_user_pages/follow_page itself doesn't need the compound
-	 * lock, because it runs __get_page_tail_foll() under the
-	 * proper PT lock that already serializes against
-	 * split_huge_page().
-	 */
-	unsigned long flags;
-	bool got;
-	struct page *page_head = compound_head(page);
-
-	/* Refer to the put_compound_page() comment. */
-	if (!__compound_tail_refcounted(page_head)) {
-		smp_rmb();
-		if (likely(PageTail(page))) {
-			/*
-			 * This is a hugetlbfs page or a slab page.
-			 * __split_huge_page_refcount cannot race here.
-			 */
-			VM_BUG_ON_PAGE(!PageHead(page_head), page_head);
-			__get_page_tail_foll(page, true);
-			return true;
-		} else {
-			/*
-			 * __split_huge_page_refcount ran before us,
-			 * @page was a THP tail.  The split page_head
-			 * has been freed and reallocated as a slab or
-			 * hugetlbfs page of smaller order (only
-			 * possible if reallocated as slab on x86).
-			 */
-			return false;
-		}
-	}
-
-	got = false;
-	if (likely(page != page_head && get_page_unless_zero(page_head))) {
-		/*
-		 * page_head wasn't a dangling pointer, but it may not
-		 * be a head page anymore by the time we obtain the
-		 * lock.  That is ok as long as it can't be freed from
-		 * under us.
-		 */
-		flags = compound_lock_irqsave(page_head);
-		/* here __split_huge_page_refcount won't run anymore */
-		if (likely(PageTail(page))) {
-			__get_page_tail_foll(page, false);
-			got = true;
-		}
-		compound_unlock_irqrestore(page_head, flags);
-		if (unlikely(!got))
-			put_page(page_head);
-	}
-	return got;
-}
-EXPORT_SYMBOL(__get_page_tail);
+EXPORT_SYMBOL(__put_page);
 
 /**
  * put_pages_list() - release a list of pages
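
Note on the new shape: put_page() itself no longer appears in mm/swap.c. The likely counterpart (an assumption here, since include/linux/mm.h is outside this diffstat) is an inline fast path in the header, with __put_page() exported as the slow path taken once the refcount reaches zero:

    /*
     * Sketch of the presumed inline fast path in include/linux/mm.h;
     * not part of this diff.  One compound_head() lookup replaces the
     * tail-pin machinery deleted above.
     */
    static inline void put_page(struct page *page)
    {
    	page = compound_head(page);
    	if (put_page_testzero(page))
    		__put_page(page);
    }
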
@@ -604,6 +360,7 @@ static void __lru_cache_activate_page(struct page *page)
  */
 void mark_page_accessed(struct page *page)
 {
+	page = compound_head(page);
 	if (!PageActive(page) && !PageUnevictable(page) &&
 			PageReferenced(page)) {
 
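
The added compound_head() makes mark_page_accessed() safe to call on a tail page: only the head page sits on an LRU list, so a reference through any subpage has to age the compound page as a whole. For orientation, a roughly contemporary definition of the helper (an assumption about the exact tree state; it lives outside this diff):

    /* Tail pages store a pointer to their head page, tagged in bit 0. */
    static inline struct page *compound_head(struct page *page)
    {
    	unsigned long head = READ_ONCE(page->compound_head);

    	if (unlikely(head & 1))
    		return (struct page *)(head - 1);
    	return page;
    }
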
@@ -799,6 +556,24 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
 	update_page_reclaim_stat(lruvec, file, 0);
 }
 
+
+static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
+			      void *arg)
+{
+	if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
+		int file = page_is_file_cache(page);
+		int lru = page_lru_base_type(page);
+
+		del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE);
+		ClearPageActive(page);
+		ClearPageReferenced(page);
+		add_page_to_lru_list(page, lruvec, lru);
+
+		__count_vm_event(PGDEACTIVATE);
+		update_page_reclaim_stat(lruvec, file, 0);
+	}
+}
+
 /*
  * Drain pages out of the cpu's pagevecs.
  * Either "cpu" is the current CPU, and preemption has already been
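
lru_deactivate_fn() itself takes no locks; it relies on pagevec_lru_move_fn(), defined earlier in mm/swap.c, to invoke it under the right zone->lru_lock. A condensed sketch of that contract (simplified from the real helper, so treat the details as approximate):

    static void pagevec_lru_move_fn(struct pagevec *pvec,
    	void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg),
    	void *arg)
    {
    	int i;
    	struct zone *zone = NULL;
    	unsigned long flags = 0;

    	for (i = 0; i < pagevec_count(pvec); i++) {
    		struct page *page = pvec->pages[i];
    		struct zone *pagezone = page_zone(page);

    		/* Take each zone's LRU lock once per run of same-zone pages. */
    		if (pagezone != zone) {
    			if (zone)
    				spin_unlock_irqrestore(&zone->lru_lock, flags);
    			zone = pagezone;
    			spin_lock_irqsave(&zone->lru_lock, flags);
    		}
    		(*move_fn)(page, mem_cgroup_page_lruvec(page, zone), arg);
    	}
    	if (zone)
    		spin_unlock_irqrestore(&zone->lru_lock, flags);
    	release_pages(pvec->pages, pvec->nr, pvec->cold);
    	pagevec_reinit(pvec);
    }
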
@@ -825,6 +600,10 @@ void lru_add_drain_cpu(int cpu)
 	if (pagevec_count(pvec))
 		pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
 
+	pvec = &per_cpu(lru_deactivate_pvecs, cpu);
+	if (pagevec_count(pvec))
+		pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
+
 	activate_page_drain(cpu);
 }
 
@@ -854,6 +633,26 @@ void deactivate_file_page(struct page *page)
 	}
 }
 
+/**
+ * deactivate_page - deactivate a page
+ * @page: page to deactivate
+ *
+ * deactivate_page() moves @page to the inactive list if @page was on the
+ * active list and was not an unevictable page.  This is done to accelerate
+ * the reclaim of @page.
+ */
+void deactivate_page(struct page *page)
+{
+	if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
+		struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
+
+		page_cache_get(page);
+		if (!pagevec_add(pvec, page))
+			pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
+		put_cpu_var(lru_deactivate_pvecs);
+	}
+}
+
 void lru_add_drain(void)
 {
 	lru_add_drain_cpu(get_cpu());
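
Each call takes its own reference (page_cache_get()) before parking the page in the per-CPU pagevec; release_pages() drops it again when the pagevec is drained. A hypothetical caller needs nothing more than the bare call, since the function is a no-op for pages that are not active and evictable:

    /*
     * Hypothetical usage sketch: mark a batch of pages as early-reclaim
     * candidates, then push this CPU's pending pagevecs out.
     */
    static void hint_pages_cold(struct page **pages, int nr)
    {
    	int i;

    	for (i = 0; i < nr; i++)
    		deactivate_page(pages[i]);	/* no-op unless active + evictable */
    	lru_add_drain();			/* optional: drain immediately */
    }
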
@@ -883,6 +682,7 @@ void lru_add_drain_all(void)
 		if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
 		    pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
 		    pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
+		    pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
 		    need_activate_page_drain(cpu)) {
 			INIT_WORK(work, lru_add_drain_per_cpu);
 			schedule_work_on(cpu, work);
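
The new pagevec must be part of this pending-work test: lru_add_drain_all() schedules the drain work only on CPUs that actually have something batched, so without this line deactivated pages could sit in lru_deactivate_pvecs indefinitely. The work item itself (defined earlier in mm/swap.c, unchanged here) just drains the local CPU:

    static void lru_add_drain_per_cpu(struct work_struct *dummy)
    {
    	lru_add_drain();	/* flushes all of the calling CPU's pagevecs */
    }
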
@@ -918,15 +718,6 @@ void release_pages(struct page **pages, int nr, bool cold)
 	for (i = 0; i < nr; i++) {
 		struct page *page = pages[i];
 
-		if (unlikely(PageCompound(page))) {
-			if (zone) {
-				spin_unlock_irqrestore(&zone->lru_lock, flags);
-				zone = NULL;
-			}
-			put_compound_page(page);
-			continue;
-		}
-
 		/*
 		 * Make sure the IRQ-safe lock-holding time does not get
 		 * excessive with a continuous string of pages from the
@@ -937,9 +728,19 @@ void release_pages(struct page **pages, int nr, bool cold)
 			zone = NULL;
 		}
 
+		page = compound_head(page);
 		if (!put_page_testzero(page))
 			continue;
 
+		if (PageCompound(page)) {
+			if (zone) {
+				spin_unlock_irqrestore(&zone->lru_lock, flags);
+				zone = NULL;
+			}
+			__put_compound_page(page);
+			continue;
+		}
+
 		if (PageLRU(page)) {
 			struct zone *pagezone = page_zone(page);
 
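
Net effect of the two release_pages() hunks: the compound-page check moves from before any refcount work to after put_page_testzero() on the head page, so the (potentially expensive) compound destructor runs only for the final reference, and always with the zone lru_lock dropped first. Condensed, the resulting loop body reads roughly:

    page = compound_head(page);
    if (!put_page_testzero(page))
    	continue;			/* someone else still holds a reference */

    if (PageCompound(page)) {
    	if (zone) {			/* never run the destructor under lru_lock */
    		spin_unlock_irqrestore(&zone->lru_lock, flags);
    		zone = NULL;
    	}
    	__put_compound_page(page);
    	continue;
    }
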