Diffstat (limited to 'mm/swap.c')
-rw-r--r--	mm/swap.c	319
1 file changed, 60 insertions, 259 deletions
diff --git a/mm/swap.c b/mm/swap.c
index 39395fb549c0..09fe5e97714a 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -24,6 +24,7 @@
 #include <linux/export.h>
 #include <linux/mm_inline.h>
 #include <linux/percpu_counter.h>
+#include <linux/memremap.h>
 #include <linux/percpu.h>
 #include <linux/cpu.h>
 #include <linux/notifier.h>
@@ -45,6 +46,7 @@ int page_cluster;
 static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
 static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
 static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs);
+static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
 
 /*
  * This path almost never happens for VM activity - pages are normally
@@ -89,260 +91,14 @@ static void __put_compound_page(struct page *page)
 	(*dtor)(page);
 }
 
-/**
- * Two special cases here: we could avoid taking compound_lock_irqsave
- * and could skip the tail refcounting(in _mapcount).
- *
- * 1. Hugetlbfs page:
- *
- * PageHeadHuge will remain true until the compound page
- * is released and enters the buddy allocator, and it could
- * not be split by __split_huge_page_refcount().
- *
- * So if we see PageHeadHuge set, and we have the tail page pin,
- * then we could safely put head page.
- *
- * 2. Slab THP page:
- *
- * PG_slab is cleared before the slab frees the head page, and
- * tail pin cannot be the last reference left on the head page,
- * because the slab code is free to reuse the compound page
- * after a kfree/kmem_cache_free without having to check if
- * there's any tail pin left. In turn all tail pins must be always
- * released while the head is still pinned by the slab code
- * and so we know PG_slab will be still set too.
- *
- * So if we see PageSlab set, and we have the tail page pin,
- * then we could safely put head page.
- */
-static __always_inline
-void put_unrefcounted_compound_page(struct page *page_head, struct page *page)
-{
-	/*
-	 * If @page is a THP tail, we must read the tail page
-	 * flags after the head page flags. The
-	 * __split_huge_page_refcount side enforces write memory barriers
-	 * between clearing PageTail and before the head page
-	 * can be freed and reallocated.
-	 */
-	smp_rmb();
-	if (likely(PageTail(page))) {
-		/*
-		 * __split_huge_page_refcount cannot race
-		 * here, see the comment above this function.
-		 */
-		VM_BUG_ON_PAGE(!PageHead(page_head), page_head);
-		if (put_page_testzero(page_head)) {
-			/*
-			 * If this is the tail of a slab THP page,
-			 * the tail pin must not be the last reference
-			 * held on the page, because the PG_slab cannot
-			 * be cleared before all tail pins (which skips
-			 * the _mapcount tail refcounting) have been
-			 * released.
-			 *
-			 * If this is the tail of a hugetlbfs page,
-			 * the tail pin may be the last reference on
-			 * the page instead, because PageHeadHuge will
-			 * not go away until the compound page enters
-			 * the buddy allocator.
-			 */
-			VM_BUG_ON_PAGE(PageSlab(page_head), page_head);
-			__put_compound_page(page_head);
-		}
-	} else
-		/*
-		 * __split_huge_page_refcount run before us,
-		 * @page was a THP tail. The split @page_head
-		 * has been freed and reallocated as slab or
-		 * hugetlbfs page of smaller order (only
-		 * possible if reallocated as slab on x86).
-		 */
-		if (put_page_testzero(page))
-			__put_single_page(page);
-}
-
-static __always_inline
-void put_refcounted_compound_page(struct page *page_head, struct page *page)
-{
-	if (likely(page != page_head && get_page_unless_zero(page_head))) {
-		unsigned long flags;
-
-		/*
-		 * @page_head wasn't a dangling pointer but it may not
-		 * be a head page anymore by the time we obtain the
-		 * lock. That is ok as long as it can't be freed from
-		 * under us.
-		 */
-		flags = compound_lock_irqsave(page_head);
-		if (unlikely(!PageTail(page))) {
-			/* __split_huge_page_refcount run before us */
-			compound_unlock_irqrestore(page_head, flags);
-			if (put_page_testzero(page_head)) {
-				/*
-				 * The @page_head may have been freed
-				 * and reallocated as a compound page
-				 * of smaller order and then freed
-				 * again. All we know is that it
-				 * cannot have become: a THP page, a
-				 * compound page of higher order, a
-				 * tail page. That is because we
-				 * still hold the refcount of the
-				 * split THP tail and page_head was
-				 * the THP head before the split.
-				 */
-				if (PageHead(page_head))
-					__put_compound_page(page_head);
-				else
-					__put_single_page(page_head);
-			}
-out_put_single:
-			if (put_page_testzero(page))
-				__put_single_page(page);
-			return;
-		}
-		VM_BUG_ON_PAGE(page_head != compound_head(page), page);
-		/*
-		 * We can release the refcount taken by
-		 * get_page_unless_zero() now that
-		 * __split_huge_page_refcount() is blocked on the
-		 * compound_lock.
-		 */
-		if (put_page_testzero(page_head))
-			VM_BUG_ON_PAGE(1, page_head);
-		/* __split_huge_page_refcount will wait now */
-		VM_BUG_ON_PAGE(page_mapcount(page) <= 0, page);
-		atomic_dec(&page->_mapcount);
-		VM_BUG_ON_PAGE(atomic_read(&page_head->_count) <= 0, page_head);
-		VM_BUG_ON_PAGE(atomic_read(&page->_count) != 0, page);
-		compound_unlock_irqrestore(page_head, flags);
-
-		if (put_page_testzero(page_head)) {
-			if (PageHead(page_head))
-				__put_compound_page(page_head);
-			else
-				__put_single_page(page_head);
-		}
-	} else {
-		/* @page_head is a dangling pointer */
-		VM_BUG_ON_PAGE(PageTail(page), page);
-		goto out_put_single;
-	}
-}
-
-static void put_compound_page(struct page *page)
-{
-	struct page *page_head;
-
-	/*
-	 * We see the PageCompound set and PageTail not set, so @page maybe:
-	 * 1. hugetlbfs head page, or
-	 * 2. THP head page.
-	 */
-	if (likely(!PageTail(page))) {
-		if (put_page_testzero(page)) {
-			/*
-			 * By the time all refcounts have been released
-			 * split_huge_page cannot run anymore from under us.
-			 */
-			if (PageHead(page))
-				__put_compound_page(page);
-			else
-				__put_single_page(page);
-		}
-		return;
-	}
-
-	/*
-	 * We see the PageCompound set and PageTail set, so @page maybe:
-	 * 1. a tail hugetlbfs page, or
-	 * 2. a tail THP page, or
-	 * 3. a split THP page.
-	 *
-	 * Case 3 is possible, as we may race with
-	 * __split_huge_page_refcount tearing down a THP page.
-	 */
-	page_head = compound_head(page);
-	if (!__compound_tail_refcounted(page_head))
-		put_unrefcounted_compound_page(page_head, page);
-	else
-		put_refcounted_compound_page(page_head, page);
-}
-
-void put_page(struct page *page)
+void __put_page(struct page *page)
 {
 	if (unlikely(PageCompound(page)))
-		put_compound_page(page);
-	else if (put_page_testzero(page))
+		__put_compound_page(page);
+	else
 		__put_single_page(page);
 }
-EXPORT_SYMBOL(put_page);
-
-/*
- * This function is exported but must not be called by anything other
- * than get_page(). It implements the slow path of get_page().
- */
-bool __get_page_tail(struct page *page)
-{
-	/*
-	 * This takes care of get_page() if run on a tail page
-	 * returned by one of the get_user_pages/follow_page variants.
-	 * get_user_pages/follow_page itself doesn't need the compound
-	 * lock because it runs __get_page_tail_foll() under the
-	 * proper PT lock that already serializes against
-	 * split_huge_page().
-	 */
-	unsigned long flags;
-	bool got;
-	struct page *page_head = compound_head(page);
-
-	/* Ref to put_compound_page() comment. */
-	if (!__compound_tail_refcounted(page_head)) {
-		smp_rmb();
-		if (likely(PageTail(page))) {
-			/*
-			 * This is a hugetlbfs page or a slab
-			 * page. __split_huge_page_refcount
-			 * cannot race here.
-			 */
-			VM_BUG_ON_PAGE(!PageHead(page_head), page_head);
-			__get_page_tail_foll(page, true);
-			return true;
-		} else {
-			/*
-			 * __split_huge_page_refcount run
-			 * before us, "page" was a THP
-			 * tail. The split page_head has been
-			 * freed and reallocated as slab or
-			 * hugetlbfs page of smaller order
-			 * (only possible if reallocated as
-			 * slab on x86).
-			 */
-			return false;
-		}
-	}
-
-	got = false;
-	if (likely(page != page_head && get_page_unless_zero(page_head))) {
-		/*
-		 * page_head wasn't a dangling pointer but it
-		 * may not be a head page anymore by the time
-		 * we obtain the lock. That is ok as long as it
-		 * can't be freed from under us.
-		 */
-		flags = compound_lock_irqsave(page_head);
-		/* here __split_huge_page_refcount won't run anymore */
-		if (likely(PageTail(page))) {
-			__get_page_tail_foll(page, false);
-			got = true;
-		}
-		compound_unlock_irqrestore(page_head, flags);
-		if (unlikely(!got))
-			put_page(page_head);
-	}
-	return got;
-}
-EXPORT_SYMBOL(__get_page_tail);
+EXPORT_SYMBOL(__put_page);
 
 /**
  * put_pages_list() - release a list of pages
@@ -604,6 +360,7 @@ static void __lru_cache_activate_page(struct page *page)
  */
 void mark_page_accessed(struct page *page)
 {
+	page = compound_head(page);
 	if (!PageActive(page) && !PageUnevictable(page) &&
 			PageReferenced(page)) {
 
@@ -799,6 +556,24 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
 	update_page_reclaim_stat(lruvec, file, 0);
 }
 
+
+static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
+			    void *arg)
+{
+	if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
+		int file = page_is_file_cache(page);
+		int lru = page_lru_base_type(page);
+
+		del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE);
+		ClearPageActive(page);
+		ClearPageReferenced(page);
+		add_page_to_lru_list(page, lruvec, lru);
+
+		__count_vm_event(PGDEACTIVATE);
+		update_page_reclaim_stat(lruvec, file, 0);
+	}
+}
+
 /*
  * Drain pages out of the cpu's pagevecs.
  * Either "cpu" is the current CPU, and preemption has already been
@@ -825,6 +600,10 @@ void lru_add_drain_cpu(int cpu)
 	if (pagevec_count(pvec))
 		pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
 
+	pvec = &per_cpu(lru_deactivate_pvecs, cpu);
+	if (pagevec_count(pvec))
+		pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
+
 	activate_page_drain(cpu);
 }
 
@@ -854,6 +633,26 @@ void deactivate_file_page(struct page *page)
 	}
 }
 
+/**
+ * deactivate_page - deactivate a page
+ * @page: page to deactivate
+ *
+ * deactivate_page() moves @page to the inactive list if @page was on the active
+ * list and was not an unevictable page. This is done to accelerate the reclaim
+ * of @page.
+ */
+void deactivate_page(struct page *page)
+{
+	if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
+		struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
+
+		page_cache_get(page);
+		if (!pagevec_add(pvec, page))
+			pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
+		put_cpu_var(lru_deactivate_pvecs);
+	}
+}
+
 void lru_add_drain(void)
 {
 	lru_add_drain_cpu(get_cpu());
@@ -883,6 +682,7 @@ void lru_add_drain_all(void)
 		if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
 		    pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
 		    pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
+		    pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
 		    need_activate_page_drain(cpu)) {
 			INIT_WORK(work, lru_add_drain_per_cpu);
 			schedule_work_on(cpu, work);
@@ -918,15 +718,6 @@ void release_pages(struct page **pages, int nr, bool cold)
 	for (i = 0; i < nr; i++) {
 		struct page *page = pages[i];
 
-		if (unlikely(PageCompound(page))) {
-			if (zone) {
-				spin_unlock_irqrestore(&zone->lru_lock, flags);
-				zone = NULL;
-			}
-			put_compound_page(page);
-			continue;
-		}
-
 		/*
 		 * Make sure the IRQ-safe lock-holding time does not get
 		 * excessive with a continuous string of pages from the
@@ -937,9 +728,19 @@ void release_pages(struct page **pages, int nr, bool cold)
 			zone = NULL;
 		}
 
+		page = compound_head(page);
 		if (!put_page_testzero(page))
 			continue;
 
+		if (PageCompound(page)) {
+			if (zone) {
+				spin_unlock_irqrestore(&zone->lru_lock, flags);
+				zone = NULL;
+			}
+			__put_compound_page(page);
+			continue;
+		}
+
 		if (PageLRU(page)) {
 			struct zone *pagezone = page_zone(page);
 