path: root/mm
author	Lee Schermerhorn <Lee.Schermerhorn@hp.com>	2008-10-18 23:26:39 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2008-10-20 11:50:26 -0400
commit	894bc310419ac95f4fa4142dc364401a7e607f65 (patch)
tree	15d56a7333b41620016b845d2323dd06e822b621 /mm
parent	8a7a8544a4f6554ec2d8048ac9f9672f442db5a2 (diff)
Unevictable LRU Infrastructure
When the system contains lots of mlocked or otherwise unevictable pages, the pageout code (kswapd) can spend lots of time scanning over these pages. Worse still, the presence of lots of unevictable pages can confuse kswapd into thinking that more aggressive pageout modes are required, resulting in all kinds of bad behaviour.

Infrastructure to manage pages excluded from reclaim--i.e., hidden from vmscan. Based on a patch by Larry Woodman of Red Hat. Reworked to maintain "unevictable" pages on a separate per-zone LRU list, to "hide" them from vmscan.

Kosaki Motohiro added the support for the memory controller unevictable lru list.

Pages on the unevictable list have both PG_unevictable and PG_lru set. Thus, PG_unevictable is analogous to and mutually exclusive with PG_active--it specifies which LRU list the page is on.

The unevictable infrastructure is enabled by a new mm Kconfig option [CONFIG_]UNEVICTABLE_LRU.

A new function 'page_evictable(page, vma)' in vmscan.c tests whether or not a page may be evictable. Subsequent patches will add the various !evictable tests. We'll want to keep these tests light-weight for use in shrink_active_list() and, possibly, the fault path.

To avoid races between tasks putting pages [back] onto an LRU list and tasks that might be moving the page from non-evictable to evictable state, the new function 'putback_lru_page()' -- inverse to 'isolate_lru_page()' -- tests the "evictability" of a page after placing it on the LRU, before dropping the reference. If the page has become unevictable, putback_lru_page() will redo the 'putback', thus moving the page to the unevictable list. This way, we avoid "stranding" evictable pages on the unevictable list.

[akpm@linux-foundation.org: fix fallout from out-of-order merge]
[riel@redhat.com: fix UNEVICTABLE_LRU and !PROC_PAGE_MONITOR build]
[nishimura@mxp.nes.nec.co.jp: remove redundant mapping check]
[kosaki.motohiro@jp.fujitsu.com: unevictable-lru-infrastructure: putback_lru_page()/unevictable page handling rework]
[kosaki.motohiro@jp.fujitsu.com: kill unnecessary lock_page() in vmscan.c]
[kosaki.motohiro@jp.fujitsu.com: revert migration change of unevictable lru infrastructure]
[kosaki.motohiro@jp.fujitsu.com: revert to unevictable-lru-infrastructure-kconfig-fix.patch]
[kosaki.motohiro@jp.fujitsu.com: restore patch failure of vmstat-unevictable-and-mlocked-pages-vm-events.patch]
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Debugged-by: Benjamin Kidwell <benjkidwell@yahoo.com>
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
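For orientation, the isolate/putback pattern this commit is built around looks roughly like the sketch below. The caller shown is hypothetical and only illustrative; isolate_lru_page(), putback_lru_page() and page_evictable() are the functions added or reworked by this patch, and the re-check inside putback_lru_page() is what prevents evictable pages from being stranded on the unevictable list.

/*
 * Hypothetical caller, for illustration only (not part of this patch).
 * isolate_lru_page() returns 0 on success and -EBUSY if the page was not
 * on an LRU list; putback_lru_page() is its inverse and re-checks
 * page_evictable() after the page is back on a list, so a page whose
 * evictability changed while it was isolated still ends up on the
 * correct LRU.
 */
static void example_requeue_page(struct page *page)
{
	if (isolate_lru_page(page))
		return;		/* not on an LRU list; nothing to do */

	/* ... operate on the page while it is off the LRU lists ... */

	putback_lru_page(page);	/* drops the reference taken by isolate */
}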
Diffstat (limited to 'mm')
-rw-r--r--	mm/Kconfig	11
-rw-r--r--	mm/internal.h	26
-rw-r--r--	mm/memcontrol.c	73
-rw-r--r--	mm/mempolicy.c	2
-rw-r--r--	mm/migrate.c	31
-rw-r--r--	mm/swap.c	42
-rw-r--r--	mm/vmscan.c	149
7 files changed, 272 insertions, 62 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 1a501a4de95c..5b5790f8a816 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -209,5 +209,16 @@ config VIRT_TO_BUS
 	def_bool y
 	depends on !ARCH_NO_VIRT_TO_BUS
 
+config UNEVICTABLE_LRU
+	bool "Add LRU list to track non-evictable pages"
+	default y
+	depends on MMU
+	help
+	  Keeps unevictable pages off of the active and inactive pageout
+	  lists, so kswapd will not waste CPU time or have its balancing
+	  algorithms thrown off by scanning these pages. Selecting this
+	  will use one page flag and increase the code size a little,
+	  say Y unless you know what you are doing.
+
 config MMU_NOTIFIER
 	bool
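For reference, the UNEVICTABLE_LRU entry added above defaults to y on MMU systems, so in a generated kernel configuration it would typically appear as the following line (illustrative only):

CONFIG_UNEVICTABLE_LRU=y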
diff --git a/mm/internal.h b/mm/internal.h
index 4e8e78b978b5..3db17b2a1ac6 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -39,8 +39,15 @@ static inline void __put_page(struct page *page)
 	atomic_dec(&page->_count);
 }
 
+/*
+ * in mm/vmscan.c:
+ */
 extern int isolate_lru_page(struct page *page);
+extern void putback_lru_page(struct page *page);
 
+/*
+ * in mm/page_alloc.c
+ */
 extern void __free_pages_bootmem(struct page *page, unsigned int order);
 
 /*
@@ -54,6 +61,25 @@ static inline unsigned long page_order(struct page *page)
 	return page_private(page);
 }
 
+#ifdef CONFIG_UNEVICTABLE_LRU
+/*
+ * unevictable_migrate_page() called only from migrate_page_copy() to
+ * migrate unevictable flag to new page.
+ * Note that the old page has been isolated from the LRU lists at this
+ * point so we don't need to worry about LRU statistics.
+ */
+static inline void unevictable_migrate_page(struct page *new, struct page *old)
+{
+	if (TestClearPageUnevictable(old))
+		SetPageUnevictable(new);
+}
+#else
+static inline void unevictable_migrate_page(struct page *new, struct page *old)
+{
+}
+#endif
+
+
 /*
  * FLATMEM and DISCONTIGMEM configurations use alloc_bootmem_node,
  * so all functions starting at paging_init should be marked __init
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 27e9e75f4eab..82c065e7551e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -160,9 +160,10 @@ struct page_cgroup {
 	struct mem_cgroup *mem_cgroup;
 	int flags;
 };
 #define PAGE_CGROUP_FLAG_CACHE	(0x1)	/* charged as cache */
 #define PAGE_CGROUP_FLAG_ACTIVE	(0x2)	/* page is active in this cgroup */
 #define PAGE_CGROUP_FLAG_FILE	(0x4)	/* page is file system backed */
+#define PAGE_CGROUP_FLAG_UNEVICTABLE (0x8)	/* page is unevictableable */
 
 static int page_cgroup_nid(struct page_cgroup *pc)
 {
@@ -292,10 +293,14 @@ static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
 {
 	int lru = LRU_BASE;
 
-	if (pc->flags & PAGE_CGROUP_FLAG_ACTIVE)
-		lru += LRU_ACTIVE;
-	if (pc->flags & PAGE_CGROUP_FLAG_FILE)
-		lru += LRU_FILE;
+	if (pc->flags & PAGE_CGROUP_FLAG_UNEVICTABLE)
+		lru = LRU_UNEVICTABLE;
+	else {
+		if (pc->flags & PAGE_CGROUP_FLAG_ACTIVE)
+			lru += LRU_ACTIVE;
+		if (pc->flags & PAGE_CGROUP_FLAG_FILE)
+			lru += LRU_FILE;
+	}
 
 	MEM_CGROUP_ZSTAT(mz, lru) -= 1;
 
@@ -308,10 +313,14 @@ static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz,
 {
 	int lru = LRU_BASE;
 
-	if (pc->flags & PAGE_CGROUP_FLAG_ACTIVE)
-		lru += LRU_ACTIVE;
-	if (pc->flags & PAGE_CGROUP_FLAG_FILE)
-		lru += LRU_FILE;
+	if (pc->flags & PAGE_CGROUP_FLAG_UNEVICTABLE)
+		lru = LRU_UNEVICTABLE;
+	else {
+		if (pc->flags & PAGE_CGROUP_FLAG_ACTIVE)
+			lru += LRU_ACTIVE;
+		if (pc->flags & PAGE_CGROUP_FLAG_FILE)
+			lru += LRU_FILE;
+	}
 
 	MEM_CGROUP_ZSTAT(mz, lru) += 1;
 	list_add(&pc->lru, &mz->lists[lru]);
@@ -319,21 +328,31 @@ static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz,
 	mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, true);
 }
 
-static void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
+static void __mem_cgroup_move_lists(struct page_cgroup *pc, enum lru_list lru)
 {
 	struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);
-	int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
+	int active = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
 	int file = pc->flags & PAGE_CGROUP_FLAG_FILE;
-	int lru = LRU_FILE * !!file + !!from;
+	int unevictable = pc->flags & PAGE_CGROUP_FLAG_UNEVICTABLE;
+	enum lru_list from = unevictable ? LRU_UNEVICTABLE :
+				(LRU_FILE * !!file + !!active);
 
-	MEM_CGROUP_ZSTAT(mz, lru) -= 1;
+	if (lru == from)
+		return;
 
-	if (active)
-		pc->flags |= PAGE_CGROUP_FLAG_ACTIVE;
-	else
+	MEM_CGROUP_ZSTAT(mz, from) -= 1;
+
+	if (is_unevictable_lru(lru)) {
 		pc->flags &= ~PAGE_CGROUP_FLAG_ACTIVE;
+		pc->flags |= PAGE_CGROUP_FLAG_UNEVICTABLE;
+	} else {
+		if (is_active_lru(lru))
+			pc->flags |= PAGE_CGROUP_FLAG_ACTIVE;
+		else
+			pc->flags &= ~PAGE_CGROUP_FLAG_ACTIVE;
+		pc->flags &= ~PAGE_CGROUP_FLAG_UNEVICTABLE;
+	}
 
-	lru = LRU_FILE * !!file + !!active;
 	MEM_CGROUP_ZSTAT(mz, lru) += 1;
 	list_move(&pc->lru, &mz->lists[lru]);
 }
@@ -351,7 +370,7 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
 /*
  * This routine assumes that the appropriate zone's lru lock is already held
  */
-void mem_cgroup_move_lists(struct page *page, bool active)
+void mem_cgroup_move_lists(struct page *page, enum lru_list lru)
 {
 	struct page_cgroup *pc;
 	struct mem_cgroup_per_zone *mz;
@@ -374,7 +393,7 @@ void mem_cgroup_move_lists(struct page *page, bool active)
 	if (pc) {
 		mz = page_cgroup_zoneinfo(pc);
 		spin_lock_irqsave(&mz->lru_lock, flags);
-		__mem_cgroup_move_lists(pc, active);
+		__mem_cgroup_move_lists(pc, lru);
 		spin_unlock_irqrestore(&mz->lru_lock, flags);
 	}
 	unlock_page_cgroup(page);
@@ -472,12 +491,10 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
 		/*
 		 * TODO: play better with lumpy reclaim, grabbing anything.
 		 */
-		if (PageActive(page) && !active) {
-			__mem_cgroup_move_lists(pc, true);
-			continue;
-		}
-		if (!PageActive(page) && active) {
-			__mem_cgroup_move_lists(pc, false);
+		if (PageUnevictable(page) ||
+		    (PageActive(page) && !active) ||
+		    (!PageActive(page) && active)) {
+			__mem_cgroup_move_lists(pc, page_lru(page));
 			continue;
 		}
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 71b47491487d..36f42573a335 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2202,7 +2202,7 @@ static void gather_stats(struct page *page, void *private, int pte_dirty)
 	if (PageSwapCache(page))
 		md->swapcache++;
 
-	if (PageActive(page))
+	if (PageActive(page) || PageUnevictable(page))
 		md->active++;
 
 	if (PageWriteback(page))
diff --git a/mm/migrate.c b/mm/migrate.c
index c07327487111..b10237d8b459 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -53,14 +53,9 @@ int migrate_prep(void)
 	return 0;
 }
 
-static inline void move_to_lru(struct page *page)
-{
-	lru_cache_add_lru(page, page_lru(page));
-	put_page(page);
-}
-
 /*
- * Add isolated pages on the list back to the LRU.
+ * Add isolated pages on the list back to the LRU under page lock
+ * to avoid leaking evictable pages back onto unevictable list.
  *
  * returns the number of pages put back.
  */
@@ -72,7 +67,7 @@ int putback_lru_pages(struct list_head *l)
 
 	list_for_each_entry_safe(page, page2, l, lru) {
 		list_del(&page->lru);
-		move_to_lru(page);
+		putback_lru_page(page);
 		count++;
 	}
 	return count;
@@ -354,8 +349,11 @@ static void migrate_page_copy(struct page *newpage, struct page *page)
 	SetPageReferenced(newpage);
 	if (PageUptodate(page))
 		SetPageUptodate(newpage);
-	if (PageActive(page))
+	if (TestClearPageActive(page)) {
+		VM_BUG_ON(PageUnevictable(page));
 		SetPageActive(newpage);
+	} else
+		unevictable_migrate_page(newpage, page);
 	if (PageChecked(page))
 		SetPageChecked(newpage);
 	if (PageMappedToDisk(page))
@@ -376,7 +374,6 @@ static void migrate_page_copy(struct page *newpage, struct page *page)
 #ifdef CONFIG_SWAP
 	ClearPageSwapCache(page);
 #endif
-	ClearPageActive(page);
 	ClearPagePrivate(page);
 	set_page_private(page, 0);
 	page->mapping = NULL;
@@ -555,6 +552,10 @@ static int fallback_migrate_page(struct address_space *mapping,
  *
  * The new page will have replaced the old page if this function
  * is successful.
+ *
+ * Return value:
+ *   < 0 - error code
+ *  == 0 - success
  */
 static int move_to_new_page(struct page *newpage, struct page *page)
 {
@@ -617,9 +618,10 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
 	if (!newpage)
 		return -ENOMEM;
 
-	if (page_count(page) == 1)
+	if (page_count(page) == 1) {
 		/* page was freed from under us. So we are done. */
 		goto move_newpage;
+	}
 
 	charge = mem_cgroup_prepare_migration(page, newpage);
 	if (charge == -ENOMEM) {
@@ -693,7 +695,6 @@ rcu_unlock:
 	rcu_read_unlock();
 
 unlock:
-
 	unlock_page(page);
 
 	if (rc != -EAGAIN) {
@@ -704,17 +705,19 @@ unlock:
 		 * restored.
 		 */
 		list_del(&page->lru);
-		move_to_lru(page);
+		putback_lru_page(page);
 	}
 
 move_newpage:
 	if (!charge)
 		mem_cgroup_end_migration(newpage);
+
 	/*
 	 * Move the new page to the LRU. If migration was not successful
 	 * then this will free the page.
 	 */
-	move_to_lru(newpage);
+	putback_lru_page(newpage);
+
 	if (result) {
 		if (rc)
 			*result = rc;
diff --git a/mm/swap.c b/mm/swap.c
index 0b1974a08974..fee6b973f143 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -115,7 +115,7 @@ static void pagevec_move_tail(struct pagevec *pvec)
 			zone = pagezone;
 			spin_lock(&zone->lru_lock);
 		}
-		if (PageLRU(page) && !PageActive(page)) {
+		if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
 			int lru = page_is_file_cache(page);
 			list_move_tail(&page->lru, &zone->lru[lru].list);
 			pgmoved++;
@@ -136,7 +136,7 @@ static void pagevec_move_tail(struct pagevec *pvec)
 void rotate_reclaimable_page(struct page *page)
 {
 	if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) &&
-	    PageLRU(page)) {
+	    !PageUnevictable(page) && PageLRU(page)) {
 		struct pagevec *pvec;
 		unsigned long flags;
 
@@ -157,7 +157,7 @@ void activate_page(struct page *page)
 	struct zone *zone = page_zone(page);
 
 	spin_lock_irq(&zone->lru_lock);
-	if (PageLRU(page) && !PageActive(page)) {
+	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
 		int file = page_is_file_cache(page);
 		int lru = LRU_BASE + file;
 		del_page_from_lru_list(zone, page, lru);
@@ -166,7 +166,7 @@ void activate_page(struct page *page)
 		lru += LRU_ACTIVE;
 		add_page_to_lru_list(zone, page, lru);
 		__count_vm_event(PGACTIVATE);
-		mem_cgroup_move_lists(page, true);
+		mem_cgroup_move_lists(page, lru);
 
 		zone->recent_rotated[!!file]++;
 		zone->recent_scanned[!!file]++;
@@ -183,7 +183,8 @@ void activate_page(struct page *page)
  */
 void mark_page_accessed(struct page *page)
 {
-	if (!PageActive(page) && PageReferenced(page) && PageLRU(page)) {
+	if (!PageActive(page) && !PageUnevictable(page) &&
+			PageReferenced(page) && PageLRU(page)) {
 		activate_page(page);
 		ClearPageReferenced(page);
 	} else if (!PageReferenced(page)) {
@@ -211,13 +212,38 @@ void __lru_cache_add(struct page *page, enum lru_list lru)
 void lru_cache_add_lru(struct page *page, enum lru_list lru)
 {
 	if (PageActive(page)) {
+		VM_BUG_ON(PageUnevictable(page));
 		ClearPageActive(page);
+	} else if (PageUnevictable(page)) {
+		VM_BUG_ON(PageActive(page));
+		ClearPageUnevictable(page);
 	}
 
-	VM_BUG_ON(PageLRU(page) || PageActive(page));
+	VM_BUG_ON(PageLRU(page) || PageActive(page) || PageUnevictable(page));
 	__lru_cache_add(page, lru);
 }
 
+/**
+ * add_page_to_unevictable_list - add a page to the unevictable list
+ * @page:  the page to be added to the unevictable list
+ *
+ * Add page directly to its zone's unevictable list.  To avoid races with
+ * tasks that might be making the page evictable, through eg. munlock,
+ * munmap or exit, while it's not on the lru, we want to add the page
+ * while it's locked or otherwise "invisible" to other tasks.  This is
+ * difficult to do when using the pagevec cache, so bypass that.
+ */
+void add_page_to_unevictable_list(struct page *page)
+{
+	struct zone *zone = page_zone(page);
+
+	spin_lock_irq(&zone->lru_lock);
+	SetPageUnevictable(page);
+	SetPageLRU(page);
+	add_page_to_lru_list(zone, page, LRU_UNEVICTABLE);
+	spin_unlock_irq(&zone->lru_lock);
+}
+
 /*
  * Drain pages out of the cpu's pagevecs.
  * Either "cpu" is the current CPU, and preemption has already been
@@ -316,6 +342,7 @@ void release_pages(struct page **pages, int nr, int cold)
 
 		if (PageLRU(page)) {
 			struct zone *pagezone = page_zone(page);
+
 			if (pagezone != zone) {
 				if (zone)
 					spin_unlock_irqrestore(&zone->lru_lock,
@@ -392,6 +419,7 @@ void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru)
 {
 	int i;
 	struct zone *zone = NULL;
+	VM_BUG_ON(is_unevictable_lru(lru));
 
 	for (i = 0; i < pagevec_count(pvec); i++) {
 		struct page *page = pvec->pages[i];
@@ -403,6 +431,8 @@ void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru)
 			zone = pagezone;
 			spin_lock_irq(&zone->lru_lock);
 		}
+		VM_BUG_ON(PageActive(page));
+		VM_BUG_ON(PageUnevictable(page));
 		VM_BUG_ON(PageLRU(page));
 		SetPageLRU(page);
 		if (is_active_lru(lru))
diff --git a/mm/vmscan.c b/mm/vmscan.c
index a8347b677e74..154b9b608da6 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -470,6 +470,79 @@ int remove_mapping(struct address_space *mapping, struct page *page)
 	return 0;
 }
 
+/**
+ * putback_lru_page - put previously isolated page onto appropriate LRU list
+ * @page: page to be put back to appropriate lru list
+ *
+ * Add previously isolated @page to appropriate LRU list.
+ * Page may still be unevictable for other reasons.
+ *
+ * lru_lock must not be held, interrupts must be enabled.
+ */
+#ifdef CONFIG_UNEVICTABLE_LRU
+void putback_lru_page(struct page *page)
+{
+	int lru;
+	int active = !!TestClearPageActive(page);
+
+	VM_BUG_ON(PageLRU(page));
+
+redo:
+	ClearPageUnevictable(page);
+
+	if (page_evictable(page, NULL)) {
+		/*
+		 * For evictable pages, we can use the cache.
+		 * In event of a race, worst case is we end up with an
+		 * unevictable page on [in]active list.
+		 * We know how to handle that.
+		 */
+		lru = active + page_is_file_cache(page);
+		lru_cache_add_lru(page, lru);
+	} else {
+		/*
+		 * Put unevictable pages directly on zone's unevictable
+		 * list.
+		 */
+		lru = LRU_UNEVICTABLE;
+		add_page_to_unevictable_list(page);
+	}
+	mem_cgroup_move_lists(page, lru);
+
+	/*
+	 * page's status can change while we move it among lru. If an evictable
+	 * page is on unevictable list, it never be freed. To avoid that,
+	 * check after we added it to the list, again.
+	 */
+	if (lru == LRU_UNEVICTABLE && page_evictable(page, NULL)) {
+		if (!isolate_lru_page(page)) {
+			put_page(page);
+			goto redo;
+		}
+		/* This means someone else dropped this page from LRU
+		 * So, it will be freed or putback to LRU again. There is
+		 * nothing to do here.
+		 */
+	}
+
+	put_page(page);		/* drop ref from isolate */
+}
+
+#else /* CONFIG_UNEVICTABLE_LRU */
+
+void putback_lru_page(struct page *page)
+{
+	int lru;
+	VM_BUG_ON(PageLRU(page));
+
+	lru = !!TestClearPageActive(page) + page_is_file_cache(page);
+	lru_cache_add_lru(page, lru);
+	mem_cgroup_move_lists(page, lru);
+	put_page(page);
+}
+#endif /* CONFIG_UNEVICTABLE_LRU */
+
+
 /*
  * shrink_page_list() returns the number of reclaimed pages
  */
@@ -503,6 +576,12 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 
 		sc->nr_scanned++;
 
+		if (unlikely(!page_evictable(page, NULL))) {
+			unlock_page(page);
+			putback_lru_page(page);
+			continue;
+		}
+
 		if (!sc->may_swap && page_mapped(page))
 			goto keep_locked;
 
@@ -602,7 +681,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		 * possible for a page to have PageDirty set, but it is actually
 		 * clean (all its buffers are clean).  This happens if the
 		 * buffers were written out directly, with submit_bh(). ext3
-		 * will do this, as well as the blockdev mapping. 
+		 * will do this, as well as the blockdev mapping.
 		 * try_to_release_page() will discover that cleanness and will
 		 * drop the buffers and mark the page clean - it can be freed.
 		 *
@@ -650,6 +729,7 @@ activate_locked:
 		/* Not a candidate for swapping, so reclaim swap space. */
 		if (PageSwapCache(page) && vm_swap_full())
 			remove_exclusive_swap_page_ref(page);
+		VM_BUG_ON(PageActive(page));
 		SetPageActive(page);
 		pgactivate++;
 keep_locked:
@@ -699,6 +779,14 @@ int __isolate_lru_page(struct page *page, int mode, int file)
 	if (mode != ISOLATE_BOTH && (!page_is_file_cache(page) != !file))
 		return ret;
 
+	/*
+	 * When this function is being called for lumpy reclaim, we
+	 * initially look into all LRU pages, active, inactive and
+	 * unevictable; only give shrink_page_list evictable pages.
+	 */
+	if (PageUnevictable(page))
+		return ret;
+
 	ret = -EBUSY;
 	if (likely(get_page_unless_zero(page))) {
 		/*
@@ -810,7 +898,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 				/* else it is being freed elsewhere */
 				list_move(&cursor_page->lru, src);
 			default:
-				break;
+				break;	/* ! on LRU or wrong list */
 			}
 		}
 	}
@@ -870,8 +958,9 @@ static unsigned long clear_active_flags(struct list_head *page_list,
  * Returns -EBUSY if the page was not on an LRU list.
  *
  * The returned page will have PageLRU() cleared.  If it was found on
- * the active list, it will have PageActive set.  That flag may need
- * to be cleared by the caller before letting the page go.
+ * the active list, it will have PageActive set.  If it was found on
+ * the unevictable list, it will have the PageUnevictable bit set. That flag
+ * may need to be cleared by the caller before letting the page go.
  *
  * The vmstat statistic corresponding to the list on which the page was
  * found will be decremented.
@@ -892,11 +981,10 @@ int isolate_lru_page(struct page *page)
 
 	spin_lock_irq(&zone->lru_lock);
 	if (PageLRU(page) && get_page_unless_zero(page)) {
-		int lru = LRU_BASE;
+		int lru = page_lru(page);
 		ret = 0;
 		ClearPageLRU(page);
 
-		lru += page_is_file_cache(page) + !!PageActive(page);
 		del_page_from_lru_list(zone, page, lru);
 	}
 	spin_unlock_irq(&zone->lru_lock);
@@ -1008,11 +1096,20 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
 	 * Put back any unfreeable pages.
 	 */
 	while (!list_empty(&page_list)) {
+		int lru;
 		page = lru_to_page(&page_list);
 		VM_BUG_ON(PageLRU(page));
-		SetPageLRU(page);
 		list_del(&page->lru);
-		add_page_to_lru_list(zone, page, page_lru(page));
+		if (unlikely(!page_evictable(page, NULL))) {
+			spin_unlock_irq(&zone->lru_lock);
+			putback_lru_page(page);
+			spin_lock_irq(&zone->lru_lock);
+			continue;
+		}
+		SetPageLRU(page);
+		lru = page_lru(page);
+		add_page_to_lru_list(zone, page, lru);
+		mem_cgroup_move_lists(page, lru);
 		if (PageActive(page) && scan_global_lru(sc)) {
 			int file = !!page_is_file_cache(page);
 			zone->recent_rotated[file]++;
@@ -1107,6 +1204,11 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 		page = lru_to_page(&l_hold);
 		list_del(&page->lru);
 
+		if (unlikely(!page_evictable(page, NULL))) {
+			putback_lru_page(page);
+			continue;
+		}
+
 		/* page_referenced clears PageReferenced */
 		if (page_mapping_inuse(page) &&
 		    page_referenced(page, 0, sc->mem_cgroup))
@@ -1140,7 +1242,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 		ClearPageActive(page);
 
 		list_move(&page->lru, &zone->lru[lru].list);
-		mem_cgroup_move_lists(page, false);
+		mem_cgroup_move_lists(page, lru);
 		pgmoved++;
 		if (!pagevec_add(&pvec, page)) {
 			__mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
@@ -1286,7 +1388,7 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
 
 	get_scan_ratio(zone, sc, percent);
 
-	for_each_lru(l) {
+	for_each_evictable_lru(l) {
 		if (scan_global_lru(sc)) {
 			int file = is_file_lru(l);
 			int scan;
@@ -1318,7 +1420,7 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
 
 	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
 					nr[LRU_INACTIVE_FILE]) {
-		for_each_lru(l) {
+		for_each_evictable_lru(l) {
 			if (nr[l]) {
 				nr_to_scan = min(nr[l],
 					(unsigned long)sc->swap_cluster_max);
@@ -1875,8 +1977,8 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
 		if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY)
 			continue;
 
-		for_each_lru(l) {
-			/* For pass = 0 we don't shrink the active list */
+		for_each_evictable_lru(l) {
+			/* For pass = 0, we don't shrink the active list */
 			if (pass == 0 &&
 				(l == LRU_ACTIVE || l == LRU_ACTIVE_FILE))
 				continue;
@@ -2213,3 +2315,24 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	return ret;
 }
 #endif
+
+#ifdef CONFIG_UNEVICTABLE_LRU
+/*
+ * page_evictable - test whether a page is evictable
+ * @page: the page to test
+ * @vma: the VMA in which the page is or will be mapped, may be NULL
+ *
+ * Test whether page is evictable--i.e., should be placed on active/inactive
+ * lists vs unevictable list.
+ *
+ * Reasons page might not be evictable:
+ * TODO - later patches
+ */
+int page_evictable(struct page *page, struct vm_area_struct *vma)
+{
+
+	/* TODO:  test page [!]evictable conditions */
+
+	return 1;
+}
+#endif
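page_evictable() is deliberately a stub at this point in the series ("Subsequent patches will add the various !evictable tests"). As a rough, assumption-laden sketch of the shape those checks take once the follow-up patches land -- the helpers mapping_unevictable() and PageMlocked() are introduced by those later patches, not by this commit:

/*
 * Illustrative sketch only -- the real tests are added by subsequent
 * patches in the unevictable-LRU series; the helpers referenced below
 * come from those later patches.
 */
int page_evictable(struct page *page, struct vm_area_struct *vma)
{
	/* pages of an unevictable address_space, e.g. SHM_LOCK'd shmem */
	if (mapping_unevictable(page_mapping(page)))
		return 0;

	/* mlocked pages are not evictable */
	if (PageMlocked(page))
		return 0;

	return 1;
}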