path: root/mm
author     Rik van Riel <riel@redhat.com>                    2008-10-18 23:26:32 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>    2008-10-20 11:50:25 -0400
commit     4f98a2fee8acdb4ac84545df98cccecfd130f8db (patch)
tree       035a2937f4c3e2f7b4269412041c073ac646937c /mm
parent     b2e185384f534781fd22f5ce170b2ad26f97df70 (diff)
vmscan: split LRU lists into anon & file sets
Split the LRU lists in two, one set for pages that are backed by real file systems ("file") and one for pages that are backed by memory and swap ("anon"). The latter includes tmpfs.

The advantage of doing this is that the VM will not have to scan over lots of anonymous pages (which we generally do not want to swap out), just to find the page cache pages that it should evict.

This patch has the infrastructure and a basic policy to balance how much we scan the anon lists and how much we scan the file lists. The big policy changes are in separate patches.

[lee.schermerhorn@hp.com: collect lru meminfo statistics from correct offset]
[kosaki.motohiro@jp.fujitsu.com: prevent incorrect oom under split_lru]
[kosaki.motohiro@jp.fujitsu.com: fix pagevec_move_tail() doesn't treat unevictable page]
[hugh@veritas.com: memcg swapbacked pages active]
[hugh@veritas.com: splitlru: BDI_CAP_SWAP_BACKED]
[akpm@linux-foundation.org: fix /proc/vmstat units]
[nishimura@mxp.nes.nec.co.jp: memcg: fix handling of shmem migration]
[kosaki.motohiro@jp.fujitsu.com: adjust Quicklists field of /proc/meminfo]
[kosaki.motohiro@jp.fujitsu.com: fix style issue of get_scan_ratio()]
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
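For orientation before the diff: the series replaces the single per-zone active/inactive LRU pair with four lists, split along two axes, anon vs file and inactive vs active. The enum that defines them lives in include/linux/mmzone.h and mm_inline.h, outside this mm/-only diffstat, so the sketch below only illustrates the layout the mm/ code relies on; the names and values are assumptions based on how the diff uses them, not a copy of the real header.

/*
 * Illustrative sketch only -- the authoritative definitions are added
 * elsewhere in this patch series (include/linux/mmzone.h).
 */
enum lru_list_sketch {
        LRU_INACTIVE_ANON = 0,  /* swap backed: anonymous, tmpfs/shmem */
        LRU_ACTIVE_ANON   = 1,
        LRU_INACTIVE_FILE = 2,  /* backed by a real file system */
        LRU_ACTIVE_FILE   = 3,
        NR_LRU_LISTS_SKETCH = 4
};

/* Pages marked PageSwapBacked go on the anon lists, everything else on file. */
static inline int goes_on_file_lru(int page_swap_backed)
{
        return !page_swap_backed;
}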
Diffstat (limited to 'mm')
-rw-r--r--  mm/filemap.c         |  22
-rw-r--r--  mm/hugetlb.c         |  10
-rw-r--r--  mm/memcontrol.c      |  88
-rw-r--r--  mm/memory.c          |   6
-rw-r--r--  mm/page-writeback.c  |   8
-rw-r--r--  mm/page_alloc.c      |  25
-rw-r--r--  mm/readahead.c       |   2
-rw-r--r--  mm/shmem.c           |   2
-rw-r--r--  mm/swap.c            |  14
-rw-r--r--  mm/swap_state.c      |   4
-rw-r--r--  mm/vmscan.c          | 416
-rw-r--r--  mm/vmstat.c          |  14
12 files changed, 337 insertions, 274 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index 903bf316912a..a1ddd2557af2 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -33,6 +33,7 @@
33#include <linux/cpuset.h> 33#include <linux/cpuset.h>
34#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ 34#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
35#include <linux/memcontrol.h> 35#include <linux/memcontrol.h>
36#include <linux/mm_inline.h> /* for page_is_file_cache() */
36#include "internal.h" 37#include "internal.h"
37 38
38/* 39/*
@@ -492,9 +493,24 @@ EXPORT_SYMBOL(add_to_page_cache_locked);
492int add_to_page_cache_lru(struct page *page, struct address_space *mapping, 493int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
493 pgoff_t offset, gfp_t gfp_mask) 494 pgoff_t offset, gfp_t gfp_mask)
494{ 495{
495 int ret = add_to_page_cache(page, mapping, offset, gfp_mask); 496 int ret;
496 if (ret == 0) 497
497 lru_cache_add(page); 498 /*
499 * Splice_read and readahead add shmem/tmpfs pages into the page cache
500 * before shmem_readpage has a chance to mark them as SwapBacked: they
501 * need to go on the active_anon lru below, and mem_cgroup_cache_charge
502 * (called in add_to_page_cache) needs to know where they're going too.
503 */
504 if (mapping_cap_swap_backed(mapping))
505 SetPageSwapBacked(page);
506
507 ret = add_to_page_cache(page, mapping, offset, gfp_mask);
508 if (ret == 0) {
509 if (page_is_file_cache(page))
510 lru_cache_add_file(page);
511 else
512 lru_cache_add_active_anon(page);
513 }
498 return ret; 514 return ret;
499} 515}
500 516
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 38633864a93e..2fc7fddd9b1f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1459,11 +1459,11 @@ int hugetlb_report_meminfo(char *buf)
1459{ 1459{
1460 struct hstate *h = &default_hstate; 1460 struct hstate *h = &default_hstate;
1461 return sprintf(buf, 1461 return sprintf(buf,
1462 "HugePages_Total: %5lu\n" 1462 "HugePages_Total: %5lu\n"
1463 "HugePages_Free: %5lu\n" 1463 "HugePages_Free: %5lu\n"
1464 "HugePages_Rsvd: %5lu\n" 1464 "HugePages_Rsvd: %5lu\n"
1465 "HugePages_Surp: %5lu\n" 1465 "HugePages_Surp: %5lu\n"
1466 "Hugepagesize: %5lu kB\n", 1466 "Hugepagesize: %8lu kB\n",
1467 h->nr_huge_pages, 1467 h->nr_huge_pages,
1468 h->free_huge_pages, 1468 h->free_huge_pages,
1469 h->resv_huge_pages, 1469 h->resv_huge_pages,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c0cbd7790c51..27e9e75f4eab 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -162,6 +162,7 @@ struct page_cgroup {
162}; 162};
163#define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */ 163#define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */
164#define PAGE_CGROUP_FLAG_ACTIVE (0x2) /* page is active in this cgroup */ 164#define PAGE_CGROUP_FLAG_ACTIVE (0x2) /* page is active in this cgroup */
165#define PAGE_CGROUP_FLAG_FILE (0x4) /* page is file system backed */
165 166
166static int page_cgroup_nid(struct page_cgroup *pc) 167static int page_cgroup_nid(struct page_cgroup *pc)
167{ 168{
@@ -177,6 +178,7 @@ enum charge_type {
177 MEM_CGROUP_CHARGE_TYPE_CACHE = 0, 178 MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
178 MEM_CGROUP_CHARGE_TYPE_MAPPED, 179 MEM_CGROUP_CHARGE_TYPE_MAPPED,
179 MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ 180 MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */
181 MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */
180}; 182};
181 183
182/* 184/*
@@ -288,8 +290,12 @@ static void unlock_page_cgroup(struct page *page)
288static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz, 290static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
289 struct page_cgroup *pc) 291 struct page_cgroup *pc)
290{ 292{
291 int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE; 293 int lru = LRU_BASE;
292 int lru = !!from; 294
295 if (pc->flags & PAGE_CGROUP_FLAG_ACTIVE)
296 lru += LRU_ACTIVE;
297 if (pc->flags & PAGE_CGROUP_FLAG_FILE)
298 lru += LRU_FILE;
293 299
294 MEM_CGROUP_ZSTAT(mz, lru) -= 1; 300 MEM_CGROUP_ZSTAT(mz, lru) -= 1;
295 301
@@ -300,10 +306,12 @@ static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
300static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz, 306static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz,
301 struct page_cgroup *pc) 307 struct page_cgroup *pc)
302{ 308{
303 int lru = LRU_INACTIVE; 309 int lru = LRU_BASE;
304 310
305 if (pc->flags & PAGE_CGROUP_FLAG_ACTIVE) 311 if (pc->flags & PAGE_CGROUP_FLAG_ACTIVE)
306 lru += LRU_ACTIVE; 312 lru += LRU_ACTIVE;
313 if (pc->flags & PAGE_CGROUP_FLAG_FILE)
314 lru += LRU_FILE;
307 315
308 MEM_CGROUP_ZSTAT(mz, lru) += 1; 316 MEM_CGROUP_ZSTAT(mz, lru) += 1;
309 list_add(&pc->lru, &mz->lists[lru]); 317 list_add(&pc->lru, &mz->lists[lru]);
@@ -314,10 +322,9 @@ static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz,
314static void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active) 322static void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
315{ 323{
316 struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc); 324 struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);
317 int lru = LRU_INACTIVE; 325 int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
318 326 int file = pc->flags & PAGE_CGROUP_FLAG_FILE;
319 if (pc->flags & PAGE_CGROUP_FLAG_ACTIVE) 327 int lru = LRU_FILE * !!file + !!from;
320 lru += LRU_ACTIVE;
321 328
322 MEM_CGROUP_ZSTAT(mz, lru) -= 1; 329 MEM_CGROUP_ZSTAT(mz, lru) -= 1;
323 330
@@ -326,7 +333,7 @@ static void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
326 else 333 else
327 pc->flags &= ~PAGE_CGROUP_FLAG_ACTIVE; 334 pc->flags &= ~PAGE_CGROUP_FLAG_ACTIVE;
328 335
329 lru = !!active; 336 lru = LRU_FILE * !!file + !!active;
330 MEM_CGROUP_ZSTAT(mz, lru) += 1; 337 MEM_CGROUP_ZSTAT(mz, lru) += 1;
331 list_move(&pc->lru, &mz->lists[lru]); 338 list_move(&pc->lru, &mz->lists[lru]);
332} 339}
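The LRU_FILE * !!file + !!active idiom used in this hunk (and again in mem_cgroup_isolate_pages and isolate_pages_global below) folds the two booleans into one list index. A minimal user-space check of the mapping, assuming LRU_BASE = 0, LRU_ACTIVE = 1 and LRU_FILE = 2 as introduced by the companion mmzone.h change (not part of this mm/ diffstat):

#include <stdio.h>

#define LRU_BASE        0
#define LRU_ACTIVE      1
#define LRU_FILE        2

int main(void)
{
        static const char * const name[] = {
                "inactive_anon", "active_anon", "inactive_file", "active_file"
        };
        int file, active;

        for (file = 0; file <= 1; file++)
                for (active = 0; active <= 1; active++) {
                        /* same expression as __mem_cgroup_move_lists() */
                        int lru = LRU_BASE + LRU_FILE * !!file + !!active;
                        printf("file=%d active=%d -> lru %d (%s)\n",
                               file, active, lru, name[lru]);
                }
        return 0;
}

The same arithmetic appears in vmscan.c as lru += page_is_file_cache(page) + !!PageActive(page); that relies on the companion mm_inline.h helper returning LRU_FILE rather than 1 for file-backed pages, which is also why callers normalize it with !!file before indexing recent_rotated[] and recent_scanned[].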
@@ -391,21 +398,6 @@ int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
391} 398}
392 399
393/* 400/*
394 * This function is called from vmscan.c. In page reclaiming loop. balance
395 * between active and inactive list is calculated. For memory controller
396 * page reclaiming, we should use using mem_cgroup's imbalance rather than
397 * zone's global lru imbalance.
398 */
399long mem_cgroup_reclaim_imbalance(struct mem_cgroup *mem)
400{
401 unsigned long active, inactive;
402 /* active and inactive are the number of pages. 'long' is ok.*/
403 active = mem_cgroup_get_all_zonestat(mem, LRU_ACTIVE);
404 inactive = mem_cgroup_get_all_zonestat(mem, LRU_INACTIVE);
405 return (long) (active / (inactive + 1));
406}
407
408/*
409 * prev_priority control...this will be used in memory reclaim path. 401 * prev_priority control...this will be used in memory reclaim path.
410 */ 402 */
411int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem) 403int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
@@ -450,7 +442,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
450 unsigned long *scanned, int order, 442 unsigned long *scanned, int order,
451 int mode, struct zone *z, 443 int mode, struct zone *z,
452 struct mem_cgroup *mem_cont, 444 struct mem_cgroup *mem_cont,
453 int active) 445 int active, int file)
454{ 446{
455 unsigned long nr_taken = 0; 447 unsigned long nr_taken = 0;
456 struct page *page; 448 struct page *page;
@@ -461,7 +453,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
461 int nid = z->zone_pgdat->node_id; 453 int nid = z->zone_pgdat->node_id;
462 int zid = zone_idx(z); 454 int zid = zone_idx(z);
463 struct mem_cgroup_per_zone *mz; 455 struct mem_cgroup_per_zone *mz;
464 int lru = !!active; 456 int lru = LRU_FILE * !!file + !!active;
465 457
466 BUG_ON(!mem_cont); 458 BUG_ON(!mem_cont);
467 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); 459 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
@@ -477,6 +469,9 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
477 if (unlikely(!PageLRU(page))) 469 if (unlikely(!PageLRU(page)))
478 continue; 470 continue;
479 471
472 /*
473 * TODO: play better with lumpy reclaim, grabbing anything.
474 */
480 if (PageActive(page) && !active) { 475 if (PageActive(page) && !active) {
481 __mem_cgroup_move_lists(pc, true); 476 __mem_cgroup_move_lists(pc, true);
482 continue; 477 continue;
@@ -489,7 +484,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
489 scan++; 484 scan++;
490 list_move(&pc->lru, &pc_list); 485 list_move(&pc->lru, &pc_list);
491 486
492 if (__isolate_lru_page(page, mode) == 0) { 487 if (__isolate_lru_page(page, mode, file) == 0) {
493 list_move(&page->lru, dst); 488 list_move(&page->lru, dst);
494 nr_taken++; 489 nr_taken++;
495 } 490 }
@@ -575,10 +570,16 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
575 * If a page is accounted as a page cache, insert to inactive list. 570 * If a page is accounted as a page cache, insert to inactive list.
576 * If anon, insert to active list. 571 * If anon, insert to active list.
577 */ 572 */
578 if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE) 573 if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE) {
579 pc->flags = PAGE_CGROUP_FLAG_CACHE; 574 pc->flags = PAGE_CGROUP_FLAG_CACHE;
580 else 575 if (page_is_file_cache(page))
576 pc->flags |= PAGE_CGROUP_FLAG_FILE;
577 else
578 pc->flags |= PAGE_CGROUP_FLAG_ACTIVE;
579 } else if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
581 pc->flags = PAGE_CGROUP_FLAG_ACTIVE; 580 pc->flags = PAGE_CGROUP_FLAG_ACTIVE;
581 else /* MEM_CGROUP_CHARGE_TYPE_SHMEM */
582 pc->flags = PAGE_CGROUP_FLAG_CACHE | PAGE_CGROUP_FLAG_ACTIVE;
582 583
583 lock_page_cgroup(page); 584 lock_page_cgroup(page);
584 if (unlikely(page_get_page_cgroup(page))) { 585 if (unlikely(page_get_page_cgroup(page))) {
@@ -737,8 +738,12 @@ int mem_cgroup_prepare_migration(struct page *page, struct page *newpage)
737 if (pc) { 738 if (pc) {
738 mem = pc->mem_cgroup; 739 mem = pc->mem_cgroup;
739 css_get(&mem->css); 740 css_get(&mem->css);
740 if (pc->flags & PAGE_CGROUP_FLAG_CACHE) 741 if (pc->flags & PAGE_CGROUP_FLAG_CACHE) {
741 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; 742 if (page_is_file_cache(page))
743 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
744 else
745 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
746 }
742 } 747 }
743 unlock_page_cgroup(page); 748 unlock_page_cgroup(page);
744 if (mem) { 749 if (mem) {
@@ -982,14 +987,21 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
982 } 987 }
983 /* showing # of active pages */ 988 /* showing # of active pages */
984 { 989 {
985 unsigned long active, inactive; 990 unsigned long active_anon, inactive_anon;
986 991 unsigned long active_file, inactive_file;
987 inactive = mem_cgroup_get_all_zonestat(mem_cont, 992
988 LRU_INACTIVE); 993 inactive_anon = mem_cgroup_get_all_zonestat(mem_cont,
989 active = mem_cgroup_get_all_zonestat(mem_cont, 994 LRU_INACTIVE_ANON);
990 LRU_ACTIVE); 995 active_anon = mem_cgroup_get_all_zonestat(mem_cont,
991 cb->fill(cb, "active", (active) * PAGE_SIZE); 996 LRU_ACTIVE_ANON);
992 cb->fill(cb, "inactive", (inactive) * PAGE_SIZE); 997 inactive_file = mem_cgroup_get_all_zonestat(mem_cont,
998 LRU_INACTIVE_FILE);
999 active_file = mem_cgroup_get_all_zonestat(mem_cont,
1000 LRU_ACTIVE_FILE);
1001 cb->fill(cb, "active_anon", (active_anon) * PAGE_SIZE);
1002 cb->fill(cb, "inactive_anon", (inactive_anon) * PAGE_SIZE);
1003 cb->fill(cb, "active_file", (active_file) * PAGE_SIZE);
1004 cb->fill(cb, "inactive_file", (inactive_file) * PAGE_SIZE);
993 } 1005 }
994 return 0; 1006 return 0;
995} 1007}
diff --git a/mm/memory.c b/mm/memory.c
index 7512933dcc10..71cdefd1ef14 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1889,7 +1889,7 @@ gotten:
1889 set_pte_at(mm, address, page_table, entry); 1889 set_pte_at(mm, address, page_table, entry);
1890 update_mmu_cache(vma, address, entry); 1890 update_mmu_cache(vma, address, entry);
1891 SetPageSwapBacked(new_page); 1891 SetPageSwapBacked(new_page);
1892 lru_cache_add_active(new_page); 1892 lru_cache_add_active_anon(new_page);
1893 page_add_new_anon_rmap(new_page, vma, address); 1893 page_add_new_anon_rmap(new_page, vma, address);
1894 1894
1895 if (old_page) { 1895 if (old_page) {
@@ -2384,7 +2384,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2384 goto release; 2384 goto release;
2385 inc_mm_counter(mm, anon_rss); 2385 inc_mm_counter(mm, anon_rss);
2386 SetPageSwapBacked(page); 2386 SetPageSwapBacked(page);
2387 lru_cache_add_active(page); 2387 lru_cache_add_active_anon(page);
2388 page_add_new_anon_rmap(page, vma, address); 2388 page_add_new_anon_rmap(page, vma, address);
2389 set_pte_at(mm, address, page_table, entry); 2389 set_pte_at(mm, address, page_table, entry);
2390 2390
@@ -2526,7 +2526,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2526 if (anon) { 2526 if (anon) {
2527 inc_mm_counter(mm, anon_rss); 2527 inc_mm_counter(mm, anon_rss);
2528 SetPageSwapBacked(page); 2528 SetPageSwapBacked(page);
2529 lru_cache_add_active(page); 2529 lru_cache_add_active_anon(page);
2530 page_add_new_anon_rmap(page, vma, address); 2530 page_add_new_anon_rmap(page, vma, address);
2531 } else { 2531 } else {
2532 inc_mm_counter(mm, file_rss); 2532 inc_mm_counter(mm, file_rss);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index b40f6d5f8fe9..2970e35fd03f 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -329,9 +329,7 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
329 struct zone *z = 329 struct zone *z =
330 &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; 330 &NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
331 331
332 x += zone_page_state(z, NR_FREE_PAGES) 332 x += zone_page_state(z, NR_FREE_PAGES) + zone_lru_pages(z);
333 + zone_page_state(z, NR_INACTIVE)
334 + zone_page_state(z, NR_ACTIVE);
335 } 333 }
336 /* 334 /*
337 * Make sure that the number of highmem pages is never larger 335 * Make sure that the number of highmem pages is never larger
@@ -355,9 +353,7 @@ unsigned long determine_dirtyable_memory(void)
355{ 353{
356 unsigned long x; 354 unsigned long x;
357 355
358 x = global_page_state(NR_FREE_PAGES) 356 x = global_page_state(NR_FREE_PAGES) + global_lru_pages();
359 + global_page_state(NR_INACTIVE)
360 + global_page_state(NR_ACTIVE);
361 357
362 if (!vm_highmem_is_dirtyable) 358 if (!vm_highmem_is_dirtyable)
363 x -= highmem_dirtyable_memory(x); 359 x -= highmem_dirtyable_memory(x);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2099904d6cc4..740a16a32c22 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1864,10 +1864,13 @@ void show_free_areas(void)
1864 } 1864 }
1865 } 1865 }
1866 1866
1867 printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu\n" 1867 printk("Active_anon:%lu active_file:%lu inactive_anon%lu\n"
1868 " inactive_file:%lu dirty:%lu writeback:%lu unstable:%lu\n"
1868 " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", 1869 " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n",
1869 global_page_state(NR_ACTIVE), 1870 global_page_state(NR_ACTIVE_ANON),
1870 global_page_state(NR_INACTIVE), 1871 global_page_state(NR_ACTIVE_FILE),
1872 global_page_state(NR_INACTIVE_ANON),
1873 global_page_state(NR_INACTIVE_FILE),
1871 global_page_state(NR_FILE_DIRTY), 1874 global_page_state(NR_FILE_DIRTY),
1872 global_page_state(NR_WRITEBACK), 1875 global_page_state(NR_WRITEBACK),
1873 global_page_state(NR_UNSTABLE_NFS), 1876 global_page_state(NR_UNSTABLE_NFS),
@@ -1890,8 +1893,10 @@ void show_free_areas(void)
1890 " min:%lukB" 1893 " min:%lukB"
1891 " low:%lukB" 1894 " low:%lukB"
1892 " high:%lukB" 1895 " high:%lukB"
1893 " active:%lukB" 1896 " active_anon:%lukB"
1894 " inactive:%lukB" 1897 " inactive_anon:%lukB"
1898 " active_file:%lukB"
1899 " inactive_file:%lukB"
1895 " present:%lukB" 1900 " present:%lukB"
1896 " pages_scanned:%lu" 1901 " pages_scanned:%lu"
1897 " all_unreclaimable? %s" 1902 " all_unreclaimable? %s"
@@ -1901,8 +1906,10 @@ void show_free_areas(void)
1901 K(zone->pages_min), 1906 K(zone->pages_min),
1902 K(zone->pages_low), 1907 K(zone->pages_low),
1903 K(zone->pages_high), 1908 K(zone->pages_high),
1904 K(zone_page_state(zone, NR_ACTIVE)), 1909 K(zone_page_state(zone, NR_ACTIVE_ANON)),
1905 K(zone_page_state(zone, NR_INACTIVE)), 1910 K(zone_page_state(zone, NR_INACTIVE_ANON)),
1911 K(zone_page_state(zone, NR_ACTIVE_FILE)),
1912 K(zone_page_state(zone, NR_INACTIVE_FILE)),
1906 K(zone->present_pages), 1913 K(zone->present_pages),
1907 zone->pages_scanned, 1914 zone->pages_scanned,
1908 (zone_is_all_unreclaimable(zone) ? "yes" : "no") 1915 (zone_is_all_unreclaimable(zone) ? "yes" : "no")
@@ -3472,6 +3479,10 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
3472 INIT_LIST_HEAD(&zone->lru[l].list); 3479 INIT_LIST_HEAD(&zone->lru[l].list);
3473 zone->lru[l].nr_scan = 0; 3480 zone->lru[l].nr_scan = 0;
3474 } 3481 }
3482 zone->recent_rotated[0] = 0;
3483 zone->recent_rotated[1] = 0;
3484 zone->recent_scanned[0] = 0;
3485 zone->recent_scanned[1] = 0;
3475 zap_zone_vm_stats(zone); 3486 zap_zone_vm_stats(zone);
3476 zone->flags = 0; 3487 zone->flags = 0;
3477 if (!size) 3488 if (!size)
diff --git a/mm/readahead.c b/mm/readahead.c
index 6cbd9a72fde2..bec83c15a78f 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -229,7 +229,7 @@ int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
229 */ 229 */
230unsigned long max_sane_readahead(unsigned long nr) 230unsigned long max_sane_readahead(unsigned long nr)
231{ 231{
232 return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE) 232 return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE_FILE)
233 + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2); 233 + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2);
234} 234}
235 235
diff --git a/mm/shmem.c b/mm/shmem.c
index fd421ed703ed..fc2ccf79a776 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -199,7 +199,7 @@ static struct vm_operations_struct shmem_vm_ops;
199 199
200static struct backing_dev_info shmem_backing_dev_info __read_mostly = { 200static struct backing_dev_info shmem_backing_dev_info __read_mostly = {
201 .ra_pages = 0, /* No readahead */ 201 .ra_pages = 0, /* No readahead */
202 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, 202 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
203 .unplug_io_fn = default_unplug_io_fn, 203 .unplug_io_fn = default_unplug_io_fn,
204}; 204};
205 205
diff --git a/mm/swap.c b/mm/swap.c
index 88a394872677..0b1974a08974 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -116,7 +116,8 @@ static void pagevec_move_tail(struct pagevec *pvec)
116 spin_lock(&zone->lru_lock); 116 spin_lock(&zone->lru_lock);
117 } 117 }
118 if (PageLRU(page) && !PageActive(page)) { 118 if (PageLRU(page) && !PageActive(page)) {
119 list_move_tail(&page->lru, &zone->lru[LRU_INACTIVE].list); 119 int lru = page_is_file_cache(page);
120 list_move_tail(&page->lru, &zone->lru[lru].list);
120 pgmoved++; 121 pgmoved++;
121 } 122 }
122 } 123 }
@@ -157,11 +158,18 @@ void activate_page(struct page *page)
157 158
158 spin_lock_irq(&zone->lru_lock); 159 spin_lock_irq(&zone->lru_lock);
159 if (PageLRU(page) && !PageActive(page)) { 160 if (PageLRU(page) && !PageActive(page)) {
160 del_page_from_inactive_list(zone, page); 161 int file = page_is_file_cache(page);
162 int lru = LRU_BASE + file;
163 del_page_from_lru_list(zone, page, lru);
164
161 SetPageActive(page); 165 SetPageActive(page);
162 add_page_to_active_list(zone, page); 166 lru += LRU_ACTIVE;
167 add_page_to_lru_list(zone, page, lru);
163 __count_vm_event(PGACTIVATE); 168 __count_vm_event(PGACTIVATE);
164 mem_cgroup_move_lists(page, true); 169 mem_cgroup_move_lists(page, true);
170
171 zone->recent_rotated[!!file]++;
172 zone->recent_scanned[!!file]++;
165 } 173 }
166 spin_unlock_irq(&zone->lru_lock); 174 spin_unlock_irq(&zone->lru_lock);
167} 175}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 7a3ece0b5a3b..ea62084ed402 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -33,7 +33,7 @@ static const struct address_space_operations swap_aops = {
33}; 33};
34 34
35static struct backing_dev_info swap_backing_dev_info = { 35static struct backing_dev_info swap_backing_dev_info = {
36 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, 36 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
37 .unplug_io_fn = swap_unplug_io_fn, 37 .unplug_io_fn = swap_unplug_io_fn,
38}; 38};
39 39
@@ -310,7 +310,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
310 /* 310 /*
311 * Initiate read into locked page and return. 311 * Initiate read into locked page and return.
312 */ 312 */
313 lru_cache_add_active(new_page); 313 lru_cache_add_active_anon(new_page);
314 swap_readpage(NULL, new_page); 314 swap_readpage(NULL, new_page);
315 return new_page; 315 return new_page;
316 } 316 }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index e656035d3406..d10d2f9a33f3 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -78,7 +78,7 @@ struct scan_control {
78 unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst, 78 unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst,
79 unsigned long *scanned, int order, int mode, 79 unsigned long *scanned, int order, int mode,
80 struct zone *z, struct mem_cgroup *mem_cont, 80 struct zone *z, struct mem_cgroup *mem_cont,
81 int active); 81 int active, int file);
82}; 82};
83 83
84#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) 84#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
@@ -680,7 +680,7 @@ keep:
680 * 680 *
681 * returns 0 on success, -ve errno on failure. 681 * returns 0 on success, -ve errno on failure.
682 */ 682 */
683int __isolate_lru_page(struct page *page, int mode) 683int __isolate_lru_page(struct page *page, int mode, int file)
684{ 684{
685 int ret = -EINVAL; 685 int ret = -EINVAL;
686 686
@@ -696,6 +696,9 @@ int __isolate_lru_page(struct page *page, int mode)
696 if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode)) 696 if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode))
697 return ret; 697 return ret;
698 698
699 if (mode != ISOLATE_BOTH && (!page_is_file_cache(page) != !file))
700 return ret;
701
699 ret = -EBUSY; 702 ret = -EBUSY;
700 if (likely(get_page_unless_zero(page))) { 703 if (likely(get_page_unless_zero(page))) {
701 /* 704 /*
@@ -726,12 +729,13 @@ int __isolate_lru_page(struct page *page, int mode)
726 * @scanned: The number of pages that were scanned. 729 * @scanned: The number of pages that were scanned.
727 * @order: The caller's attempted allocation order 730 * @order: The caller's attempted allocation order
728 * @mode: One of the LRU isolation modes 731 * @mode: One of the LRU isolation modes
732 * @file: True [1] if isolating file [!anon] pages
729 * 733 *
730 * returns how many pages were moved onto *@dst. 734 * returns how many pages were moved onto *@dst.
731 */ 735 */
732static unsigned long isolate_lru_pages(unsigned long nr_to_scan, 736static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
733 struct list_head *src, struct list_head *dst, 737 struct list_head *src, struct list_head *dst,
734 unsigned long *scanned, int order, int mode) 738 unsigned long *scanned, int order, int mode, int file)
735{ 739{
736 unsigned long nr_taken = 0; 740 unsigned long nr_taken = 0;
737 unsigned long scan; 741 unsigned long scan;
@@ -748,7 +752,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
748 752
749 VM_BUG_ON(!PageLRU(page)); 753 VM_BUG_ON(!PageLRU(page));
750 754
751 switch (__isolate_lru_page(page, mode)) { 755 switch (__isolate_lru_page(page, mode, file)) {
752 case 0: 756 case 0:
753 list_move(&page->lru, dst); 757 list_move(&page->lru, dst);
754 nr_taken++; 758 nr_taken++;
@@ -791,10 +795,11 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
791 break; 795 break;
792 796
793 cursor_page = pfn_to_page(pfn); 797 cursor_page = pfn_to_page(pfn);
798
794 /* Check that we have not crossed a zone boundary. */ 799 /* Check that we have not crossed a zone boundary. */
795 if (unlikely(page_zone_id(cursor_page) != zone_id)) 800 if (unlikely(page_zone_id(cursor_page) != zone_id))
796 continue; 801 continue;
797 switch (__isolate_lru_page(cursor_page, mode)) { 802 switch (__isolate_lru_page(cursor_page, mode, file)) {
798 case 0: 803 case 0:
799 list_move(&cursor_page->lru, dst); 804 list_move(&cursor_page->lru, dst);
800 nr_taken++; 805 nr_taken++;
@@ -819,30 +824,37 @@ static unsigned long isolate_pages_global(unsigned long nr,
819 unsigned long *scanned, int order, 824 unsigned long *scanned, int order,
820 int mode, struct zone *z, 825 int mode, struct zone *z,
821 struct mem_cgroup *mem_cont, 826 struct mem_cgroup *mem_cont,
822 int active) 827 int active, int file)
823{ 828{
829 int lru = LRU_BASE;
824 if (active) 830 if (active)
825 return isolate_lru_pages(nr, &z->lru[LRU_ACTIVE].list, dst, 831 lru += LRU_ACTIVE;
826 scanned, order, mode); 832 if (file)
827 else 833 lru += LRU_FILE;
828 return isolate_lru_pages(nr, &z->lru[LRU_INACTIVE].list, dst, 834 return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order,
829 scanned, order, mode); 835 mode, !!file);
830} 836}
831 837
832/* 838/*
833 * clear_active_flags() is a helper for shrink_active_list(), clearing 839 * clear_active_flags() is a helper for shrink_active_list(), clearing
834 * any active bits from the pages in the list. 840 * any active bits from the pages in the list.
835 */ 841 */
836static unsigned long clear_active_flags(struct list_head *page_list) 842static unsigned long clear_active_flags(struct list_head *page_list,
843 unsigned int *count)
837{ 844{
838 int nr_active = 0; 845 int nr_active = 0;
846 int lru;
839 struct page *page; 847 struct page *page;
840 848
841 list_for_each_entry(page, page_list, lru) 849 list_for_each_entry(page, page_list, lru) {
850 lru = page_is_file_cache(page);
842 if (PageActive(page)) { 851 if (PageActive(page)) {
852 lru += LRU_ACTIVE;
843 ClearPageActive(page); 853 ClearPageActive(page);
844 nr_active++; 854 nr_active++;
845 } 855 }
856 count[lru]++;
857 }
846 858
847 return nr_active; 859 return nr_active;
848} 860}
@@ -880,12 +892,12 @@ int isolate_lru_page(struct page *page)
880 892
881 spin_lock_irq(&zone->lru_lock); 893 spin_lock_irq(&zone->lru_lock);
882 if (PageLRU(page) && get_page_unless_zero(page)) { 894 if (PageLRU(page) && get_page_unless_zero(page)) {
895 int lru = LRU_BASE;
883 ret = 0; 896 ret = 0;
884 ClearPageLRU(page); 897 ClearPageLRU(page);
885 if (PageActive(page)) 898
886 del_page_from_active_list(zone, page); 899 lru += page_is_file_cache(page) + !!PageActive(page);
887 else 900 del_page_from_lru_list(zone, page, lru);
888 del_page_from_inactive_list(zone, page);
889 } 901 }
890 spin_unlock_irq(&zone->lru_lock); 902 spin_unlock_irq(&zone->lru_lock);
891 } 903 }
@@ -897,7 +909,7 @@ int isolate_lru_page(struct page *page)
897 * of reclaimed pages 909 * of reclaimed pages
898 */ 910 */
899static unsigned long shrink_inactive_list(unsigned long max_scan, 911static unsigned long shrink_inactive_list(unsigned long max_scan,
900 struct zone *zone, struct scan_control *sc) 912 struct zone *zone, struct scan_control *sc, int file)
901{ 913{
902 LIST_HEAD(page_list); 914 LIST_HEAD(page_list);
903 struct pagevec pvec; 915 struct pagevec pvec;
@@ -914,20 +926,32 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
914 unsigned long nr_scan; 926 unsigned long nr_scan;
915 unsigned long nr_freed; 927 unsigned long nr_freed;
916 unsigned long nr_active; 928 unsigned long nr_active;
929 unsigned int count[NR_LRU_LISTS] = { 0, };
930 int mode = (sc->order > PAGE_ALLOC_COSTLY_ORDER) ?
931 ISOLATE_BOTH : ISOLATE_INACTIVE;
917 932
918 nr_taken = sc->isolate_pages(sc->swap_cluster_max, 933 nr_taken = sc->isolate_pages(sc->swap_cluster_max,
919 &page_list, &nr_scan, sc->order, 934 &page_list, &nr_scan, sc->order, mode,
920 (sc->order > PAGE_ALLOC_COSTLY_ORDER)? 935 zone, sc->mem_cgroup, 0, file);
921 ISOLATE_BOTH : ISOLATE_INACTIVE, 936 nr_active = clear_active_flags(&page_list, count);
922 zone, sc->mem_cgroup, 0);
923 nr_active = clear_active_flags(&page_list);
924 __count_vm_events(PGDEACTIVATE, nr_active); 937 __count_vm_events(PGDEACTIVATE, nr_active);
925 938
926 __mod_zone_page_state(zone, NR_ACTIVE, -nr_active); 939 __mod_zone_page_state(zone, NR_ACTIVE_FILE,
927 __mod_zone_page_state(zone, NR_INACTIVE, 940 -count[LRU_ACTIVE_FILE]);
928 -(nr_taken - nr_active)); 941 __mod_zone_page_state(zone, NR_INACTIVE_FILE,
929 if (scan_global_lru(sc)) 942 -count[LRU_INACTIVE_FILE]);
943 __mod_zone_page_state(zone, NR_ACTIVE_ANON,
944 -count[LRU_ACTIVE_ANON]);
945 __mod_zone_page_state(zone, NR_INACTIVE_ANON,
946 -count[LRU_INACTIVE_ANON]);
947
948 if (scan_global_lru(sc)) {
930 zone->pages_scanned += nr_scan; 949 zone->pages_scanned += nr_scan;
950 zone->recent_scanned[0] += count[LRU_INACTIVE_ANON];
951 zone->recent_scanned[0] += count[LRU_ACTIVE_ANON];
952 zone->recent_scanned[1] += count[LRU_INACTIVE_FILE];
953 zone->recent_scanned[1] += count[LRU_ACTIVE_FILE];
954 }
931 spin_unlock_irq(&zone->lru_lock); 955 spin_unlock_irq(&zone->lru_lock);
932 956
933 nr_scanned += nr_scan; 957 nr_scanned += nr_scan;
@@ -947,7 +971,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
947 * The attempt at page out may have made some 971 * The attempt at page out may have made some
948 * of the pages active, mark them inactive again. 972 * of the pages active, mark them inactive again.
949 */ 973 */
950 nr_active = clear_active_flags(&page_list); 974 nr_active = clear_active_flags(&page_list, count);
951 count_vm_events(PGDEACTIVATE, nr_active); 975 count_vm_events(PGDEACTIVATE, nr_active);
952 976
953 nr_freed += shrink_page_list(&page_list, sc, 977 nr_freed += shrink_page_list(&page_list, sc,
@@ -977,6 +1001,10 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
977 SetPageLRU(page); 1001 SetPageLRU(page);
978 list_del(&page->lru); 1002 list_del(&page->lru);
979 add_page_to_lru_list(zone, page, page_lru(page)); 1003 add_page_to_lru_list(zone, page, page_lru(page));
1004 if (PageActive(page) && scan_global_lru(sc)) {
1005 int file = !!page_is_file_cache(page);
1006 zone->recent_rotated[file]++;
1007 }
980 if (!pagevec_add(&pvec, page)) { 1008 if (!pagevec_add(&pvec, page)) {
981 spin_unlock_irq(&zone->lru_lock); 1009 spin_unlock_irq(&zone->lru_lock);
982 __pagevec_release(&pvec); 1010 __pagevec_release(&pvec);
@@ -1007,115 +1035,7 @@ static inline void note_zone_scanning_priority(struct zone *zone, int priority)
1007 1035
1008static inline int zone_is_near_oom(struct zone *zone) 1036static inline int zone_is_near_oom(struct zone *zone)
1009{ 1037{
1010 return zone->pages_scanned >= (zone_page_state(zone, NR_ACTIVE) 1038 return zone->pages_scanned >= (zone_lru_pages(zone) * 3);
1011 + zone_page_state(zone, NR_INACTIVE))*3;
1012}
1013
1014/*
1015 * Determine we should try to reclaim mapped pages.
1016 * This is called only when sc->mem_cgroup is NULL.
1017 */
1018static int calc_reclaim_mapped(struct scan_control *sc, struct zone *zone,
1019 int priority)
1020{
1021 long mapped_ratio;
1022 long distress;
1023 long swap_tendency;
1024 long imbalance;
1025 int reclaim_mapped = 0;
1026 int prev_priority;
1027
1028 if (scan_global_lru(sc) && zone_is_near_oom(zone))
1029 return 1;
1030 /*
1031 * `distress' is a measure of how much trouble we're having
1032 * reclaiming pages. 0 -> no problems. 100 -> great trouble.
1033 */
1034 if (scan_global_lru(sc))
1035 prev_priority = zone->prev_priority;
1036 else
1037 prev_priority = mem_cgroup_get_reclaim_priority(sc->mem_cgroup);
1038
1039 distress = 100 >> min(prev_priority, priority);
1040
1041 /*
1042 * The point of this algorithm is to decide when to start
1043 * reclaiming mapped memory instead of just pagecache. Work out
1044 * how much memory
1045 * is mapped.
1046 */
1047 if (scan_global_lru(sc))
1048 mapped_ratio = ((global_page_state(NR_FILE_MAPPED) +
1049 global_page_state(NR_ANON_PAGES)) * 100) /
1050 vm_total_pages;
1051 else
1052 mapped_ratio = mem_cgroup_calc_mapped_ratio(sc->mem_cgroup);
1053
1054 /*
1055 * Now decide how much we really want to unmap some pages. The
1056 * mapped ratio is downgraded - just because there's a lot of
1057 * mapped memory doesn't necessarily mean that page reclaim
1058 * isn't succeeding.
1059 *
1060 * The distress ratio is important - we don't want to start
1061 * going oom.
1062 *
1063 * A 100% value of vm_swappiness overrides this algorithm
1064 * altogether.
1065 */
1066 swap_tendency = mapped_ratio / 2 + distress + sc->swappiness;
1067
1068 /*
1069 * If there's huge imbalance between active and inactive
1070 * (think active 100 times larger than inactive) we should
1071 * become more permissive, or the system will take too much
1072 * cpu before it start swapping during memory pressure.
1073 * Distress is about avoiding early-oom, this is about
1074 * making swappiness graceful despite setting it to low
1075 * values.
1076 *
1077 * Avoid div by zero with nr_inactive+1, and max resulting
1078 * value is vm_total_pages.
1079 */
1080 if (scan_global_lru(sc)) {
1081 imbalance = zone_page_state(zone, NR_ACTIVE);
1082 imbalance /= zone_page_state(zone, NR_INACTIVE) + 1;
1083 } else
1084 imbalance = mem_cgroup_reclaim_imbalance(sc->mem_cgroup);
1085
1086 /*
1087 * Reduce the effect of imbalance if swappiness is low,
1088 * this means for a swappiness very low, the imbalance
1089 * must be much higher than 100 for this logic to make
1090 * the difference.
1091 *
1092 * Max temporary value is vm_total_pages*100.
1093 */
1094 imbalance *= (vm_swappiness + 1);
1095 imbalance /= 100;
1096
1097 /*
1098 * If not much of the ram is mapped, makes the imbalance
1099 * less relevant, it's high priority we refill the inactive
1100 * list with mapped pages only in presence of high ratio of
1101 * mapped pages.
1102 *
1103 * Max temporary value is vm_total_pages*100.
1104 */
1105 imbalance *= mapped_ratio;
1106 imbalance /= 100;
1107
1108 /* apply imbalance feedback to swap_tendency */
1109 swap_tendency += imbalance;
1110
1111 /*
1112 * Now use this metric to decide whether to start moving mapped
1113 * memory onto the inactive list.
1114 */
1115 if (swap_tendency >= 100)
1116 reclaim_mapped = 1;
1117
1118 return reclaim_mapped;
1119} 1039}
1120 1040
1121/* 1041/*
@@ -1138,7 +1058,7 @@ static int calc_reclaim_mapped(struct scan_control *sc, struct zone *zone,
1138 1058
1139 1059
1140static void shrink_active_list(unsigned long nr_pages, struct zone *zone, 1060static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1141 struct scan_control *sc, int priority) 1061 struct scan_control *sc, int priority, int file)
1142{ 1062{
1143 unsigned long pgmoved; 1063 unsigned long pgmoved;
1144 int pgdeactivate = 0; 1064 int pgdeactivate = 0;
@@ -1148,43 +1068,42 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1148 LIST_HEAD(l_inactive); 1068 LIST_HEAD(l_inactive);
1149 struct page *page; 1069 struct page *page;
1150 struct pagevec pvec; 1070 struct pagevec pvec;
1151 int reclaim_mapped = 0; 1071 enum lru_list lru;
1152
1153 if (sc->may_swap)
1154 reclaim_mapped = calc_reclaim_mapped(sc, zone, priority);
1155 1072
1156 lru_add_drain(); 1073 lru_add_drain();
1157 spin_lock_irq(&zone->lru_lock); 1074 spin_lock_irq(&zone->lru_lock);
1158 pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order, 1075 pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order,
1159 ISOLATE_ACTIVE, zone, 1076 ISOLATE_ACTIVE, zone,
1160 sc->mem_cgroup, 1); 1077 sc->mem_cgroup, 1, file);
1161 /* 1078 /*
1162 * zone->pages_scanned is used for detect zone's oom 1079 * zone->pages_scanned is used for detect zone's oom
1163 * mem_cgroup remembers nr_scan by itself. 1080 * mem_cgroup remembers nr_scan by itself.
1164 */ 1081 */
1165 if (scan_global_lru(sc)) 1082 if (scan_global_lru(sc)) {
1166 zone->pages_scanned += pgscanned; 1083 zone->pages_scanned += pgscanned;
1084 zone->recent_scanned[!!file] += pgmoved;
1085 }
1167 1086
1168 __mod_zone_page_state(zone, NR_ACTIVE, -pgmoved); 1087 if (file)
1088 __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved);
1089 else
1090 __mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved);
1169 spin_unlock_irq(&zone->lru_lock); 1091 spin_unlock_irq(&zone->lru_lock);
1170 1092
1171 while (!list_empty(&l_hold)) { 1093 while (!list_empty(&l_hold)) {
1172 cond_resched(); 1094 cond_resched();
1173 page = lru_to_page(&l_hold); 1095 page = lru_to_page(&l_hold);
1174 list_del(&page->lru); 1096 list_del(&page->lru);
1175 if (page_mapped(page)) {
1176 if (!reclaim_mapped ||
1177 (total_swap_pages == 0 && PageAnon(page)) ||
1178 page_referenced(page, 0, sc->mem_cgroup)) {
1179 list_add(&page->lru, &l_active);
1180 continue;
1181 }
1182 }
1183 list_add(&page->lru, &l_inactive); 1097 list_add(&page->lru, &l_inactive);
1184 } 1098 }
1185 1099
1100 /*
1101 * Now put the pages back on the appropriate [file or anon] inactive
1102 * and active lists.
1103 */
1186 pagevec_init(&pvec, 1); 1104 pagevec_init(&pvec, 1);
1187 pgmoved = 0; 1105 pgmoved = 0;
1106 lru = LRU_BASE + file * LRU_FILE;
1188 spin_lock_irq(&zone->lru_lock); 1107 spin_lock_irq(&zone->lru_lock);
1189 while (!list_empty(&l_inactive)) { 1108 while (!list_empty(&l_inactive)) {
1190 page = lru_to_page(&l_inactive); 1109 page = lru_to_page(&l_inactive);
@@ -1194,11 +1113,11 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1194 VM_BUG_ON(!PageActive(page)); 1113 VM_BUG_ON(!PageActive(page));
1195 ClearPageActive(page); 1114 ClearPageActive(page);
1196 1115
1197 list_move(&page->lru, &zone->lru[LRU_INACTIVE].list); 1116 list_move(&page->lru, &zone->lru[lru].list);
1198 mem_cgroup_move_lists(page, false); 1117 mem_cgroup_move_lists(page, false);
1199 pgmoved++; 1118 pgmoved++;
1200 if (!pagevec_add(&pvec, page)) { 1119 if (!pagevec_add(&pvec, page)) {
1201 __mod_zone_page_state(zone, NR_INACTIVE, pgmoved); 1120 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
1202 spin_unlock_irq(&zone->lru_lock); 1121 spin_unlock_irq(&zone->lru_lock);
1203 pgdeactivate += pgmoved; 1122 pgdeactivate += pgmoved;
1204 pgmoved = 0; 1123 pgmoved = 0;
@@ -1208,7 +1127,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1208 spin_lock_irq(&zone->lru_lock); 1127 spin_lock_irq(&zone->lru_lock);
1209 } 1128 }
1210 } 1129 }
1211 __mod_zone_page_state(zone, NR_INACTIVE, pgmoved); 1130 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
1212 pgdeactivate += pgmoved; 1131 pgdeactivate += pgmoved;
1213 if (buffer_heads_over_limit) { 1132 if (buffer_heads_over_limit) {
1214 spin_unlock_irq(&zone->lru_lock); 1133 spin_unlock_irq(&zone->lru_lock);
@@ -1217,6 +1136,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1217 } 1136 }
1218 1137
1219 pgmoved = 0; 1138 pgmoved = 0;
1139 lru = LRU_ACTIVE + file * LRU_FILE;
1220 while (!list_empty(&l_active)) { 1140 while (!list_empty(&l_active)) {
1221 page = lru_to_page(&l_active); 1141 page = lru_to_page(&l_active);
1222 prefetchw_prev_lru_page(page, &l_active, flags); 1142 prefetchw_prev_lru_page(page, &l_active, flags);
@@ -1224,11 +1144,11 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1224 SetPageLRU(page); 1144 SetPageLRU(page);
1225 VM_BUG_ON(!PageActive(page)); 1145 VM_BUG_ON(!PageActive(page));
1226 1146
1227 list_move(&page->lru, &zone->lru[LRU_ACTIVE].list); 1147 list_move(&page->lru, &zone->lru[lru].list);
1228 mem_cgroup_move_lists(page, true); 1148 mem_cgroup_move_lists(page, true);
1229 pgmoved++; 1149 pgmoved++;
1230 if (!pagevec_add(&pvec, page)) { 1150 if (!pagevec_add(&pvec, page)) {
1231 __mod_zone_page_state(zone, NR_ACTIVE, pgmoved); 1151 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
1232 pgmoved = 0; 1152 pgmoved = 0;
1233 spin_unlock_irq(&zone->lru_lock); 1153 spin_unlock_irq(&zone->lru_lock);
1234 if (vm_swap_full()) 1154 if (vm_swap_full())
@@ -1237,7 +1157,8 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1237 spin_lock_irq(&zone->lru_lock); 1157 spin_lock_irq(&zone->lru_lock);
1238 } 1158 }
1239 } 1159 }
1240 __mod_zone_page_state(zone, NR_ACTIVE, pgmoved); 1160 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
1161 zone->recent_rotated[!!file] += pgmoved;
1241 1162
1242 __count_zone_vm_events(PGREFILL, zone, pgscanned); 1163 __count_zone_vm_events(PGREFILL, zone, pgscanned);
1243 __count_vm_events(PGDEACTIVATE, pgdeactivate); 1164 __count_vm_events(PGDEACTIVATE, pgdeactivate);
@@ -1248,16 +1169,103 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1248 pagevec_release(&pvec); 1169 pagevec_release(&pvec);
1249} 1170}
1250 1171
1251static unsigned long shrink_list(enum lru_list l, unsigned long nr_to_scan, 1172static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
1252 struct zone *zone, struct scan_control *sc, int priority) 1173 struct zone *zone, struct scan_control *sc, int priority)
1253{ 1174{
1254 if (l == LRU_ACTIVE) { 1175 int file = is_file_lru(lru);
1255 shrink_active_list(nr_to_scan, zone, sc, priority); 1176
1177 if (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE) {
1178 shrink_active_list(nr_to_scan, zone, sc, priority, file);
1256 return 0; 1179 return 0;
1257 } 1180 }
1258 return shrink_inactive_list(nr_to_scan, zone, sc); 1181 return shrink_inactive_list(nr_to_scan, zone, sc, file);
1182}
1183
1184/*
1185 * Determine how aggressively the anon and file LRU lists should be
1186 * scanned. The relative value of each set of LRU lists is determined
1187 * by looking at the fraction of the pages scanned we did rotate back
1188 * onto the active list instead of evict.
1189 *
1190 * percent[0] specifies how much pressure to put on ram/swap backed
1191 * memory, while percent[1] determines pressure on the file LRUs.
1192 */
1193static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1194 unsigned long *percent)
1195{
1196 unsigned long anon, file, free;
1197 unsigned long anon_prio, file_prio;
1198 unsigned long ap, fp;
1199
1200 anon = zone_page_state(zone, NR_ACTIVE_ANON) +
1201 zone_page_state(zone, NR_INACTIVE_ANON);
1202 file = zone_page_state(zone, NR_ACTIVE_FILE) +
1203 zone_page_state(zone, NR_INACTIVE_FILE);
1204 free = zone_page_state(zone, NR_FREE_PAGES);
1205
1206 /* If we have no swap space, do not bother scanning anon pages. */
1207 if (nr_swap_pages <= 0) {
1208 percent[0] = 0;
1209 percent[1] = 100;
1210 return;
1211 }
1212
1213 /* If we have very few page cache pages, force-scan anon pages. */
1214 if (unlikely(file + free <= zone->pages_high)) {
1215 percent[0] = 100;
1216 percent[1] = 0;
1217 return;
1218 }
1219
1220 /*
1221 * OK, so we have swap space and a fair amount of page cache
1222 * pages. We use the recently rotated / recently scanned
1223 * ratios to determine how valuable each cache is.
1224 *
1225 * Because workloads change over time (and to avoid overflow)
1226 * we keep these statistics as a floating average, which ends
1227 * up weighing recent references more than old ones.
1228 *
1229 * anon in [0], file in [1]
1230 */
1231 if (unlikely(zone->recent_scanned[0] > anon / 4)) {
1232 spin_lock_irq(&zone->lru_lock);
1233 zone->recent_scanned[0] /= 2;
1234 zone->recent_rotated[0] /= 2;
1235 spin_unlock_irq(&zone->lru_lock);
1236 }
1237
1238 if (unlikely(zone->recent_scanned[1] > file / 4)) {
1239 spin_lock_irq(&zone->lru_lock);
1240 zone->recent_scanned[1] /= 2;
1241 zone->recent_rotated[1] /= 2;
1242 spin_unlock_irq(&zone->lru_lock);
1243 }
1244
1245 /*
1246 * With swappiness at 100, anonymous and file have the same priority.
1247 * This scanning priority is essentially the inverse of IO cost.
1248 */
1249 anon_prio = sc->swappiness;
1250 file_prio = 200 - sc->swappiness;
1251
1252 /*
1253 * anon recent_rotated[0]
1254 * %anon = 100 * ----------- / ----------------- * IO cost
1255 * anon + file rotate_sum
1256 */
1257 ap = (anon_prio + 1) * (zone->recent_scanned[0] + 1);
1258 ap /= zone->recent_rotated[0] + 1;
1259
1260 fp = (file_prio + 1) * (zone->recent_scanned[1] + 1);
1261 fp /= zone->recent_rotated[1] + 1;
1262
1263 /* Normalize to percentages */
1264 percent[0] = 100 * ap / (ap + fp + 1);
1265 percent[1] = 100 - percent[0];
1259} 1266}
1260 1267
1268
1261/* 1269/*
1262 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 1270 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
1263 */ 1271 */
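The comment block above states the %anon formula in words; the stand-alone sketch below reruns the same arithmetic with invented counter values (and vm_swappiness at its default of 60) to show how the rotated/scanned ratios steer pressure between the two sets of lists.

#include <stdio.h>

/* All inputs are made-up example numbers for one zone. */
int main(void)
{
        unsigned long swappiness = 60;                  /* sc->swappiness */
        unsigned long anon_prio = swappiness;           /* 60  */
        unsigned long file_prio = 200 - swappiness;     /* 140 */

        unsigned long scanned_anon = 1000, rotated_anon = 800;
        unsigned long scanned_file = 4000, rotated_file = 400;

        unsigned long ap = (anon_prio + 1) * (scanned_anon + 1);
        unsigned long fp = (file_prio + 1) * (scanned_file + 1);
        ap /= rotated_anon + 1;
        fp /= rotated_file + 1;

        unsigned long percent_anon = 100 * ap / (ap + fp + 1);
        unsigned long percent_file = 100 - percent_anon;

        printf("ap=%lu fp=%lu -> anon %lu%%, file %lu%%\n",
               ap, fp, percent_anon, percent_file);
        return 0;
}

With these numbers most anon scans were rotated back onto the active list (800 of 1000) while few file scans were (400 of 4000), so reclaim pressure lands almost entirely on the file lists: roughly 5% anon to 95% file.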
@@ -1267,36 +1275,43 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
1267 unsigned long nr[NR_LRU_LISTS]; 1275 unsigned long nr[NR_LRU_LISTS];
1268 unsigned long nr_to_scan; 1276 unsigned long nr_to_scan;
1269 unsigned long nr_reclaimed = 0; 1277 unsigned long nr_reclaimed = 0;
1278 unsigned long percent[2]; /* anon @ 0; file @ 1 */
1270 enum lru_list l; 1279 enum lru_list l;
1271 1280
1272 if (scan_global_lru(sc)) { 1281 get_scan_ratio(zone, sc, percent);
1273 /* 1282
1274 * Add one to nr_to_scan just to make sure that the kernel 1283 for_each_lru(l) {
1275 * will slowly sift through the active list. 1284 if (scan_global_lru(sc)) {
1276 */ 1285 int file = is_file_lru(l);
1277 for_each_lru(l) { 1286 int scan;
1278 zone->lru[l].nr_scan += (zone_page_state(zone, 1287 /*
1279 NR_LRU_BASE + l) >> priority) + 1; 1288 * Add one to nr_to_scan just to make sure that the
1289 * kernel will slowly sift through each list.
1290 */
1291 scan = zone_page_state(zone, NR_LRU_BASE + l);
1292 if (priority) {
1293 scan >>= priority;
1294 scan = (scan * percent[file]) / 100;
1295 }
1296 zone->lru[l].nr_scan += scan + 1;
1280 nr[l] = zone->lru[l].nr_scan; 1297 nr[l] = zone->lru[l].nr_scan;
1281 if (nr[l] >= sc->swap_cluster_max) 1298 if (nr[l] >= sc->swap_cluster_max)
1282 zone->lru[l].nr_scan = 0; 1299 zone->lru[l].nr_scan = 0;
1283 else 1300 else
1284 nr[l] = 0; 1301 nr[l] = 0;
1302 } else {
1303 /*
1304 * This reclaim occurs not because zone memory shortage
1305 * but because memory controller hits its limit.
1306 * Don't modify zone reclaim related data.
1307 */
1308 nr[l] = mem_cgroup_calc_reclaim(sc->mem_cgroup, zone,
1309 priority, l);
1285 } 1310 }
1286 } else {
1287 /*
1288 * This reclaim occurs not because zone memory shortage but
1289 * because memory controller hits its limit.
1290 * Then, don't modify zone reclaim related data.
1291 */
1292 nr[LRU_ACTIVE] = mem_cgroup_calc_reclaim(sc->mem_cgroup,
1293 zone, priority, LRU_ACTIVE);
1294
1295 nr[LRU_INACTIVE] = mem_cgroup_calc_reclaim(sc->mem_cgroup,
1296 zone, priority, LRU_INACTIVE);
1297 } 1311 }
1298 1312
1299 while (nr[LRU_ACTIVE] || nr[LRU_INACTIVE]) { 1313 while (nr[LRU_ACTIVE_ANON] || nr[LRU_INACTIVE_ANON] ||
1314 nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) {
1300 for_each_lru(l) { 1315 for_each_lru(l) {
1301 if (nr[l]) { 1316 if (nr[l]) {
1302 nr_to_scan = min(nr[l], 1317 nr_to_scan = min(nr[l],
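As a rough illustration of how the reworked shrink_zone() loop above applies those percentages per list (the page count, percentage and priority below are invented):

#include <stdio.h>

/* Invented example: one zone, one list, direct reclaim at priority 12. */
int main(void)
{
        unsigned long nr_pages = 80000;         /* e.g. NR_INACTIVE_FILE */
        unsigned long percent = 95;             /* from get_scan_ratio() */
        int priority = 12;

        unsigned long scan = nr_pages >> priority;      /* 80000 >> 12 = 19 */
        scan = scan * percent / 100;                    /* 19 * 95 / 100 = 18 */

        printf("nr_scan credited to this list: %lu\n", scan + 1);
        return 0;
}

The + 1 mirrors the comment in the hunk: each pass credits every list with at least one page of scan, so the kernel keeps slowly sifting through all of them.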
@@ -1369,7 +1384,7 @@ static unsigned long shrink_zones(int priority, struct zonelist *zonelist,
1369 1384
1370 return nr_reclaimed; 1385 return nr_reclaimed;
1371} 1386}
1372 1387
1373/* 1388/*
1374 * This is the main entry point to direct page reclaim. 1389 * This is the main entry point to direct page reclaim.
1375 * 1390 *
@@ -1412,8 +1427,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1412 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 1427 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1413 continue; 1428 continue;
1414 1429
1415 lru_pages += zone_page_state(zone, NR_ACTIVE) 1430 lru_pages += zone_lru_pages(zone);
1416 + zone_page_state(zone, NR_INACTIVE);
1417 } 1431 }
1418 } 1432 }
1419 1433
@@ -1615,8 +1629,7 @@ loop_again:
1615 for (i = 0; i <= end_zone; i++) { 1629 for (i = 0; i <= end_zone; i++) {
1616 struct zone *zone = pgdat->node_zones + i; 1630 struct zone *zone = pgdat->node_zones + i;
1617 1631
1618 lru_pages += zone_page_state(zone, NR_ACTIVE) 1632 lru_pages += zone_lru_pages(zone);
1619 + zone_page_state(zone, NR_INACTIVE);
1620 } 1633 }
1621 1634
1622 /* 1635 /*
@@ -1660,8 +1673,7 @@ loop_again:
1660 if (zone_is_all_unreclaimable(zone)) 1673 if (zone_is_all_unreclaimable(zone))
1661 continue; 1674 continue;
1662 if (nr_slab == 0 && zone->pages_scanned >= 1675 if (nr_slab == 0 && zone->pages_scanned >=
1663 (zone_page_state(zone, NR_ACTIVE) 1676 (zone_lru_pages(zone) * 6))
1664 + zone_page_state(zone, NR_INACTIVE)) * 6)
1665 zone_set_flag(zone, 1677 zone_set_flag(zone,
1666 ZONE_ALL_UNRECLAIMABLE); 1678 ZONE_ALL_UNRECLAIMABLE);
1667 /* 1679 /*
@@ -1715,7 +1727,7 @@ out:
1715 1727
1716/* 1728/*
1717 * The background pageout daemon, started as a kernel thread 1729 * The background pageout daemon, started as a kernel thread
1718 * from the init process. 1730 * from the init process.
1719 * 1731 *
1720 * This basically trickles out pages so that we have _some_ 1732 * This basically trickles out pages so that we have _some_
1721 * free memory available even if there is no other activity 1733 * free memory available even if there is no other activity
@@ -1809,6 +1821,14 @@ void wakeup_kswapd(struct zone *zone, int order)
1809 wake_up_interruptible(&pgdat->kswapd_wait); 1821 wake_up_interruptible(&pgdat->kswapd_wait);
1810} 1822}
1811 1823
1824unsigned long global_lru_pages(void)
1825{
1826 return global_page_state(NR_ACTIVE_ANON)
1827 + global_page_state(NR_ACTIVE_FILE)
1828 + global_page_state(NR_INACTIVE_ANON)
1829 + global_page_state(NR_INACTIVE_FILE);
1830}
1831
1812#ifdef CONFIG_PM 1832#ifdef CONFIG_PM
1813/* 1833/*
1814 * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages 1834 * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages
@@ -1834,7 +1854,8 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
1834 1854
1835 for_each_lru(l) { 1855 for_each_lru(l) {
1836 /* For pass = 0 we don't shrink the active list */ 1856 /* For pass = 0 we don't shrink the active list */
1837 if (pass == 0 && l == LRU_ACTIVE) 1857 if (pass == 0 &&
1858 (l == LRU_ACTIVE || l == LRU_ACTIVE_FILE))
1838 continue; 1859 continue;
1839 1860
1840 zone->lru[l].nr_scan += 1861 zone->lru[l].nr_scan +=
@@ -1856,11 +1877,6 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
1856 return ret; 1877 return ret;
1857} 1878}
1858 1879
1859static unsigned long count_lru_pages(void)
1860{
1861 return global_page_state(NR_ACTIVE) + global_page_state(NR_INACTIVE);
1862}
1863
1864/* 1880/*
1865 * Try to free `nr_pages' of memory, system-wide, and return the number of 1881 * Try to free `nr_pages' of memory, system-wide, and return the number of
1866 * freed pages. 1882 * freed pages.
@@ -1886,7 +1902,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
1886 1902
1887 current->reclaim_state = &reclaim_state; 1903 current->reclaim_state = &reclaim_state;
1888 1904
1889 lru_pages = count_lru_pages(); 1905 lru_pages = global_lru_pages();
1890 nr_slab = global_page_state(NR_SLAB_RECLAIMABLE); 1906 nr_slab = global_page_state(NR_SLAB_RECLAIMABLE);
1891 /* If slab caches are huge, it's better to hit them first */ 1907 /* If slab caches are huge, it's better to hit them first */
1892 while (nr_slab >= lru_pages) { 1908 while (nr_slab >= lru_pages) {
@@ -1929,7 +1945,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
1929 1945
1930 reclaim_state.reclaimed_slab = 0; 1946 reclaim_state.reclaimed_slab = 0;
1931 shrink_slab(sc.nr_scanned, sc.gfp_mask, 1947 shrink_slab(sc.nr_scanned, sc.gfp_mask,
1932 count_lru_pages()); 1948 global_lru_pages());
1933 ret += reclaim_state.reclaimed_slab; 1949 ret += reclaim_state.reclaimed_slab;
1934 if (ret >= nr_pages) 1950 if (ret >= nr_pages)
1935 goto out; 1951 goto out;
@@ -1946,7 +1962,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
1946 if (!ret) { 1962 if (!ret) {
1947 do { 1963 do {
1948 reclaim_state.reclaimed_slab = 0; 1964 reclaim_state.reclaimed_slab = 0;
1949 shrink_slab(nr_pages, sc.gfp_mask, count_lru_pages()); 1965 shrink_slab(nr_pages, sc.gfp_mask, global_lru_pages());
1950 ret += reclaim_state.reclaimed_slab; 1966 ret += reclaim_state.reclaimed_slab;
1951 } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0); 1967 } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0);
1952 } 1968 }
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 52c0335c1b71..27400b7da7c4 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -619,8 +619,10 @@ const struct seq_operations pagetypeinfo_op = {
619static const char * const vmstat_text[] = { 619static const char * const vmstat_text[] = {
620 /* Zoned VM counters */ 620 /* Zoned VM counters */
621 "nr_free_pages", 621 "nr_free_pages",
622 "nr_inactive", 622 "nr_inactive_anon",
623 "nr_active", 623 "nr_active_anon",
624 "nr_inactive_file",
625 "nr_active_file",
624 "nr_anon_pages", 626 "nr_anon_pages",
625 "nr_mapped", 627 "nr_mapped",
626 "nr_file_pages", 628 "nr_file_pages",
@@ -688,7 +690,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
688 "\n min %lu" 690 "\n min %lu"
689 "\n low %lu" 691 "\n low %lu"
690 "\n high %lu" 692 "\n high %lu"
691 "\n scanned %lu (a: %lu i: %lu)" 693 "\n scanned %lu (aa: %lu ia: %lu af: %lu if: %lu)"
692 "\n spanned %lu" 694 "\n spanned %lu"
693 "\n present %lu", 695 "\n present %lu",
694 zone_page_state(zone, NR_FREE_PAGES), 696 zone_page_state(zone, NR_FREE_PAGES),
@@ -696,8 +698,10 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
696 zone->pages_low, 698 zone->pages_low,
697 zone->pages_high, 699 zone->pages_high,
698 zone->pages_scanned, 700 zone->pages_scanned,
699 zone->lru[LRU_ACTIVE].nr_scan, 701 zone->lru[LRU_ACTIVE_ANON].nr_scan,
700 zone->lru[LRU_INACTIVE].nr_scan, 702 zone->lru[LRU_INACTIVE_ANON].nr_scan,
703 zone->lru[LRU_ACTIVE_FILE].nr_scan,
704 zone->lru[LRU_INACTIVE_FILE].nr_scan,
701 zone->spanned_pages, 705 zone->spanned_pages,
702 zone->present_pages); 706 zone->present_pages);
703 707