aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorYing Han <yinghan@google.com>2011-05-26 19:25:38 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2011-05-26 20:12:36 -0400
commit456f998ec817ebfa254464be4f089542fa390645 (patch)
tree5976aa500638f0bbade1a672233cad71765b89b8 /mm
parent406eb0c9ba765eb066406fd5ce9d5e2b169a4d5a (diff)
memcg: add the pagefault count into memcg stats
Two new stats in per-memcg memory.stat which tracks the number of page faults and number of major page faults. "pgfault" "pgmajfault" They are different from "pgpgin"/"pgpgout" stat which count number of pages charged/discharged to the cgroup and have no meaning of reading/ writing page to disk. It is valuable to track the two stats for both measuring application's performance as well as the efficiency of the kernel page reclaim path. Counting pagefaults per process is useful, but we also need the aggregated value since processes are monitored and controlled in cgroup basis in memcg. Functional test: check the total number of pgfault/pgmajfault of all memcgs and compare with global vmstat value: $ cat /proc/vmstat | grep fault pgfault 1070751 pgmajfault 553 $ cat /dev/cgroup/memory.stat | grep fault pgfault 1071138 pgmajfault 553 total_pgfault 1071142 total_pgmajfault 553 $ cat /dev/cgroup/A/memory.stat | grep fault pgfault 199 pgmajfault 0 total_pgfault 199 total_pgmajfault 0 Performance test: run page fault test(pft) wit 16 thread on faulting in 15G anon pages in 16G container. There is no regression noticed on the "flt/cpu/s" Sample output from pft: TAG pft:anon-sys-default: Gb Thr CLine User System Wall flt/cpu/s fault/wsec 15 16 1 0.67s 233.41s 14.76s 16798.546 266356.260 +-------------------------------------------------------------------------+ N Min Max Median Avg Stddev x 10 16682.962 17344.027 16913.524 16928.812 166.5362 + 10 16695.568 16923.896 16820.604 16824.652 84.816568 No difference proven at 95.0% confidence [akpm@linux-foundation.org: fix build] [hughd@google.com: shmem fix] Signed-off-by: Ying Han <yinghan@google.com> Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Reviewed-by: Minchan Kim <minchan.kim@gmail.com> Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> Acked-by: Balbir Singh <balbir@linux.vnet.ibm.com> Signed-off-by: Hugh Dickins <hughd@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r--mm/filemap.c1
-rw-r--r--mm/memcontrol.c47
-rw-r--r--mm/memory.c2
-rw-r--r--mm/shmem.c11
4 files changed, 56 insertions, 5 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index 7455ccd8bda8..bcdc393b6580 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1661,6 +1661,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1661 /* No page in the page cache at all */ 1661 /* No page in the page cache at all */
1662 do_sync_mmap_readahead(vma, ra, file, offset); 1662 do_sync_mmap_readahead(vma, ra, file, offset);
1663 count_vm_event(PGMAJFAULT); 1663 count_vm_event(PGMAJFAULT);
1664 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
1664 ret = VM_FAULT_MAJOR; 1665 ret = VM_FAULT_MAJOR;
1665retry_find: 1666retry_find:
1666 page = find_get_page(mapping, offset); 1667 page = find_get_page(mapping, offset);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4021fcd71b60..bd9052a5d3ad 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -94,6 +94,8 @@ enum mem_cgroup_events_index {
94 MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ 94 MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */
95 MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ 95 MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */
96 MEM_CGROUP_EVENTS_COUNT, /* # of pages paged in/out */ 96 MEM_CGROUP_EVENTS_COUNT, /* # of pages paged in/out */
97 MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */
98 MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */
97 MEM_CGROUP_EVENTS_NSTATS, 99 MEM_CGROUP_EVENTS_NSTATS,
98}; 100};
99/* 101/*
@@ -590,6 +592,16 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
590 this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); 592 this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
591} 593}
592 594
595void mem_cgroup_pgfault(struct mem_cgroup *mem, int val)
596{
597 this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGFAULT], val);
598}
599
600void mem_cgroup_pgmajfault(struct mem_cgroup *mem, int val)
601{
602 this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT], val);
603}
604
593static unsigned long mem_cgroup_read_events(struct mem_cgroup *mem, 605static unsigned long mem_cgroup_read_events(struct mem_cgroup *mem,
594 enum mem_cgroup_events_index idx) 606 enum mem_cgroup_events_index idx)
595{ 607{
@@ -827,6 +839,33 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
827 return (mem == root_mem_cgroup); 839 return (mem == root_mem_cgroup);
828} 840}
829 841
842void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
843{
844 struct mem_cgroup *mem;
845
846 if (!mm)
847 return;
848
849 rcu_read_lock();
850 mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
851 if (unlikely(!mem))
852 goto out;
853
854 switch (idx) {
855 case PGMAJFAULT:
856 mem_cgroup_pgmajfault(mem, 1);
857 break;
858 case PGFAULT:
859 mem_cgroup_pgfault(mem, 1);
860 break;
861 default:
862 BUG();
863 }
864out:
865 rcu_read_unlock();
866}
867EXPORT_SYMBOL(mem_cgroup_count_vm_event);
868
830/* 869/*
831 * Following LRU functions are allowed to be used without PCG_LOCK. 870 * Following LRU functions are allowed to be used without PCG_LOCK.
832 * Operations are called by routine of global LRU independently from memcg. 871 * Operations are called by routine of global LRU independently from memcg.
@@ -3958,6 +3997,8 @@ enum {
3958 MCS_PGPGIN, 3997 MCS_PGPGIN,
3959 MCS_PGPGOUT, 3998 MCS_PGPGOUT,
3960 MCS_SWAP, 3999 MCS_SWAP,
4000 MCS_PGFAULT,
4001 MCS_PGMAJFAULT,
3961 MCS_INACTIVE_ANON, 4002 MCS_INACTIVE_ANON,
3962 MCS_ACTIVE_ANON, 4003 MCS_ACTIVE_ANON,
3963 MCS_INACTIVE_FILE, 4004 MCS_INACTIVE_FILE,
@@ -3980,6 +4021,8 @@ struct {
3980 {"pgpgin", "total_pgpgin"}, 4021 {"pgpgin", "total_pgpgin"},
3981 {"pgpgout", "total_pgpgout"}, 4022 {"pgpgout", "total_pgpgout"},
3982 {"swap", "total_swap"}, 4023 {"swap", "total_swap"},
4024 {"pgfault", "total_pgfault"},
4025 {"pgmajfault", "total_pgmajfault"},
3983 {"inactive_anon", "total_inactive_anon"}, 4026 {"inactive_anon", "total_inactive_anon"},
3984 {"active_anon", "total_active_anon"}, 4027 {"active_anon", "total_active_anon"},
3985 {"inactive_file", "total_inactive_file"}, 4028 {"inactive_file", "total_inactive_file"},
@@ -4008,6 +4051,10 @@ mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
4008 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT); 4051 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
4009 s->stat[MCS_SWAP] += val * PAGE_SIZE; 4052 s->stat[MCS_SWAP] += val * PAGE_SIZE;
4010 } 4053 }
4054 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGFAULT);
4055 s->stat[MCS_PGFAULT] += val;
4056 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGMAJFAULT);
4057 s->stat[MCS_PGMAJFAULT] += val;
4011 4058
4012 /* per zone stat */ 4059 /* per zone stat */
4013 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON); 4060 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON);
diff --git a/mm/memory.c b/mm/memory.c
index fc24f7d788bd..6953d3926e01 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2874,6 +2874,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2874 /* Had to read the page from swap area: Major fault */ 2874 /* Had to read the page from swap area: Major fault */
2875 ret = VM_FAULT_MAJOR; 2875 ret = VM_FAULT_MAJOR;
2876 count_vm_event(PGMAJFAULT); 2876 count_vm_event(PGMAJFAULT);
2877 mem_cgroup_count_vm_event(mm, PGMAJFAULT);
2877 } else if (PageHWPoison(page)) { 2878 } else if (PageHWPoison(page)) {
2878 /* 2879 /*
2879 * hwpoisoned dirty swapcache pages are kept for killing 2880 * hwpoisoned dirty swapcache pages are kept for killing
@@ -3413,6 +3414,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3413 __set_current_state(TASK_RUNNING); 3414 __set_current_state(TASK_RUNNING);
3414 3415
3415 count_vm_event(PGFAULT); 3416 count_vm_event(PGFAULT);
3417 mem_cgroup_count_vm_event(mm, PGFAULT);
3416 3418
3417 /* do counter updates before entering really critical section. */ 3419 /* do counter updates before entering really critical section. */
3418 check_sync_rss_stat(current); 3420 check_sync_rss_stat(current);
diff --git a/mm/shmem.c b/mm/shmem.c
index 69edb45a9f28..1acfb2687bfa 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1305,12 +1305,10 @@ repeat:
1305 swappage = lookup_swap_cache(swap); 1305 swappage = lookup_swap_cache(swap);
1306 if (!swappage) { 1306 if (!swappage) {
1307 shmem_swp_unmap(entry); 1307 shmem_swp_unmap(entry);
1308 spin_unlock(&info->lock);
1308 /* here we actually do the io */ 1309 /* here we actually do the io */
1309 if (type && !(*type & VM_FAULT_MAJOR)) { 1310 if (type)
1310 __count_vm_event(PGMAJFAULT);
1311 *type |= VM_FAULT_MAJOR; 1311 *type |= VM_FAULT_MAJOR;
1312 }
1313 spin_unlock(&info->lock);
1314 swappage = shmem_swapin(swap, gfp, info, idx); 1312 swappage = shmem_swapin(swap, gfp, info, idx);
1315 if (!swappage) { 1313 if (!swappage) {
1316 spin_lock(&info->lock); 1314 spin_lock(&info->lock);
@@ -1549,7 +1547,10 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1549 error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); 1547 error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
1550 if (error) 1548 if (error)
1551 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); 1549 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
1552 1550 if (ret & VM_FAULT_MAJOR) {
1551 count_vm_event(PGMAJFAULT);
1552 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
1553 }
1553 return ret | VM_FAULT_LOCKED; 1554 return ret | VM_FAULT_LOCKED;
1554} 1555}
1555 1556