author     KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>  2011-07-26 19:08:26 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>      2011-07-26 19:49:42 -0400
commit     82f9d486e59f588c7d100865c36510644abda356
tree       266f3dcf4f57538196bddd77a129adfb2752335b  /mm/memcontrol.c
parent     108b6a78463bb8c7163e4f9779f36ad8bbade334
memcg: add memory.vmscan_stat
The commit log of 0ae5e89c60c9 ("memcg: count the soft_limit reclaim
in...") says it adds scanning stats to the memory.stat file.  But it
doesn't, because we decided we first needed to reach a consensus on
such new APIs.
This patch is a trial to add memory.vmscan_stat.  It shows
 - the number of scanned pages (total, anon, file)
 - the number of rotated pages (total, anon, file)
 - the number of freed pages (total, anon, file)
 - the elapsed time in ns (including sleep/pause time)
for both direct and soft reclaim.
The biggest difference from Ying's original version is that this file
can be reset by a write, as in
 # echo 0 > ...../memory.vmscan_stat

Example output is shown below.  This is the result of a make -j 6
kernel build under a 300M limit.
[kamezawa@bluextal ~]$ cat /cgroup/memory/A/memory.vmscan_stat
scanned_pages_by_limit 9471864
scanned_anon_pages_by_limit 6640629
scanned_file_pages_by_limit 2831235
rotated_pages_by_limit 4243974
rotated_anon_pages_by_limit 3971968
rotated_file_pages_by_limit 272006
freed_pages_by_limit 2318492
freed_anon_pages_by_limit 962052
freed_file_pages_by_limit 1356440
elapsed_ns_by_limit 351386416101
scanned_pages_by_system 0
scanned_anon_pages_by_system 0
scanned_file_pages_by_system 0
rotated_pages_by_system 0
rotated_anon_pages_by_system 0
rotated_file_pages_by_system 0
freed_pages_by_system 0
freed_anon_pages_by_system 0
freed_file_pages_by_system 0
elapsed_ns_by_system 0
scanned_pages_by_limit_under_hierarchy 9471864
scanned_anon_pages_by_limit_under_hierarchy 6640629
scanned_file_pages_by_limit_under_hierarchy 2831235
rotated_pages_by_limit_under_hierarchy 4243974
rotated_anon_pages_by_limit_under_hierarchy 3971968
rotated_file_pages_by_limit_under_hierarchy 272006
freed_pages_by_limit_under_hierarchy 2318492
freed_anon_pages_by_limit_under_hierarchy 962052
freed_file_pages_by_limit_under_hierarchy 1356440
elapsed_ns_by_limit_under_hierarchy 351386416101
scanned_pages_by_system_under_hierarchy 0
scanned_anon_pages_by_system_under_hierarchy 0
scanned_file_pages_by_system_under_hierarchy 0
rotated_pages_by_system_under_hierarchy 0
rotated_anon_pages_by_system_under_hierarchy 0
rotated_file_pages_by_system_under_hierarchy 0
freed_pages_by_system_under_hierarchy 0
freed_anon_pages_by_system_under_hierarchy 0
freed_file_pages_by_system_under_hierarchy 0
elapsed_ns_by_system_under_hierarchy 0
The *_under_hierarchy entries are for hierarchy management; they
accumulate scanning done anywhere in the hierarchy under this cgroup.
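To illustrate the reset-by-write behaviour described above, here is a
minimal sketch of a check (the path matches the example above; the
all-zero readings follow from the memset in
mem_cgroup_reset_vmscan_stat() in the patch below):

 # echo 0 > /cgroup/memory/A/memory.vmscan_stat
 # grep scanned_pages_by_limit /cgroup/memory/A/memory.vmscan_stat
 scanned_pages_by_limit 0
 scanned_pages_by_limit_under_hierarchy 0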
This will be useful for further memcg development and needs to be
developed before we do some complicated rework on LRU/softlimit
management.
This patch adds a new struct memcg_scanrecord to the scan_control
struct.  sc->nr_scanned et al. are not designed for exporting
information; for example, nr_scanned is reset frequently and is
incremented by 2 when scanning mapped pages.

To avoid complexity, I added a new parameter to scan_control which is
used for exporting the scanning statistics.
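For reference, struct memcg_scanrecord itself is defined outside
mm/memcontrol.c and so does not appear in the diff below.  A sketch of
its layout, inferred from the fields used by the hunks here (exact
types, field order and comments are assumptions):

	struct memcg_scanrecord {
		struct mem_cgroup *mem;		/* memcg actually being scanned */
		struct mem_cgroup *root;	/* root of the hierarchical reclaim */
		int context;			/* SCAN_BY_LIMIT, SCAN_BY_SYSTEM, ... */
		unsigned long nr_scanned[2];	/* scanned pages: [0] anon, [1] file */
		unsigned long nr_rotated[2];	/* rotated pages: [0] anon, [1] file */
		unsigned long nr_freed[2];	/* freed pages: [0] anon, [1] file */
		unsigned long elapsed;		/* elapsed time in ns, incl. sleep */
	};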
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Ying Han <yinghan@google.com>
Cc: Andrew Bresticker <abrestic@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--	mm/memcontrol.c	172
1 file changed, 166 insertions, 6 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index dfeca594fd7a..04e505bfd7dd 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -205,6 +205,50 @@ struct mem_cgroup_eventfd_list {
 static void mem_cgroup_threshold(struct mem_cgroup *mem);
 static void mem_cgroup_oom_notify(struct mem_cgroup *mem);
 
+enum {
+	SCAN_BY_LIMIT,
+	SCAN_BY_SYSTEM,
+	NR_SCAN_CONTEXT,
+	SCAN_BY_SHRINK,	/* not recorded now */
+};
+
+enum {
+	SCAN,
+	SCAN_ANON,
+	SCAN_FILE,
+	ROTATE,
+	ROTATE_ANON,
+	ROTATE_FILE,
+	FREED,
+	FREED_ANON,
+	FREED_FILE,
+	ELAPSED,
+	NR_SCANSTATS,
+};
+
+struct scanstat {
+	spinlock_t lock;
+	unsigned long stats[NR_SCAN_CONTEXT][NR_SCANSTATS];
+	unsigned long rootstats[NR_SCAN_CONTEXT][NR_SCANSTATS];
+};
+
+const char *scanstat_string[NR_SCANSTATS] = {
+	"scanned_pages",
+	"scanned_anon_pages",
+	"scanned_file_pages",
+	"rotated_pages",
+	"rotated_anon_pages",
+	"rotated_file_pages",
+	"freed_pages",
+	"freed_anon_pages",
+	"freed_file_pages",
+	"elapsed_ns",
+};
+#define SCANSTAT_WORD_LIMIT	"_by_limit"
+#define SCANSTAT_WORD_SYSTEM	"_by_system"
+#define SCANSTAT_WORD_HIERARCHY	"_under_hierarchy"
+
+
 /*
  * The memory controller data structure. The memory controller controls both
  * page cache and RSS per cgroup. We would eventually like to provide
@@ -270,7 +314,8 @@ struct mem_cgroup {
 
 	/* For oom notifier event fd */
 	struct list_head oom_notify;
-
+	/* For recording LRU-scan statistics */
+	struct scanstat scanstat;
 	/*
 	 * Should we move charges of a task when a task is moved into this
 	 * mem_cgroup ? And what type of charges should we move ?
@@ -1623,6 +1668,44 @@ bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
 }
 #endif
 
+static void __mem_cgroup_record_scanstat(unsigned long *stats,
+				struct memcg_scanrecord *rec)
+{
+
+	stats[SCAN] += rec->nr_scanned[0] + rec->nr_scanned[1];
+	stats[SCAN_ANON] += rec->nr_scanned[0];
+	stats[SCAN_FILE] += rec->nr_scanned[1];
+
+	stats[ROTATE] += rec->nr_rotated[0] + rec->nr_rotated[1];
+	stats[ROTATE_ANON] += rec->nr_rotated[0];
+	stats[ROTATE_FILE] += rec->nr_rotated[1];
+
+	stats[FREED] += rec->nr_freed[0] + rec->nr_freed[1];
+	stats[FREED_ANON] += rec->nr_freed[0];
+	stats[FREED_FILE] += rec->nr_freed[1];
+
+	stats[ELAPSED] += rec->elapsed;
+}
+
+static void mem_cgroup_record_scanstat(struct memcg_scanrecord *rec)
+{
+	struct mem_cgroup *mem;
+	int context = rec->context;
+
+	if (context >= NR_SCAN_CONTEXT)
+		return;
+
+	mem = rec->mem;
+	spin_lock(&mem->scanstat.lock);
+	__mem_cgroup_record_scanstat(mem->scanstat.stats[context], rec);
+	spin_unlock(&mem->scanstat.lock);
+
+	mem = rec->root;
+	spin_lock(&mem->scanstat.lock);
+	__mem_cgroup_record_scanstat(mem->scanstat.rootstats[context], rec);
+	spin_unlock(&mem->scanstat.lock);
+}
+
 /*
  * Scan the hierarchy if needed to reclaim memory. We remember the last child
  * we reclaimed from, so that we don't end up penalizing one child extensively
@@ -1647,8 +1730,9 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 	bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
 	bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
 	bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
+	struct memcg_scanrecord rec;
 	unsigned long excess;
-	unsigned long nr_scanned;
+	unsigned long scanned;
 
 	excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT;
 
@@ -1656,6 +1740,15 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 	if (!check_soft && !shrink && root_mem->memsw_is_minimum)
 		noswap = true;
 
+	if (shrink)
+		rec.context = SCAN_BY_SHRINK;
+	else if (check_soft)
+		rec.context = SCAN_BY_SYSTEM;
+	else
+		rec.context = SCAN_BY_LIMIT;
+
+	rec.root = root_mem;
+
 	while (1) {
 		victim = mem_cgroup_select_victim(root_mem);
 		if (victim == root_mem) {
@@ -1696,14 +1789,23 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 			css_put(&victim->css);
 			continue;
 		}
+		rec.mem = victim;
+		rec.nr_scanned[0] = 0;
+		rec.nr_scanned[1] = 0;
+		rec.nr_rotated[0] = 0;
+		rec.nr_rotated[1] = 0;
+		rec.nr_freed[0] = 0;
+		rec.nr_freed[1] = 0;
+		rec.elapsed = 0;
 		/* we use swappiness of local cgroup */
 		if (check_soft) {
 			ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
-				noswap, zone, &nr_scanned);
-			*total_scanned += nr_scanned;
+				noswap, zone, &rec, &scanned);
+			*total_scanned += scanned;
 		} else
 			ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
-						noswap);
+						noswap, &rec);
+		mem_cgroup_record_scanstat(&rec);
 		css_put(&victim->css);
 		/*
 		 * At shrinking usage, we can't check we should stop here or
@@ -3792,14 +3894,18 @@ try_to_free:
 	/* try to free all pages in this cgroup */
 	shrink = 1;
 	while (nr_retries && mem->res.usage > 0) {
+		struct memcg_scanrecord rec;
 		int progress;
 
 		if (signal_pending(current)) {
 			ret = -EINTR;
 			goto out;
 		}
+		rec.context = SCAN_BY_SHRINK;
+		rec.mem = mem;
+		rec.root = mem;
 		progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL,
-						false);
+						false, &rec);
 		if (!progress) {
 			nr_retries--;
 			/* maybe some writeback is necessary */
@@ -4643,6 +4749,54 @@ static int mem_control_numa_stat_open(struct inode *unused, struct file *file)
 }
 #endif /* CONFIG_NUMA */
 
+static int mem_cgroup_vmscan_stat_read(struct cgroup *cgrp,
+				struct cftype *cft,
+				struct cgroup_map_cb *cb)
+{
+	struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
+	char string[64];
+	int i;
+
+	for (i = 0; i < NR_SCANSTATS; i++) {
+		strcpy(string, scanstat_string[i]);
+		strcat(string, SCANSTAT_WORD_LIMIT);
+		cb->fill(cb, string, mem->scanstat.stats[SCAN_BY_LIMIT][i]);
+	}
+
+	for (i = 0; i < NR_SCANSTATS; i++) {
+		strcpy(string, scanstat_string[i]);
+		strcat(string, SCANSTAT_WORD_SYSTEM);
+		cb->fill(cb, string, mem->scanstat.stats[SCAN_BY_SYSTEM][i]);
+	}
+
+	for (i = 0; i < NR_SCANSTATS; i++) {
+		strcpy(string, scanstat_string[i]);
+		strcat(string, SCANSTAT_WORD_LIMIT);
+		strcat(string, SCANSTAT_WORD_HIERARCHY);
+		cb->fill(cb, string, mem->scanstat.rootstats[SCAN_BY_LIMIT][i]);
+	}
+	for (i = 0; i < NR_SCANSTATS; i++) {
+		strcpy(string, scanstat_string[i]);
+		strcat(string, SCANSTAT_WORD_SYSTEM);
+		strcat(string, SCANSTAT_WORD_HIERARCHY);
+		cb->fill(cb, string, mem->scanstat.rootstats[SCAN_BY_SYSTEM][i]);
+	}
+	return 0;
+}
+
+static int mem_cgroup_reset_vmscan_stat(struct cgroup *cgrp,
+				unsigned int event)
+{
+	struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
+
+	spin_lock(&mem->scanstat.lock);
+	memset(&mem->scanstat.stats, 0, sizeof(mem->scanstat.stats));
+	memset(&mem->scanstat.rootstats, 0, sizeof(mem->scanstat.rootstats));
+	spin_unlock(&mem->scanstat.lock);
+	return 0;
+}
+
+
 static struct cftype mem_cgroup_files[] = {
 	{
 		.name = "usage_in_bytes",
@@ -4713,6 +4867,11 @@ static struct cftype mem_cgroup_files[] = {
 		.mode = S_IRUGO,
 	},
 #endif
+	{
+		.name = "vmscan_stat",
+		.read_map = mem_cgroup_vmscan_stat_read,
+		.trigger = mem_cgroup_reset_vmscan_stat,
+	},
 };
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
@@ -4976,6 +5135,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 	atomic_set(&mem->refcnt, 1);
 	mem->move_charge_at_immigrate = 0;
 	mutex_init(&mem->thresholds_lock);
+	spin_lock_init(&mem->scanstat.lock);
 	return &mem->css;
 free_out:
 	__mem_cgroup_free(mem);