aboutsummaryrefslogtreecommitdiffstats
path: root/mm/memcontrol.c
diff options
context:
space:
mode:
authorKAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>2011-07-26 19:08:26 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2011-07-26 19:49:42 -0400
commit82f9d486e59f588c7d100865c36510644abda356 (patch)
tree266f3dcf4f57538196bddd77a129adfb2752335b /mm/memcontrol.c
parent108b6a78463bb8c7163e4f9779f36ad8bbade334 (diff)
memcg: add memory.vmscan_stat
The commit log of 0ae5e89c60c9 ("memcg: count the soft_limit reclaim in...") says it adds scanning stats to the memory.stat file. But it doesn't, because we considered that we needed to reach a consensus on such new APIs. This patch is a trial to add memory.vmscan_stat. This shows - the number of scanned pages (total, anon, file) - the number of rotated pages (total, anon, file) - the number of freed pages (total, anon, file) - the elapsed time in nanoseconds (including sleep/pause time) for both direct and soft reclaim. The biggest difference from Ying's original one is that this file can be reset by a write, as # echo 0 ...../memory.vmscan_stat Example of output is here. This is a result after make -j 6 kernel under 300M limit. [kamezawa@bluextal ~]$ cat /cgroup/memory/A/memory.scan_stat [kamezawa@bluextal ~]$ cat /cgroup/memory/A/memory.vmscan_stat scanned_pages_by_limit 9471864 scanned_anon_pages_by_limit 6640629 scanned_file_pages_by_limit 2831235 rotated_pages_by_limit 4243974 rotated_anon_pages_by_limit 3971968 rotated_file_pages_by_limit 272006 freed_pages_by_limit 2318492 freed_anon_pages_by_limit 962052 freed_file_pages_by_limit 1356440 elapsed_ns_by_limit 351386416101 scanned_pages_by_system 0 scanned_anon_pages_by_system 0 scanned_file_pages_by_system 0 rotated_pages_by_system 0 rotated_anon_pages_by_system 0 rotated_file_pages_by_system 0 freed_pages_by_system 0 freed_anon_pages_by_system 0 freed_file_pages_by_system 0 elapsed_ns_by_system 0 scanned_pages_by_limit_under_hierarchy 9471864 scanned_anon_pages_by_limit_under_hierarchy 6640629 scanned_file_pages_by_limit_under_hierarchy 2831235 rotated_pages_by_limit_under_hierarchy 4243974 rotated_anon_pages_by_limit_under_hierarchy 3971968 rotated_file_pages_by_limit_under_hierarchy 272006 freed_pages_by_limit_under_hierarchy 2318492 freed_anon_pages_by_limit_under_hierarchy 962052 freed_file_pages_by_limit_under_hierarchy 1356440 elapsed_ns_by_limit_under_hierarchy 351386416101 scanned_pages_by_system_under_hierarchy 0 
scanned_anon_pages_by_system_under_hierarchy 0 scanned_file_pages_by_system_under_hierarchy 0 rotated_pages_by_system_under_hierarchy 0 rotated_anon_pages_by_system_under_hierarchy 0 rotated_file_pages_by_system_under_hierarchy 0 freed_pages_by_system_under_hierarchy 0 freed_anon_pages_by_system_under_hierarchy 0 freed_file_pages_by_system_under_hierarchy 0 elapsed_ns_by_system_under_hierarchy 0 xxxx_under_hierarchy is for hierarchy management. This will be useful for further memcg development and needs to be developed before we do some complicated rework on LRU/softlimit management. This patch adds a new struct memcg_scanrecord into the scan_control struct. sc->nr_scanned et al. are not designed for exporting information. For example, nr_scanned is reset frequently and is incremented by 2 when scanning mapped pages. To avoid complexity, I added a new param in scan_control which is for exporting the scanning score. Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> Cc: Michal Hocko <mhocko@suse.cz> Cc: Ying Han <yinghan@google.com> Cc: Andrew Bresticker <abrestic@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--mm/memcontrol.c172
1 files changed, 166 insertions, 6 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index dfeca594fd7a..04e505bfd7dd 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -205,6 +205,50 @@ struct mem_cgroup_eventfd_list {
205static void mem_cgroup_threshold(struct mem_cgroup *mem); 205static void mem_cgroup_threshold(struct mem_cgroup *mem);
206static void mem_cgroup_oom_notify(struct mem_cgroup *mem); 206static void mem_cgroup_oom_notify(struct mem_cgroup *mem);
207 207
/*
 * Reclaim contexts. The values below NR_SCAN_CONTEXT are used as the first
 * index of the stats[] and rootstats[] arrays in struct scanstat.
 */
208enum {
209 SCAN_BY_LIMIT, /* chosen when reclaim is neither a shrink nor a soft-limit pass */
210 SCAN_BY_SYSTEM, /* chosen for soft-limit (check_soft) reclaim */
211 NR_SCAN_CONTEXT, /* number of recorded contexts; sizes the counter arrays */
212 SCAN_BY_SHRINK, /* not recorded now: placed after NR_SCAN_CONTEXT, so it has no counter slot */
213};
214
/*
 * Indices of the per-context counters (second index of the stats[] and
 * rootstats[] arrays). The order must stay in step with scanstat_string[],
 * which is indexed by the same values when the counters are exported.
 */
215enum {
216 SCAN, /* nr_scanned[0] + nr_scanned[1] */
217 SCAN_ANON, /* nr_scanned[0] */
218 SCAN_FILE, /* nr_scanned[1] */
219 ROTATE, /* nr_rotated[0] + nr_rotated[1] */
220 ROTATE_ANON, /* nr_rotated[0] */
221 ROTATE_FILE, /* nr_rotated[1] */
222 FREED, /* nr_freed[0] + nr_freed[1] */
223 FREED_ANON, /* nr_freed[0] */
224 FREED_FILE, /* nr_freed[1] */
225 ELAPSED, /* accumulated rec->elapsed, exported as "elapsed_ns" */
226 NR_SCANSTATS, /* number of counters; sizes the arrays and the name table */
227};
228
/*
 * LRU-scan statistics embedded in each mem_cgroup.
 * Both arrays are indexed as [scan context][counter].
 */
229struct scanstat {
230 spinlock_t lock; /* taken around counter updates and around the reset */
231 unsigned long stats[NR_SCAN_CONTEXT][NR_SCANSTATS]; /* updated when this group is the one reclaimed from (rec->mem) */
232 unsigned long rootstats[NR_SCAN_CONTEXT][NR_SCANSTATS]; /* updated when this group is the reclaim root (rec->root); exported with "_under_hierarchy" */
233};
234
/*
 * Base key names of the exported counters, indexed by the SCAN ... ELAPSED
 * enum values. The vmscan_stat read handler appends a context suffix
 * ("_by_limit" or "_by_system") and, for the rootstats copy,
 * "_under_hierarchy".
 */
235const char *scanstat_string[NR_SCANSTATS] = {
236 "scanned_pages",
237 "scanned_anon_pages",
238 "scanned_file_pages",
239 "rotated_pages",
240 "rotated_anon_pages",
241 "rotated_file_pages",
242 "freed_pages",
243 "freed_anon_pages",
244 "freed_file_pages",
245 "elapsed_ns",
246};
/* Suffixes appended to scanstat_string[] entries to build the exported key names. */
247#define SCANSTAT_WORD_LIMIT "_by_limit"
248#define SCANSTAT_WORD_SYSTEM "_by_system"
249#define SCANSTAT_WORD_HIERARCHY "_under_hierarchy"
250
251
208/* 252/*
209 * The memory controller data structure. The memory controller controls both 253 * The memory controller data structure. The memory controller controls both
210 * page cache and RSS per cgroup. We would eventually like to provide 254 * page cache and RSS per cgroup. We would eventually like to provide
@@ -270,7 +314,8 @@ struct mem_cgroup {
270 314
271 /* For oom notifier event fd */ 315 /* For oom notifier event fd */
272 struct list_head oom_notify; 316 struct list_head oom_notify;
273 317 /* For recording LRU-scan statistics */
318 struct scanstat scanstat;
274 /* 319 /*
275 * Should we move charges of a task when a task is moved into this 320 * Should we move charges of a task when a task is moved into this
276 * mem_cgroup ? And what type of charges should we move ? 321 * mem_cgroup ? And what type of charges should we move ?
@@ -1623,6 +1668,44 @@ bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
1623} 1668}
1624#endif 1669#endif
1625 1670
1671static void __mem_cgroup_record_scanstat(unsigned long *stats,
1672 struct memcg_scanrecord *rec)
1673{
1674
1675 stats[SCAN] += rec->nr_scanned[0] + rec->nr_scanned[1];
1676 stats[SCAN_ANON] += rec->nr_scanned[0];
1677 stats[SCAN_FILE] += rec->nr_scanned[1];
1678
1679 stats[ROTATE] += rec->nr_rotated[0] + rec->nr_rotated[1];
1680 stats[ROTATE_ANON] += rec->nr_rotated[0];
1681 stats[ROTATE_FILE] += rec->nr_rotated[1];
1682
1683 stats[FREED] += rec->nr_freed[0] + rec->nr_freed[1];
1684 stats[FREED_ANON] += rec->nr_freed[0];
1685 stats[FREED_FILE] += rec->nr_freed[1];
1686
1687 stats[ELAPSED] += rec->elapsed;
1688}
1689
1690static void mem_cgroup_record_scanstat(struct memcg_scanrecord *rec)
1691{
1692 struct mem_cgroup *mem;
1693 int context = rec->context;
1694
1695 if (context >= NR_SCAN_CONTEXT)
1696 return;
1697
1698 mem = rec->mem;
1699 spin_lock(&mem->scanstat.lock);
1700 __mem_cgroup_record_scanstat(mem->scanstat.stats[context], rec);
1701 spin_unlock(&mem->scanstat.lock);
1702
1703 mem = rec->root;
1704 spin_lock(&mem->scanstat.lock);
1705 __mem_cgroup_record_scanstat(mem->scanstat.rootstats[context], rec);
1706 spin_unlock(&mem->scanstat.lock);
1707}
1708
1626/* 1709/*
1627 * Scan the hierarchy if needed to reclaim memory. We remember the last child 1710 * Scan the hierarchy if needed to reclaim memory. We remember the last child
1628 * we reclaimed from, so that we don't end up penalizing one child extensively 1711 * we reclaimed from, so that we don't end up penalizing one child extensively
@@ -1647,8 +1730,9 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1647 bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP; 1730 bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
1648 bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK; 1731 bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
1649 bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; 1732 bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
1733 struct memcg_scanrecord rec;
1650 unsigned long excess; 1734 unsigned long excess;
1651 unsigned long nr_scanned; 1735 unsigned long scanned;
1652 1736
1653 excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT; 1737 excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT;
1654 1738
@@ -1656,6 +1740,15 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1656 if (!check_soft && !shrink && root_mem->memsw_is_minimum) 1740 if (!check_soft && !shrink && root_mem->memsw_is_minimum)
1657 noswap = true; 1741 noswap = true;
1658 1742
1743 if (shrink)
1744 rec.context = SCAN_BY_SHRINK;
1745 else if (check_soft)
1746 rec.context = SCAN_BY_SYSTEM;
1747 else
1748 rec.context = SCAN_BY_LIMIT;
1749
1750 rec.root = root_mem;
1751
1659 while (1) { 1752 while (1) {
1660 victim = mem_cgroup_select_victim(root_mem); 1753 victim = mem_cgroup_select_victim(root_mem);
1661 if (victim == root_mem) { 1754 if (victim == root_mem) {
@@ -1696,14 +1789,23 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1696 css_put(&victim->css); 1789 css_put(&victim->css);
1697 continue; 1790 continue;
1698 } 1791 }
1792 rec.mem = victim;
1793 rec.nr_scanned[0] = 0;
1794 rec.nr_scanned[1] = 0;
1795 rec.nr_rotated[0] = 0;
1796 rec.nr_rotated[1] = 0;
1797 rec.nr_freed[0] = 0;
1798 rec.nr_freed[1] = 0;
1799 rec.elapsed = 0;
1699 /* we use swappiness of local cgroup */ 1800 /* we use swappiness of local cgroup */
1700 if (check_soft) { 1801 if (check_soft) {
1701 ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, 1802 ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
1702 noswap, zone, &nr_scanned); 1803 noswap, zone, &rec, &scanned);
1703 *total_scanned += nr_scanned; 1804 *total_scanned += scanned;
1704 } else 1805 } else
1705 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, 1806 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
1706 noswap); 1807 noswap, &rec);
1808 mem_cgroup_record_scanstat(&rec);
1707 css_put(&victim->css); 1809 css_put(&victim->css);
1708 /* 1810 /*
1709 * At shrinking usage, we can't check we should stop here or 1811 * At shrinking usage, we can't check we should stop here or
@@ -3792,14 +3894,18 @@ try_to_free:
3792 /* try to free all pages in this cgroup */ 3894 /* try to free all pages in this cgroup */
3793 shrink = 1; 3895 shrink = 1;
3794 while (nr_retries && mem->res.usage > 0) { 3896 while (nr_retries && mem->res.usage > 0) {
3897 struct memcg_scanrecord rec;
3795 int progress; 3898 int progress;
3796 3899
3797 if (signal_pending(current)) { 3900 if (signal_pending(current)) {
3798 ret = -EINTR; 3901 ret = -EINTR;
3799 goto out; 3902 goto out;
3800 } 3903 }
3904 rec.context = SCAN_BY_SHRINK;
3905 rec.mem = mem;
3906 rec.root = mem;
3801 progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL, 3907 progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL,
3802 false); 3908 false, &rec);
3803 if (!progress) { 3909 if (!progress) {
3804 nr_retries--; 3910 nr_retries--;
3805 /* maybe some writeback is necessary */ 3911 /* maybe some writeback is necessary */
@@ -4643,6 +4749,54 @@ static int mem_control_numa_stat_open(struct inode *unused, struct file *file)
4643} 4749}
4644#endif /* CONFIG_NUMA */ 4750#endif /* CONFIG_NUMA */
4645 4751
4752static int mem_cgroup_vmscan_stat_read(struct cgroup *cgrp,
4753 struct cftype *cft,
4754 struct cgroup_map_cb *cb)
4755{
4756 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
4757 char string[64];
4758 int i;
4759
4760 for (i = 0; i < NR_SCANSTATS; i++) {
4761 strcpy(string, scanstat_string[i]);
4762 strcat(string, SCANSTAT_WORD_LIMIT);
4763 cb->fill(cb, string, mem->scanstat.stats[SCAN_BY_LIMIT][i]);
4764 }
4765
4766 for (i = 0; i < NR_SCANSTATS; i++) {
4767 strcpy(string, scanstat_string[i]);
4768 strcat(string, SCANSTAT_WORD_SYSTEM);
4769 cb->fill(cb, string, mem->scanstat.stats[SCAN_BY_SYSTEM][i]);
4770 }
4771
4772 for (i = 0; i < NR_SCANSTATS; i++) {
4773 strcpy(string, scanstat_string[i]);
4774 strcat(string, SCANSTAT_WORD_LIMIT);
4775 strcat(string, SCANSTAT_WORD_HIERARCHY);
4776 cb->fill(cb, string, mem->scanstat.rootstats[SCAN_BY_LIMIT][i]);
4777 }
4778 for (i = 0; i < NR_SCANSTATS; i++) {
4779 strcpy(string, scanstat_string[i]);
4780 strcat(string, SCANSTAT_WORD_SYSTEM);
4781 strcat(string, SCANSTAT_WORD_HIERARCHY);
4782 cb->fill(cb, string, mem->scanstat.rootstats[SCAN_BY_SYSTEM][i]);
4783 }
4784 return 0;
4785}
4786
4787static int mem_cgroup_reset_vmscan_stat(struct cgroup *cgrp,
4788 unsigned int event)
4789{
4790 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
4791
4792 spin_lock(&mem->scanstat.lock);
4793 memset(&mem->scanstat.stats, 0, sizeof(mem->scanstat.stats));
4794 memset(&mem->scanstat.rootstats, 0, sizeof(mem->scanstat.rootstats));
4795 spin_unlock(&mem->scanstat.lock);
4796 return 0;
4797}
4798
4799
4646static struct cftype mem_cgroup_files[] = { 4800static struct cftype mem_cgroup_files[] = {
4647 { 4801 {
4648 .name = "usage_in_bytes", 4802 .name = "usage_in_bytes",
@@ -4713,6 +4867,11 @@ static struct cftype mem_cgroup_files[] = {
4713 .mode = S_IRUGO, 4867 .mode = S_IRUGO,
4714 }, 4868 },
4715#endif 4869#endif
4870 {
4871 .name = "vmscan_stat",
4872 .read_map = mem_cgroup_vmscan_stat_read,
4873 .trigger = mem_cgroup_reset_vmscan_stat,
4874 },
4716}; 4875};
4717 4876
4718#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 4877#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
@@ -4976,6 +5135,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
4976 atomic_set(&mem->refcnt, 1); 5135 atomic_set(&mem->refcnt, 1);
4977 mem->move_charge_at_immigrate = 0; 5136 mem->move_charge_at_immigrate = 0;
4978 mutex_init(&mem->thresholds_lock); 5137 mutex_init(&mem->thresholds_lock);
5138 spin_lock_init(&mem->scanstat.lock);
4979 return &mem->css; 5139 return &mem->css;
4980free_out: 5140free_out:
4981 __mem_cgroup_free(mem); 5141 __mem_cgroup_free(mem);