diff options
author | KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> | 2011-07-26 19:08:26 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2011-07-26 19:49:42 -0400 |
commit | 82f9d486e59f588c7d100865c36510644abda356 (patch) | |
tree | 266f3dcf4f57538196bddd77a129adfb2752335b | |
parent | 108b6a78463bb8c7163e4f9779f36ad8bbade334 (diff) |
memcg: add memory.vmscan_stat
The commit log of 0ae5e89c60c9 ("memcg: count the soft_limit reclaim
in...") says it adds scanning stats to memory.stat file. But it doesn't
because we considered that we needed to reach a consensus on such new APIs.
This patch is a trial to add memory.vmscan_stat. This shows
- the number of scanned pages(total, anon, file)
- the number of rotated pages(total, anon, file)
- the number of freed pages(total, anon, file)
- the elapsed time (including sleep/pause time)
for both of direct/soft reclaim.
The biggest difference from Ying's original one is that this file
can be reset by a write, as
# echo 0 > ...../memory.vmscan_stat
Example of output is here. This is a result after make -j 6 kernel
under 300M limit.
[kamezawa@bluextal ~]$ cat /cgroup/memory/A/memory.scan_stat
[kamezawa@bluextal ~]$ cat /cgroup/memory/A/memory.vmscan_stat
scanned_pages_by_limit 9471864
scanned_anon_pages_by_limit 6640629
scanned_file_pages_by_limit 2831235
rotated_pages_by_limit 4243974
rotated_anon_pages_by_limit 3971968
rotated_file_pages_by_limit 272006
freed_pages_by_limit 2318492
freed_anon_pages_by_limit 962052
freed_file_pages_by_limit 1356440
elapsed_ns_by_limit 351386416101
scanned_pages_by_system 0
scanned_anon_pages_by_system 0
scanned_file_pages_by_system 0
rotated_pages_by_system 0
rotated_anon_pages_by_system 0
rotated_file_pages_by_system 0
freed_pages_by_system 0
freed_anon_pages_by_system 0
freed_file_pages_by_system 0
elapsed_ns_by_system 0
scanned_pages_by_limit_under_hierarchy 9471864
scanned_anon_pages_by_limit_under_hierarchy 6640629
scanned_file_pages_by_limit_under_hierarchy 2831235
rotated_pages_by_limit_under_hierarchy 4243974
rotated_anon_pages_by_limit_under_hierarchy 3971968
rotated_file_pages_by_limit_under_hierarchy 272006
freed_pages_by_limit_under_hierarchy 2318492
freed_anon_pages_by_limit_under_hierarchy 962052
freed_file_pages_by_limit_under_hierarchy 1356440
elapsed_ns_by_limit_under_hierarchy 351386416101
scanned_pages_by_system_under_hierarchy 0
scanned_anon_pages_by_system_under_hierarchy 0
scanned_file_pages_by_system_under_hierarchy 0
rotated_pages_by_system_under_hierarchy 0
rotated_anon_pages_by_system_under_hierarchy 0
rotated_file_pages_by_system_under_hierarchy 0
freed_pages_by_system_under_hierarchy 0
freed_anon_pages_by_system_under_hierarchy 0
freed_file_pages_by_system_under_hierarchy 0
elapsed_ns_by_system_under_hierarchy 0
xxxx_under_hierarchy is for hierarchy management.
This will be useful for further memcg development and needs to be
developed before we do some complicated rework on LRU/softlimit
management.
This patch adds a new struct memcg_scanrecord into scan_control struct.
sc->nr_scanned et al. are not designed for exporting information. For
example, nr_scanned is reset frequently and incremented by 2 when scanning
mapped pages.
To avoid complexity, I added a new param in scan_control which is for
exporting scanning score.
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Ying Han <yinghan@google.com>
Cc: Andrew Bresticker <abrestic@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- | Documentation/cgroups/memory.txt | 85 | ||||
-rw-r--r-- | include/linux/memcontrol.h | 19 | ||||
-rw-r--r-- | include/linux/swap.h | 6 | ||||
-rw-r--r-- | mm/memcontrol.c | 172 | ||||
-rw-r--r-- | mm/vmscan.c | 39 |
5 files changed, 303 insertions, 18 deletions
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index 06eb6d957c83..6f3c598971fc 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt | |||
@@ -380,7 +380,7 @@ will be charged as a new owner of it. | |||
380 | 380 | ||
381 | 5.2 stat file | 381 | 5.2 stat file |
382 | 382 | ||
383 | memory.stat file includes following statistics | 383 | 5.2.1 memory.stat file includes following statistics |
384 | 384 | ||
385 | # per-memory cgroup local status | 385 | # per-memory cgroup local status |
386 | cache - # of bytes of page cache memory. | 386 | cache - # of bytes of page cache memory. |
@@ -438,6 +438,89 @@ Note: | |||
438 | file_mapped is accounted only when the memory cgroup is owner of page | 438 | file_mapped is accounted only when the memory cgroup is owner of page |
439 | cache.) | 439 | cache.) |
440 | 440 | ||
441 | 5.2.2 memory.vmscan_stat | ||
442 | |||
443 | memory.vmscan_stat includes statistics for memory scanning, freeing and | ||
444 | reclaiming. The statistics show memory scanning information since | ||
445 | memory cgroup creation and can be reset to 0 by writing 0 as | ||
446 | |||
447 | #echo 0 > ../memory.vmscan_stat | ||
448 | |||
449 | This file contains following statistics. | ||
450 | |||
451 | [param]_[file_or_anon]_pages_by_[reason]_[under_hierarchy] | ||
452 | elapsed_ns_by_[reason]_[under_hierarchy] | ||
453 | |||
454 | For example, | ||
455 | |||
456 | scanned_file_pages_by_limit indicates the number of scanned | ||
457 | file pages at vmscan. | ||
458 | |||
459 | Now, 3 parameters are supported | ||
460 | |||
461 | scanned - the number of pages scanned by vmscan | ||
462 | rotated - the number of pages activated at vmscan | ||
463 | freed - the number of pages freed by vmscan | ||
464 | |||
465 | If "rotated" is high against scanned/freed, the memcg seems busy. | ||
466 | |||
467 | Now, 2 reasons are supported | ||
468 | |||
469 | limit - the memory cgroup's limit | ||
470 | system - global memory pressure + softlimit | ||
471 | (global memory pressure not under softlimit is not handled now) | ||
472 | |||
473 | When under_hierarchy is added in the tail, the number indicates the | ||
474 | total memcg scan of its children and itself. | ||
475 | |||
476 | elapsed_ns is the elapsed time in nanoseconds. This may include sleep time | ||
477 | and does not indicate CPU usage. So, please take this as just showing | ||
478 | latency. | ||
479 | |||
480 | Here is an example. | ||
481 | |||
482 | # cat /cgroup/memory/A/memory.vmscan_stat | ||
483 | scanned_pages_by_limit 9471864 | ||
484 | scanned_anon_pages_by_limit 6640629 | ||
485 | scanned_file_pages_by_limit 2831235 | ||
486 | rotated_pages_by_limit 4243974 | ||
487 | rotated_anon_pages_by_limit 3971968 | ||
488 | rotated_file_pages_by_limit 272006 | ||
489 | freed_pages_by_limit 2318492 | ||
490 | freed_anon_pages_by_limit 962052 | ||
491 | freed_file_pages_by_limit 1356440 | ||
492 | elapsed_ns_by_limit 351386416101 | ||
493 | scanned_pages_by_system 0 | ||
494 | scanned_anon_pages_by_system 0 | ||
495 | scanned_file_pages_by_system 0 | ||
496 | rotated_pages_by_system 0 | ||
497 | rotated_anon_pages_by_system 0 | ||
498 | rotated_file_pages_by_system 0 | ||
499 | freed_pages_by_system 0 | ||
500 | freed_anon_pages_by_system 0 | ||
501 | freed_file_pages_by_system 0 | ||
502 | elapsed_ns_by_system 0 | ||
503 | scanned_pages_by_limit_under_hierarchy 9471864 | ||
504 | scanned_anon_pages_by_limit_under_hierarchy 6640629 | ||
505 | scanned_file_pages_by_limit_under_hierarchy 2831235 | ||
506 | rotated_pages_by_limit_under_hierarchy 4243974 | ||
507 | rotated_anon_pages_by_limit_under_hierarchy 3971968 | ||
508 | rotated_file_pages_by_limit_under_hierarchy 272006 | ||
509 | freed_pages_by_limit_under_hierarchy 2318492 | ||
510 | freed_anon_pages_by_limit_under_hierarchy 962052 | ||
511 | freed_file_pages_by_limit_under_hierarchy 1356440 | ||
512 | elapsed_ns_by_limit_under_hierarchy 351386416101 | ||
513 | scanned_pages_by_system_under_hierarchy 0 | ||
514 | scanned_anon_pages_by_system_under_hierarchy 0 | ||
515 | scanned_file_pages_by_system_under_hierarchy 0 | ||
516 | rotated_pages_by_system_under_hierarchy 0 | ||
517 | rotated_anon_pages_by_system_under_hierarchy 0 | ||
518 | rotated_file_pages_by_system_under_hierarchy 0 | ||
519 | freed_pages_by_system_under_hierarchy 0 | ||
520 | freed_anon_pages_by_system_under_hierarchy 0 | ||
521 | freed_file_pages_by_system_under_hierarchy 0 | ||
522 | elapsed_ns_by_system_under_hierarchy 0 | ||
523 | |||
441 | 5.3 swappiness | 524 | 5.3 swappiness |
442 | 525 | ||
443 | Similar to /proc/sys/vm/swappiness, but affecting a hierarchy of groups only. | 526 | Similar to /proc/sys/vm/swappiness, but affecting a hierarchy of groups only. |
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index affd5b19b86c..b96600786913 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h | |||
@@ -39,6 +39,16 @@ extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, | |||
39 | struct mem_cgroup *mem_cont, | 39 | struct mem_cgroup *mem_cont, |
40 | int active, int file); | 40 | int active, int file); |
41 | 41 | ||
42 | struct memcg_scanrecord { | ||
43 | struct mem_cgroup *mem; /* scanend memory cgroup */ | ||
44 | struct mem_cgroup *root; /* scan target hierarchy root */ | ||
45 | int context; /* scanning context (see memcontrol.c) */ | ||
46 | unsigned long nr_scanned[2]; /* the number of scanned pages */ | ||
47 | unsigned long nr_rotated[2]; /* the number of rotated pages */ | ||
48 | unsigned long nr_freed[2]; /* the number of freed pages */ | ||
49 | unsigned long elapsed; /* nsec of time elapsed while scanning */ | ||
50 | }; | ||
51 | |||
42 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | 52 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR |
43 | /* | 53 | /* |
44 | * All "charge" functions with gfp_mask should use GFP_KERNEL or | 54 | * All "charge" functions with gfp_mask should use GFP_KERNEL or |
@@ -119,6 +129,15 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page); | |||
119 | extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, | 129 | extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, |
120 | struct task_struct *p); | 130 | struct task_struct *p); |
121 | 131 | ||
132 | extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem, | ||
133 | gfp_t gfp_mask, bool noswap, | ||
134 | struct memcg_scanrecord *rec); | ||
135 | extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, | ||
136 | gfp_t gfp_mask, bool noswap, | ||
137 | struct zone *zone, | ||
138 | struct memcg_scanrecord *rec, | ||
139 | unsigned long *nr_scanned); | ||
140 | |||
122 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 141 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP |
123 | extern int do_swap_account; | 142 | extern int do_swap_account; |
124 | #endif | 143 | #endif |
diff --git a/include/linux/swap.h b/include/linux/swap.h index 44558b600ee3..91d5fcc83116 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h | |||
@@ -251,12 +251,6 @@ static inline void lru_cache_add_file(struct page *page) | |||
251 | /* linux/mm/vmscan.c */ | 251 | /* linux/mm/vmscan.c */ |
252 | extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | 252 | extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, |
253 | gfp_t gfp_mask, nodemask_t *mask); | 253 | gfp_t gfp_mask, nodemask_t *mask); |
254 | extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem, | ||
255 | gfp_t gfp_mask, bool noswap); | ||
256 | extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, | ||
257 | gfp_t gfp_mask, bool noswap, | ||
258 | struct zone *zone, | ||
259 | unsigned long *nr_scanned); | ||
260 | extern int __isolate_lru_page(struct page *page, int mode, int file); | 254 | extern int __isolate_lru_page(struct page *page, int mode, int file); |
261 | extern unsigned long shrink_all_memory(unsigned long nr_pages); | 255 | extern unsigned long shrink_all_memory(unsigned long nr_pages); |
262 | extern int vm_swappiness; | 256 | extern int vm_swappiness; |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index dfeca594fd7a..04e505bfd7dd 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -205,6 +205,50 @@ struct mem_cgroup_eventfd_list { | |||
205 | static void mem_cgroup_threshold(struct mem_cgroup *mem); | 205 | static void mem_cgroup_threshold(struct mem_cgroup *mem); |
206 | static void mem_cgroup_oom_notify(struct mem_cgroup *mem); | 206 | static void mem_cgroup_oom_notify(struct mem_cgroup *mem); |
207 | 207 | ||
208 | enum { | ||
209 | SCAN_BY_LIMIT, | ||
210 | SCAN_BY_SYSTEM, | ||
211 | NR_SCAN_CONTEXT, | ||
212 | SCAN_BY_SHRINK, /* not recorded now */ | ||
213 | }; | ||
214 | |||
215 | enum { | ||
216 | SCAN, | ||
217 | SCAN_ANON, | ||
218 | SCAN_FILE, | ||
219 | ROTATE, | ||
220 | ROTATE_ANON, | ||
221 | ROTATE_FILE, | ||
222 | FREED, | ||
223 | FREED_ANON, | ||
224 | FREED_FILE, | ||
225 | ELAPSED, | ||
226 | NR_SCANSTATS, | ||
227 | }; | ||
228 | |||
229 | struct scanstat { | ||
230 | spinlock_t lock; | ||
231 | unsigned long stats[NR_SCAN_CONTEXT][NR_SCANSTATS]; | ||
232 | unsigned long rootstats[NR_SCAN_CONTEXT][NR_SCANSTATS]; | ||
233 | }; | ||
234 | |||
235 | const char *scanstat_string[NR_SCANSTATS] = { | ||
236 | "scanned_pages", | ||
237 | "scanned_anon_pages", | ||
238 | "scanned_file_pages", | ||
239 | "rotated_pages", | ||
240 | "rotated_anon_pages", | ||
241 | "rotated_file_pages", | ||
242 | "freed_pages", | ||
243 | "freed_anon_pages", | ||
244 | "freed_file_pages", | ||
245 | "elapsed_ns", | ||
246 | }; | ||
247 | #define SCANSTAT_WORD_LIMIT "_by_limit" | ||
248 | #define SCANSTAT_WORD_SYSTEM "_by_system" | ||
249 | #define SCANSTAT_WORD_HIERARCHY "_under_hierarchy" | ||
250 | |||
251 | |||
208 | /* | 252 | /* |
209 | * The memory controller data structure. The memory controller controls both | 253 | * The memory controller data structure. The memory controller controls both |
210 | * page cache and RSS per cgroup. We would eventually like to provide | 254 | * page cache and RSS per cgroup. We would eventually like to provide |
@@ -270,7 +314,8 @@ struct mem_cgroup { | |||
270 | 314 | ||
271 | /* For oom notifier event fd */ | 315 | /* For oom notifier event fd */ |
272 | struct list_head oom_notify; | 316 | struct list_head oom_notify; |
273 | 317 | /* For recording LRU-scan statistics */ | |
318 | struct scanstat scanstat; | ||
274 | /* | 319 | /* |
275 | * Should we move charges of a task when a task is moved into this | 320 | * Should we move charges of a task when a task is moved into this |
276 | * mem_cgroup ? And what type of charges should we move ? | 321 | * mem_cgroup ? And what type of charges should we move ? |
@@ -1623,6 +1668,44 @@ bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap) | |||
1623 | } | 1668 | } |
1624 | #endif | 1669 | #endif |
1625 | 1670 | ||
1671 | static void __mem_cgroup_record_scanstat(unsigned long *stats, | ||
1672 | struct memcg_scanrecord *rec) | ||
1673 | { | ||
1674 | |||
1675 | stats[SCAN] += rec->nr_scanned[0] + rec->nr_scanned[1]; | ||
1676 | stats[SCAN_ANON] += rec->nr_scanned[0]; | ||
1677 | stats[SCAN_FILE] += rec->nr_scanned[1]; | ||
1678 | |||
1679 | stats[ROTATE] += rec->nr_rotated[0] + rec->nr_rotated[1]; | ||
1680 | stats[ROTATE_ANON] += rec->nr_rotated[0]; | ||
1681 | stats[ROTATE_FILE] += rec->nr_rotated[1]; | ||
1682 | |||
1683 | stats[FREED] += rec->nr_freed[0] + rec->nr_freed[1]; | ||
1684 | stats[FREED_ANON] += rec->nr_freed[0]; | ||
1685 | stats[FREED_FILE] += rec->nr_freed[1]; | ||
1686 | |||
1687 | stats[ELAPSED] += rec->elapsed; | ||
1688 | } | ||
1689 | |||
1690 | static void mem_cgroup_record_scanstat(struct memcg_scanrecord *rec) | ||
1691 | { | ||
1692 | struct mem_cgroup *mem; | ||
1693 | int context = rec->context; | ||
1694 | |||
1695 | if (context >= NR_SCAN_CONTEXT) | ||
1696 | return; | ||
1697 | |||
1698 | mem = rec->mem; | ||
1699 | spin_lock(&mem->scanstat.lock); | ||
1700 | __mem_cgroup_record_scanstat(mem->scanstat.stats[context], rec); | ||
1701 | spin_unlock(&mem->scanstat.lock); | ||
1702 | |||
1703 | mem = rec->root; | ||
1704 | spin_lock(&mem->scanstat.lock); | ||
1705 | __mem_cgroup_record_scanstat(mem->scanstat.rootstats[context], rec); | ||
1706 | spin_unlock(&mem->scanstat.lock); | ||
1707 | } | ||
1708 | |||
1626 | /* | 1709 | /* |
1627 | * Scan the hierarchy if needed to reclaim memory. We remember the last child | 1710 | * Scan the hierarchy if needed to reclaim memory. We remember the last child |
1628 | * we reclaimed from, so that we don't end up penalizing one child extensively | 1711 | * we reclaimed from, so that we don't end up penalizing one child extensively |
@@ -1647,8 +1730,9 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
1647 | bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP; | 1730 | bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP; |
1648 | bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK; | 1731 | bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK; |
1649 | bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; | 1732 | bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; |
1733 | struct memcg_scanrecord rec; | ||
1650 | unsigned long excess; | 1734 | unsigned long excess; |
1651 | unsigned long nr_scanned; | 1735 | unsigned long scanned; |
1652 | 1736 | ||
1653 | excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT; | 1737 | excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT; |
1654 | 1738 | ||
@@ -1656,6 +1740,15 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
1656 | if (!check_soft && !shrink && root_mem->memsw_is_minimum) | 1740 | if (!check_soft && !shrink && root_mem->memsw_is_minimum) |
1657 | noswap = true; | 1741 | noswap = true; |
1658 | 1742 | ||
1743 | if (shrink) | ||
1744 | rec.context = SCAN_BY_SHRINK; | ||
1745 | else if (check_soft) | ||
1746 | rec.context = SCAN_BY_SYSTEM; | ||
1747 | else | ||
1748 | rec.context = SCAN_BY_LIMIT; | ||
1749 | |||
1750 | rec.root = root_mem; | ||
1751 | |||
1659 | while (1) { | 1752 | while (1) { |
1660 | victim = mem_cgroup_select_victim(root_mem); | 1753 | victim = mem_cgroup_select_victim(root_mem); |
1661 | if (victim == root_mem) { | 1754 | if (victim == root_mem) { |
@@ -1696,14 +1789,23 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
1696 | css_put(&victim->css); | 1789 | css_put(&victim->css); |
1697 | continue; | 1790 | continue; |
1698 | } | 1791 | } |
1792 | rec.mem = victim; | ||
1793 | rec.nr_scanned[0] = 0; | ||
1794 | rec.nr_scanned[1] = 0; | ||
1795 | rec.nr_rotated[0] = 0; | ||
1796 | rec.nr_rotated[1] = 0; | ||
1797 | rec.nr_freed[0] = 0; | ||
1798 | rec.nr_freed[1] = 0; | ||
1799 | rec.elapsed = 0; | ||
1699 | /* we use swappiness of local cgroup */ | 1800 | /* we use swappiness of local cgroup */ |
1700 | if (check_soft) { | 1801 | if (check_soft) { |
1701 | ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, | 1802 | ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, |
1702 | noswap, zone, &nr_scanned); | 1803 | noswap, zone, &rec, &scanned); |
1703 | *total_scanned += nr_scanned; | 1804 | *total_scanned += scanned; |
1704 | } else | 1805 | } else |
1705 | ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, | 1806 | ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, |
1706 | noswap); | 1807 | noswap, &rec); |
1808 | mem_cgroup_record_scanstat(&rec); | ||
1707 | css_put(&victim->css); | 1809 | css_put(&victim->css); |
1708 | /* | 1810 | /* |
1709 | * At shrinking usage, we can't check we should stop here or | 1811 | * At shrinking usage, we can't check we should stop here or |
@@ -3792,14 +3894,18 @@ try_to_free: | |||
3792 | /* try to free all pages in this cgroup */ | 3894 | /* try to free all pages in this cgroup */ |
3793 | shrink = 1; | 3895 | shrink = 1; |
3794 | while (nr_retries && mem->res.usage > 0) { | 3896 | while (nr_retries && mem->res.usage > 0) { |
3897 | struct memcg_scanrecord rec; | ||
3795 | int progress; | 3898 | int progress; |
3796 | 3899 | ||
3797 | if (signal_pending(current)) { | 3900 | if (signal_pending(current)) { |
3798 | ret = -EINTR; | 3901 | ret = -EINTR; |
3799 | goto out; | 3902 | goto out; |
3800 | } | 3903 | } |
3904 | rec.context = SCAN_BY_SHRINK; | ||
3905 | rec.mem = mem; | ||
3906 | rec.root = mem; | ||
3801 | progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL, | 3907 | progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL, |
3802 | false); | 3908 | false, &rec); |
3803 | if (!progress) { | 3909 | if (!progress) { |
3804 | nr_retries--; | 3910 | nr_retries--; |
3805 | /* maybe some writeback is necessary */ | 3911 | /* maybe some writeback is necessary */ |
@@ -4643,6 +4749,54 @@ static int mem_control_numa_stat_open(struct inode *unused, struct file *file) | |||
4643 | } | 4749 | } |
4644 | #endif /* CONFIG_NUMA */ | 4750 | #endif /* CONFIG_NUMA */ |
4645 | 4751 | ||
4752 | static int mem_cgroup_vmscan_stat_read(struct cgroup *cgrp, | ||
4753 | struct cftype *cft, | ||
4754 | struct cgroup_map_cb *cb) | ||
4755 | { | ||
4756 | struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); | ||
4757 | char string[64]; | ||
4758 | int i; | ||
4759 | |||
4760 | for (i = 0; i < NR_SCANSTATS; i++) { | ||
4761 | strcpy(string, scanstat_string[i]); | ||
4762 | strcat(string, SCANSTAT_WORD_LIMIT); | ||
4763 | cb->fill(cb, string, mem->scanstat.stats[SCAN_BY_LIMIT][i]); | ||
4764 | } | ||
4765 | |||
4766 | for (i = 0; i < NR_SCANSTATS; i++) { | ||
4767 | strcpy(string, scanstat_string[i]); | ||
4768 | strcat(string, SCANSTAT_WORD_SYSTEM); | ||
4769 | cb->fill(cb, string, mem->scanstat.stats[SCAN_BY_SYSTEM][i]); | ||
4770 | } | ||
4771 | |||
4772 | for (i = 0; i < NR_SCANSTATS; i++) { | ||
4773 | strcpy(string, scanstat_string[i]); | ||
4774 | strcat(string, SCANSTAT_WORD_LIMIT); | ||
4775 | strcat(string, SCANSTAT_WORD_HIERARCHY); | ||
4776 | cb->fill(cb, string, mem->scanstat.rootstats[SCAN_BY_LIMIT][i]); | ||
4777 | } | ||
4778 | for (i = 0; i < NR_SCANSTATS; i++) { | ||
4779 | strcpy(string, scanstat_string[i]); | ||
4780 | strcat(string, SCANSTAT_WORD_SYSTEM); | ||
4781 | strcat(string, SCANSTAT_WORD_HIERARCHY); | ||
4782 | cb->fill(cb, string, mem->scanstat.rootstats[SCAN_BY_SYSTEM][i]); | ||
4783 | } | ||
4784 | return 0; | ||
4785 | } | ||
4786 | |||
4787 | static int mem_cgroup_reset_vmscan_stat(struct cgroup *cgrp, | ||
4788 | unsigned int event) | ||
4789 | { | ||
4790 | struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); | ||
4791 | |||
4792 | spin_lock(&mem->scanstat.lock); | ||
4793 | memset(&mem->scanstat.stats, 0, sizeof(mem->scanstat.stats)); | ||
4794 | memset(&mem->scanstat.rootstats, 0, sizeof(mem->scanstat.rootstats)); | ||
4795 | spin_unlock(&mem->scanstat.lock); | ||
4796 | return 0; | ||
4797 | } | ||
4798 | |||
4799 | |||
4646 | static struct cftype mem_cgroup_files[] = { | 4800 | static struct cftype mem_cgroup_files[] = { |
4647 | { | 4801 | { |
4648 | .name = "usage_in_bytes", | 4802 | .name = "usage_in_bytes", |
@@ -4713,6 +4867,11 @@ static struct cftype mem_cgroup_files[] = { | |||
4713 | .mode = S_IRUGO, | 4867 | .mode = S_IRUGO, |
4714 | }, | 4868 | }, |
4715 | #endif | 4869 | #endif |
4870 | { | ||
4871 | .name = "vmscan_stat", | ||
4872 | .read_map = mem_cgroup_vmscan_stat_read, | ||
4873 | .trigger = mem_cgroup_reset_vmscan_stat, | ||
4874 | }, | ||
4716 | }; | 4875 | }; |
4717 | 4876 | ||
4718 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 4877 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP |
@@ -4976,6 +5135,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
4976 | atomic_set(&mem->refcnt, 1); | 5135 | atomic_set(&mem->refcnt, 1); |
4977 | mem->move_charge_at_immigrate = 0; | 5136 | mem->move_charge_at_immigrate = 0; |
4978 | mutex_init(&mem->thresholds_lock); | 5137 | mutex_init(&mem->thresholds_lock); |
5138 | spin_lock_init(&mem->scanstat.lock); | ||
4979 | return &mem->css; | 5139 | return &mem->css; |
4980 | free_out: | 5140 | free_out: |
4981 | __mem_cgroup_free(mem); | 5141 | __mem_cgroup_free(mem); |
diff --git a/mm/vmscan.c b/mm/vmscan.c index f87702a376d0..7ef69124fa3e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -105,6 +105,7 @@ struct scan_control { | |||
105 | 105 | ||
106 | /* Which cgroup do we reclaim from */ | 106 | /* Which cgroup do we reclaim from */ |
107 | struct mem_cgroup *mem_cgroup; | 107 | struct mem_cgroup *mem_cgroup; |
108 | struct memcg_scanrecord *memcg_record; | ||
108 | 109 | ||
109 | /* | 110 | /* |
110 | * Nodemask of nodes allowed by the caller. If NULL, all nodes | 111 | * Nodemask of nodes allowed by the caller. If NULL, all nodes |
@@ -1348,6 +1349,8 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc, | |||
1348 | int file = is_file_lru(lru); | 1349 | int file = is_file_lru(lru); |
1349 | int numpages = hpage_nr_pages(page); | 1350 | int numpages = hpage_nr_pages(page); |
1350 | reclaim_stat->recent_rotated[file] += numpages; | 1351 | reclaim_stat->recent_rotated[file] += numpages; |
1352 | if (!scanning_global_lru(sc)) | ||
1353 | sc->memcg_record->nr_rotated[file] += numpages; | ||
1351 | } | 1354 | } |
1352 | if (!pagevec_add(&pvec, page)) { | 1355 | if (!pagevec_add(&pvec, page)) { |
1353 | spin_unlock_irq(&zone->lru_lock); | 1356 | spin_unlock_irq(&zone->lru_lock); |
@@ -1391,6 +1394,10 @@ static noinline_for_stack void update_isolated_counts(struct zone *zone, | |||
1391 | 1394 | ||
1392 | reclaim_stat->recent_scanned[0] += *nr_anon; | 1395 | reclaim_stat->recent_scanned[0] += *nr_anon; |
1393 | reclaim_stat->recent_scanned[1] += *nr_file; | 1396 | reclaim_stat->recent_scanned[1] += *nr_file; |
1397 | if (!scanning_global_lru(sc)) { | ||
1398 | sc->memcg_record->nr_scanned[0] += *nr_anon; | ||
1399 | sc->memcg_record->nr_scanned[1] += *nr_file; | ||
1400 | } | ||
1394 | } | 1401 | } |
1395 | 1402 | ||
1396 | /* | 1403 | /* |
@@ -1504,6 +1511,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, | |||
1504 | nr_reclaimed += shrink_page_list(&page_list, zone, sc); | 1511 | nr_reclaimed += shrink_page_list(&page_list, zone, sc); |
1505 | } | 1512 | } |
1506 | 1513 | ||
1514 | if (!scanning_global_lru(sc)) | ||
1515 | sc->memcg_record->nr_freed[file] += nr_reclaimed; | ||
1516 | |||
1507 | local_irq_disable(); | 1517 | local_irq_disable(); |
1508 | if (current_is_kswapd()) | 1518 | if (current_is_kswapd()) |
1509 | __count_vm_events(KSWAPD_STEAL, nr_reclaimed); | 1519 | __count_vm_events(KSWAPD_STEAL, nr_reclaimed); |
@@ -1603,6 +1613,8 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
1603 | } | 1613 | } |
1604 | 1614 | ||
1605 | reclaim_stat->recent_scanned[file] += nr_taken; | 1615 | reclaim_stat->recent_scanned[file] += nr_taken; |
1616 | if (!scanning_global_lru(sc)) | ||
1617 | sc->memcg_record->nr_scanned[file] += nr_taken; | ||
1606 | 1618 | ||
1607 | __count_zone_vm_events(PGREFILL, zone, pgscanned); | 1619 | __count_zone_vm_events(PGREFILL, zone, pgscanned); |
1608 | if (file) | 1620 | if (file) |
@@ -1654,6 +1666,8 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
1654 | * get_scan_ratio. | 1666 | * get_scan_ratio. |
1655 | */ | 1667 | */ |
1656 | reclaim_stat->recent_rotated[file] += nr_rotated; | 1668 | reclaim_stat->recent_rotated[file] += nr_rotated; |
1669 | if (!scanning_global_lru(sc)) | ||
1670 | sc->memcg_record->nr_rotated[file] += nr_rotated; | ||
1657 | 1671 | ||
1658 | move_active_pages_to_lru(zone, &l_active, | 1672 | move_active_pages_to_lru(zone, &l_active, |
1659 | LRU_ACTIVE + file * LRU_FILE); | 1673 | LRU_ACTIVE + file * LRU_FILE); |
@@ -2254,9 +2268,10 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | |||
2254 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | 2268 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR |
2255 | 2269 | ||
2256 | unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, | 2270 | unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, |
2257 | gfp_t gfp_mask, bool noswap, | 2271 | gfp_t gfp_mask, bool noswap, |
2258 | struct zone *zone, | 2272 | struct zone *zone, |
2259 | unsigned long *nr_scanned) | 2273 | struct memcg_scanrecord *rec, |
2274 | unsigned long *scanned) | ||
2260 | { | 2275 | { |
2261 | struct scan_control sc = { | 2276 | struct scan_control sc = { |
2262 | .nr_scanned = 0, | 2277 | .nr_scanned = 0, |
@@ -2266,7 +2281,9 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, | |||
2266 | .may_swap = !noswap, | 2281 | .may_swap = !noswap, |
2267 | .order = 0, | 2282 | .order = 0, |
2268 | .mem_cgroup = mem, | 2283 | .mem_cgroup = mem, |
2284 | .memcg_record = rec, | ||
2269 | }; | 2285 | }; |
2286 | unsigned long start, end; | ||
2270 | 2287 | ||
2271 | sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | | 2288 | sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | |
2272 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); | 2289 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); |
@@ -2275,6 +2292,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, | |||
2275 | sc.may_writepage, | 2292 | sc.may_writepage, |
2276 | sc.gfp_mask); | 2293 | sc.gfp_mask); |
2277 | 2294 | ||
2295 | start = sched_clock(); | ||
2278 | /* | 2296 | /* |
2279 | * NOTE: Although we can get the priority field, using it | 2297 | * NOTE: Although we can get the priority field, using it |
2280 | * here is not a good idea, since it limits the pages we can scan. | 2298 | * here is not a good idea, since it limits the pages we can scan. |
@@ -2283,19 +2301,25 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, | |||
2283 | * the priority and make it zero. | 2301 | * the priority and make it zero. |
2284 | */ | 2302 | */ |
2285 | shrink_zone(0, zone, &sc); | 2303 | shrink_zone(0, zone, &sc); |
2304 | end = sched_clock(); | ||
2305 | |||
2306 | if (rec) | ||
2307 | rec->elapsed += end - start; | ||
2308 | *scanned = sc.nr_scanned; | ||
2286 | 2309 | ||
2287 | trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); | 2310 | trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); |
2288 | 2311 | ||
2289 | *nr_scanned = sc.nr_scanned; | ||
2290 | return sc.nr_reclaimed; | 2312 | return sc.nr_reclaimed; |
2291 | } | 2313 | } |
2292 | 2314 | ||
2293 | unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | 2315 | unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, |
2294 | gfp_t gfp_mask, | 2316 | gfp_t gfp_mask, |
2295 | bool noswap) | 2317 | bool noswap, |
2318 | struct memcg_scanrecord *rec) | ||
2296 | { | 2319 | { |
2297 | struct zonelist *zonelist; | 2320 | struct zonelist *zonelist; |
2298 | unsigned long nr_reclaimed; | 2321 | unsigned long nr_reclaimed; |
2322 | unsigned long start, end; | ||
2299 | int nid; | 2323 | int nid; |
2300 | struct scan_control sc = { | 2324 | struct scan_control sc = { |
2301 | .may_writepage = !laptop_mode, | 2325 | .may_writepage = !laptop_mode, |
@@ -2304,6 +2328,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | |||
2304 | .nr_to_reclaim = SWAP_CLUSTER_MAX, | 2328 | .nr_to_reclaim = SWAP_CLUSTER_MAX, |
2305 | .order = 0, | 2329 | .order = 0, |
2306 | .mem_cgroup = mem_cont, | 2330 | .mem_cgroup = mem_cont, |
2331 | .memcg_record = rec, | ||
2307 | .nodemask = NULL, /* we don't care the placement */ | 2332 | .nodemask = NULL, /* we don't care the placement */ |
2308 | .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | | 2333 | .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | |
2309 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), | 2334 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), |
@@ -2312,6 +2337,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | |||
2312 | .gfp_mask = sc.gfp_mask, | 2337 | .gfp_mask = sc.gfp_mask, |
2313 | }; | 2338 | }; |
2314 | 2339 | ||
2340 | start = sched_clock(); | ||
2315 | /* | 2341 | /* |
2316 | * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't | 2342 | * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't |
2317 | * take care of from where we get pages. So the node where we start the | 2343 | * take care of from where we get pages. So the node where we start the |
@@ -2326,6 +2352,9 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | |||
2326 | sc.gfp_mask); | 2352 | sc.gfp_mask); |
2327 | 2353 | ||
2328 | nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); | 2354 | nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); |
2355 | end = sched_clock(); | ||
2356 | if (rec) | ||
2357 | rec->elapsed += end - start; | ||
2329 | 2358 | ||
2330 | trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); | 2359 | trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); |
2331 | 2360 | ||