author		KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>	2008-02-07 03:14:31 -0500
committer	Linus Torvalds <torvalds@woody.linux-foundation.org>	2008-02-07 11:42:21 -0500
commit		6d12e2d8ddbe653d80ea4f71578481c1bc933025
tree		f175cdcadd8566b8f8a2e6a155b33e531eba7f11 /mm
parent		c0149530d0bb356c933a09f3c8103ea02f452d8a
per-zone and reclaim enhancements for memory controller: per-zone active inactive counter
This patch adds per-zone status to the memory cgroup. These values are read
frequently (as per-zone values) by page reclaim.

In the current design, each per-zone stat is a plain unsigned long rather than
an atomic value, because it is modified only under lru_lock; atomic operations
are therefore unnecessary.
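As a minimal sketch of what that means for updaters, assuming lru_lock is
held by the caller (zstat_inc() is a hypothetical helper for illustration
only; the real updates happen in the list helpers added in the diff below):

	/* Caller must hold mem->lru_lock; a plain increment is safe,
	 * so no atomic_ops are needed. */
	static void zstat_inc(struct mem_cgroup_per_zone *mz,
			      enum mem_cgroup_zstat_index idx)
	{
		MEM_CGROUP_ZSTAT(mz, idx) += 1;
	}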
This patch adds ACTIVE and INACTIVE per-zone status values.
To handle per-zone status, this patch adds

	struct mem_cgroup_per_zone {
		...
	}

and some helper functions. This will also be useful for adding other
per-zone objects to mem_cgroup.
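The lookup chain these nested types imply, per node and then per zone (a
sketch that mirrors the mem_cgroup_zoneinfo() helper added in the diff
below; zoneinfo_of() is an illustrative name):

	/* info.nodeinfo[] is indexed by node id, zoneinfo[] by zone id */
	static struct mem_cgroup_per_zone *
	zoneinfo_of(struct mem_cgroup *mem, int nid, int zid)
	{
		return &mem->info.nodeinfo[nid]->zoneinfo[zid];
	}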
This patch changes the memory controller's early_init to 0, so that
kmalloc() can be called during initialization.
Acked-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: Kirill Korotaev <dev@sw.ru>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Paul Menage <menage@google.com>
Cc: Pavel Emelianov <xemul@openvz.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
 -rw-r--r--  mm/memcontrol.c  161
 1 file changed, 154 insertions(+), 7 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 422f779a5b21..1637575d3339 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -78,6 +78,31 @@ static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
 }
 
 /*
+ * per-zone information in memory controller.
+ */
+
+enum mem_cgroup_zstat_index {
+	MEM_CGROUP_ZSTAT_ACTIVE,
+	MEM_CGROUP_ZSTAT_INACTIVE,
+
+	NR_MEM_CGROUP_ZSTAT,
+};
+
+struct mem_cgroup_per_zone {
+	unsigned long count[NR_MEM_CGROUP_ZSTAT];
+};
+/* Macro for accessing counter */
+#define MEM_CGROUP_ZSTAT(mz, idx)	((mz)->count[(idx)])
+
+struct mem_cgroup_per_node {
+	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
+};
+
+struct mem_cgroup_lru_info {
+	struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
+};
+
+/*
  * The memory controller data structure. The memory controller controls both
  * page cache and RSS per cgroup. We would eventually like to provide
  * statistics based on the statistics developed by Rik Van Riel for clock-pro,
@@ -101,6 +126,7 @@ struct mem_cgroup {
 	 */
 	struct list_head active_list;
 	struct list_head inactive_list;
+	struct mem_cgroup_lru_info info;
 	/*
 	 * spin_lock to protect the per cgroup LRU
 	 */
@@ -158,6 +184,7 @@ enum charge_type {
 	MEM_CGROUP_CHARGE_TYPE_MAPPED,
 };
 
+
 /*
  * Always modified under lru lock. Then, not necessary to preempt_disable()
  */
@@ -173,7 +200,38 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, int flags,
 			MEM_CGROUP_STAT_CACHE, val);
 	else
 		__mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_RSS, val);
+}
+
+static inline struct mem_cgroup_per_zone *
+mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
+{
+	BUG_ON(!mem->info.nodeinfo[nid]);
+	return &mem->info.nodeinfo[nid]->zoneinfo[zid];
+}
+
+static inline struct mem_cgroup_per_zone *
+page_cgroup_zoneinfo(struct page_cgroup *pc)
+{
+	struct mem_cgroup *mem = pc->mem_cgroup;
+	int nid = page_cgroup_nid(pc);
+	int zid = page_cgroup_zid(pc);
 
+	return mem_cgroup_zoneinfo(mem, nid, zid);
+}
+
+static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem,
+					enum mem_cgroup_zstat_index idx)
+{
+	int nid, zid;
+	struct mem_cgroup_per_zone *mz;
+	u64 total = 0;
+
+	for_each_online_node(nid)
+		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+			mz = mem_cgroup_zoneinfo(mem, nid, zid);
+			total += MEM_CGROUP_ZSTAT(mz, idx);
+		}
+	return total;
 }
 
 static struct mem_cgroup init_mem_cgroup;
@@ -286,12 +344,51 @@ static struct page_cgroup *clear_page_cgroup(struct page *page,
 	return ret;
 }
 
+static void __mem_cgroup_remove_list(struct page_cgroup *pc)
+{
+	int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
+	struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);
+
+	if (from)
+		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1;
+	else
+		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1;
+
+	mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, false);
+	list_del_init(&pc->lru);
+}
+
+static void __mem_cgroup_add_list(struct page_cgroup *pc)
+{
+	int to = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
+	struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);
+
+	if (!to) {
+		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1;
+		list_add(&pc->lru, &pc->mem_cgroup->inactive_list);
+	} else {
+		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1;
+		list_add(&pc->lru, &pc->mem_cgroup->active_list);
+	}
+	mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, true);
+}
+
 static void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
 {
+	int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
+	struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);
+
+	if (from)
+		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1;
+	else
+		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1;
+
 	if (active) {
+		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1;
 		pc->flags |= PAGE_CGROUP_FLAG_ACTIVE;
 		list_move(&pc->lru, &pc->mem_cgroup->active_list);
 	} else {
+		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1;
 		pc->flags &= ~PAGE_CGROUP_FLAG_ACTIVE;
 		list_move(&pc->lru, &pc->mem_cgroup->inactive_list);
 	}
@@ -501,8 +598,7 @@ retry:
 
 	spin_lock_irqsave(&mem->lru_lock, flags);
 	/* Update statistics vector */
-	mem_cgroup_charge_statistics(mem, pc->flags, true);
-	list_add(&pc->lru, &mem->active_list);
+	__mem_cgroup_add_list(pc);
 	spin_unlock_irqrestore(&mem->lru_lock, flags);
 
 done:
@@ -571,13 +667,13 @@ void mem_cgroup_uncharge(struct page_cgroup *pc)
 			css_put(&mem->css);
 			res_counter_uncharge(&mem->res, PAGE_SIZE);
 			spin_lock_irqsave(&mem->lru_lock, flags);
-			list_del_init(&pc->lru);
-			mem_cgroup_charge_statistics(mem, pc->flags, false);
+			__mem_cgroup_remove_list(pc);
 			spin_unlock_irqrestore(&mem->lru_lock, flags);
 			kfree(pc);
 		}
 	}
 }
+
 /*
  * Returns non-zero if a page (under migration) has valid page_cgroup member.
  * Refcnt of page_cgroup is incremented.
@@ -609,16 +705,26 @@ void mem_cgroup_end_migration(struct page *page)
 void mem_cgroup_page_migration(struct page *page, struct page *newpage)
 {
 	struct page_cgroup *pc;
+	struct mem_cgroup *mem;
+	unsigned long flags;
 retry:
 	pc = page_get_page_cgroup(page);
 	if (!pc)
 		return;
+	mem = pc->mem_cgroup;
 	if (clear_page_cgroup(page, pc) != pc)
 		goto retry;
+
+	spin_lock_irqsave(&mem->lru_lock, flags);
+
+	__mem_cgroup_remove_list(pc);
 	pc->page = newpage;
 	lock_page_cgroup(newpage);
 	page_assign_page_cgroup(newpage, pc);
 	unlock_page_cgroup(newpage);
+	__mem_cgroup_add_list(pc);
+
+	spin_unlock_irqrestore(&mem->lru_lock, flags);
 	return;
 }
 
@@ -648,8 +754,7 @@ retry:
 		if (clear_page_cgroup(page, pc) == pc) {
 			css_put(&mem->css);
 			res_counter_uncharge(&mem->res, PAGE_SIZE);
-			list_del_init(&pc->lru);
-			mem_cgroup_charge_statistics(mem, pc->flags, false);
+			__mem_cgroup_remove_list(pc);
 			kfree(pc);
 		} else	/* being uncharged ? ...do relax */
 			break;
@@ -828,6 +933,17 @@ static int mem_control_stat_show(struct seq_file *m, void *arg)
 		seq_printf(m, "%s %lld\n", mem_cgroup_stat_desc[i].msg,
 				(long long)val);
 	}
+	/* showing # of active pages */
+	{
+		unsigned long active, inactive;
+
+		inactive = mem_cgroup_get_all_zonestat(mem_cont,
+						MEM_CGROUP_ZSTAT_INACTIVE);
+		active = mem_cgroup_get_all_zonestat(mem_cont,
+						MEM_CGROUP_ZSTAT_ACTIVE);
+		seq_printf(m, "active %ld\n", (active) * PAGE_SIZE);
+		seq_printf(m, "inactive %ld\n", (inactive) * PAGE_SIZE);
+	}
 	return 0;
 }
 
@@ -881,12 +997,25 @@ static struct cftype mem_cgroup_files[] = {
 	},
 };
 
+static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
+{
+	struct mem_cgroup_per_node *pn;
+
+	pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, node);
+	if (!pn)
+		return 1;
+	mem->info.nodeinfo[node] = pn;
+	memset(pn, 0, sizeof(*pn));
+	return 0;
+}
+
 static struct mem_cgroup init_mem_cgroup;
 
 static struct cgroup_subsys_state *
 mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 {
 	struct mem_cgroup *mem;
+	int node;
 
 	if (unlikely((cont->parent) == NULL)) {
 		mem = &init_mem_cgroup;
@@ -902,7 +1031,19 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 	INIT_LIST_HEAD(&mem->inactive_list);
 	spin_lock_init(&mem->lru_lock);
 	mem->control_type = MEM_CGROUP_TYPE_ALL;
+	memset(&mem->info, 0, sizeof(mem->info));
+
+	for_each_node_state(node, N_POSSIBLE)
+		if (alloc_mem_cgroup_per_zone_info(mem, node))
+			goto free_out;
+
 	return &mem->css;
+free_out:
+	for_each_node_state(node, N_POSSIBLE)
+		kfree(mem->info.nodeinfo[node]);
+	if (cont->parent != NULL)
+		kfree(mem);
+	return NULL;
 }
 
 static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
@@ -915,6 +1056,12 @@ static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
 static void mem_cgroup_destroy(struct cgroup_subsys *ss,
 				struct cgroup *cont)
 {
+	int node;
+	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
+
+	for_each_node_state(node, N_POSSIBLE)
+		kfree(mem->info.nodeinfo[node]);
+
 	kfree(mem_cgroup_from_cont(cont));
 }
 
@@ -967,5 +1114,5 @@ struct cgroup_subsys mem_cgroup_subsys = {
 	.destroy = mem_cgroup_destroy,
 	.populate = mem_cgroup_populate,
 	.attach = mem_cgroup_move_task,
-	.early_init = 1,
+	.early_init = 0,
 };
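With this change, reading the controller's stat file should additionally
report active/inactive totals in bytes. A hypothetical session (the mount
point and values are illustrative only; the field names follow the
seq_printf() calls in mem_control_stat_show() above):

	# cat /cgroups/0/memory.stat
	cache 94208
	rss 102400
	active 81920
	inactive 114688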