author	KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>	2008-02-07 03:14:38 -0500
committer	Linus Torvalds <torvalds@woody.linux-foundation.org>	2008-02-07 11:42:22 -0500
commit	1ecaab2bd221251a3fd148abb08e8b877f1e93c8 (patch)
tree	e5ce53b4d045832382ec38555b2a03749cc9d128
parent	1cfb419b394ba82745c54ff05436d598ecc2dbd5 (diff)
per-zone and reclaim enhancements for memory controller: per zone lru for cgroup
This patch implements a per-zone LRU for the memory cgroup, making use of the mem_cgroup_per_zone struct for the per-zone lists.

The LRU can be accessed by

	mz = mem_cgroup_zoneinfo(mem_cgroup, node, zone);
	&mz->active_list
	&mz->inactive_list

or

	mz = page_cgroup_zoneinfo(page_cgroup);
	&mz->active_list
	&mz->inactive_list

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: Kirill Korotaev <dev@sw.ru>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Paul Menage <menage@google.com>
Cc: Pavel Emelianov <xemul@openvz.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
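For orientation, the sketch below shows how the pieces referenced above fit together: a mem_cgroup holds a mem_cgroup_lru_info, which holds per-node info, which in turn holds the per-zone structures carrying the two LRU lists. This is a minimal, userspace-compilable reduction, not kernel code; the trimmed struct bodies, the stub list_head, and the MAX_NODES/MAX_NR_ZONES values are assumptions made purely for illustration.

/*
 * Illustrative sketch only (not part of the patch): a reduced view of the
 * data-structure layering introduced by this change.
 */
struct list_head { struct list_head *next, *prev; };	/* stand-in for the kernel's list_head */

#define MAX_NODES	4	/* assumed value, sketch only */
#define MAX_NR_ZONES	4	/* assumed value, sketch only */

struct mem_cgroup_per_zone {
	struct list_head active_list;	/* per-zone active LRU */
	struct list_head inactive_list;	/* per-zone inactive LRU */
};

struct mem_cgroup_per_node {
	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
};

struct mem_cgroup_lru_info {
	struct mem_cgroup_per_node *nodeinfo[MAX_NODES];
};

struct mem_cgroup {
	struct mem_cgroup_lru_info info;	/* per-node -> per-zone LRUs */
};

/* mirrors mem_cgroup_zoneinfo() as it is used in the hunks below */
static struct mem_cgroup_per_zone *
mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
{
	return &mem->info.nodeinfo[nid]->zoneinfo[zid];
}

With this layering, a (node, zone) pair resolves to its own pair of lists, which is what lets mem_cgroup_isolate_pages() and mem_cgroup_force_empty() below work zone by zone instead of filtering a single global pair of lists.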
 mm/memcontrol.c | 86
 1 file changed, 58 insertions(+), 28 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 40cdba68de34..f728d67a3267 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -89,6 +89,8 @@ enum mem_cgroup_zstat_index {
 };
 
 struct mem_cgroup_per_zone {
+	struct list_head	active_list;
+	struct list_head	inactive_list;
 	unsigned long		count[NR_MEM_CGROUP_ZSTAT];
 };
 /* Macro for accessing counter */
@@ -122,10 +124,7 @@ struct mem_cgroup {
 	/*
 	 * Per cgroup active and inactive list, similar to the
 	 * per zone LRU lists.
-	 * TODO: Consider making these lists per zone
 	 */
-	struct list_head active_list;
-	struct list_head inactive_list;
 	struct mem_cgroup_lru_info info;
 	/*
 	 * spin_lock to protect the per cgroup LRU
@@ -366,10 +365,10 @@ static void __mem_cgroup_add_list(struct page_cgroup *pc)
 
 	if (!to) {
 		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1;
-		list_add(&pc->lru, &pc->mem_cgroup->inactive_list);
+		list_add(&pc->lru, &mz->inactive_list);
 	} else {
 		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1;
-		list_add(&pc->lru, &pc->mem_cgroup->active_list);
+		list_add(&pc->lru, &mz->active_list);
 	}
 	mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, true);
 }
@@ -387,11 +386,11 @@ static void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
 	if (active) {
 		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1;
 		pc->flags |= PAGE_CGROUP_FLAG_ACTIVE;
-		list_move(&pc->lru, &pc->mem_cgroup->active_list);
+		list_move(&pc->lru, &mz->active_list);
 	} else {
 		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1;
 		pc->flags &= ~PAGE_CGROUP_FLAG_ACTIVE;
-		list_move(&pc->lru, &pc->mem_cgroup->inactive_list);
+		list_move(&pc->lru, &mz->inactive_list);
 	}
 }
 
@@ -517,11 +516,16 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
 	LIST_HEAD(pc_list);
 	struct list_head *src;
 	struct page_cgroup *pc, *tmp;
+	int nid = z->zone_pgdat->node_id;
+	int zid = zone_idx(z);
+	struct mem_cgroup_per_zone *mz;
 
+	mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
 	if (active)
-		src = &mem_cont->active_list;
+		src = &mz->active_list;
 	else
-		src = &mem_cont->inactive_list;
+		src = &mz->inactive_list;
+
 
 	spin_lock(&mem_cont->lru_lock);
 	scan = 0;
@@ -543,13 +547,6 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
 			continue;
 		}
 
-		/*
-		 * Reclaim, per zone
-		 * TODO: make the active/inactive lists per zone
-		 */
-		if (page_zone(page) != z)
-			continue;
-
 		scan++;
 		list_move(&pc->lru, &pc_list);
 
@@ -826,6 +823,8 @@ mem_cgroup_force_empty_list(struct mem_cgroup *mem, struct list_head *list)
 	int count;
 	unsigned long flags;
 
+	if (list_empty(list))
+		return;
 retry:
 	count = FORCE_UNCHARGE_BATCH;
 	spin_lock_irqsave(&mem->lru_lock, flags);
@@ -859,20 +858,27 @@ retry:
 int mem_cgroup_force_empty(struct mem_cgroup *mem)
 {
 	int ret = -EBUSY;
+	int node, zid;
 	css_get(&mem->css);
 	/*
 	 * page reclaim code (kswapd etc..) will move pages between
 	 * active_list <-> inactive_list while we don't take a lock.
 	 * So, we have to do loop here until all lists are empty.
 	 */
-	while (!(list_empty(&mem->active_list) &&
-		 list_empty(&mem->inactive_list))) {
+	while (mem->res.usage > 0) {
 		if (atomic_read(&mem->css.cgroup->count) > 0)
 			goto out;
-		/* drop all page_cgroup in active_list */
-		mem_cgroup_force_empty_list(mem, &mem->active_list);
-		/* drop all page_cgroup in inactive_list */
-		mem_cgroup_force_empty_list(mem, &mem->inactive_list);
+		for_each_node_state(node, N_POSSIBLE)
+			for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+				struct mem_cgroup_per_zone *mz;
+				mz = mem_cgroup_zoneinfo(mem, node, zid);
+				/* drop all page_cgroup in active_list */
+				mem_cgroup_force_empty_list(mem,
+						&mz->active_list);
+				/* drop all page_cgroup in inactive_list */
+				mem_cgroup_force_empty_list(mem,
+						&mz->inactive_list);
+			}
 	}
 	ret = 0;
 out:
@@ -1084,15 +1090,40 @@ static struct cftype mem_cgroup_files[] = {
 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
 {
 	struct mem_cgroup_per_node *pn;
-
-	pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, node);
+	struct mem_cgroup_per_zone *mz;
+	int zone;
+	/*
+	 * This routine is called against possible nodes.
+	 * But it's BUG to call kmalloc() against offline node.
+	 *
+	 * TODO: this routine can waste much memory for nodes which will
+	 *       never be onlined. It's better to use memory hotplug callback
+	 *       function.
+	 */
+	if (node_state(node, N_HIGH_MEMORY))
+		pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, node);
+	else
+		pn = kmalloc(sizeof(*pn), GFP_KERNEL);
 	if (!pn)
 		return 1;
+
 	mem->info.nodeinfo[node] = pn;
 	memset(pn, 0, sizeof(*pn));
+
+	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+		mz = &pn->zoneinfo[zone];
+		INIT_LIST_HEAD(&mz->active_list);
+		INIT_LIST_HEAD(&mz->inactive_list);
+	}
 	return 0;
 }
 
+static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
+{
+	kfree(mem->info.nodeinfo[node]);
+}
+
+
 static struct mem_cgroup init_mem_cgroup;
 
 static struct cgroup_subsys_state *
@@ -1111,8 +1142,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 		return NULL;
 
 	res_counter_init(&mem->res);
-	INIT_LIST_HEAD(&mem->active_list);
-	INIT_LIST_HEAD(&mem->inactive_list);
+
 	spin_lock_init(&mem->lru_lock);
 	mem->control_type = MEM_CGROUP_TYPE_ALL;
 	memset(&mem->info, 0, sizeof(mem->info));
@@ -1124,7 +1154,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 	return &mem->css;
free_out:
 	for_each_node_state(node, N_POSSIBLE)
-		kfree(mem->info.nodeinfo[node]);
+		free_mem_cgroup_per_zone_info(mem, node);
 	if (cont->parent != NULL)
 		kfree(mem);
 	return NULL;
@@ -1144,7 +1174,7 @@ static void mem_cgroup_destroy(struct cgroup_subsys *ss,
 	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
 
 	for_each_node_state(node, N_POSSIBLE)
-		kfree(mem->info.nodeinfo[node]);
+		free_mem_cgroup_per_zone_info(mem, node);
 
 	kfree(mem_cgroup_from_cont(cont));
 }