author:    KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>    2008-02-07 03:14:38 -0500
committer: Linus Torvalds <torvalds@woody.linux-foundation.org>  2008-02-07 11:42:22 -0500
commit:    1ecaab2bd221251a3fd148abb08e8b877f1e93c8
tree:      e5ce53b4d045832382ec38555b2a03749cc9d128
parent:    1cfb419b394ba82745c54ff05436d598ecc2dbd5
per-zone and reclaim enhancements for memory controller: per zone lru for cgroup
This patch implements per-zone LRU lists for the memory cgroup, using the
mem_cgroup_per_zone struct to hold the per-zone lists.

The LRU lists can be accessed by
mz = mem_cgroup_zoneinfo(mem_cgroup, node, zone);
&mz->active_list
&mz->inactive_list
or
mz = page_cgroup_zoneinfo(page_cgroup);
&mz->active_list
&mz->inactive_list
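
For context, a condensed sketch of the data-structure nesting behind those
accessors (simplified from the mem_cgroup_per_zone / mem_cgroup_per_node /
mem_cgroup_lru_info definitions in mm/memcontrol.c as of this series, not the
complete definitions; page_cgroup_nid()/page_cgroup_zid() are the node/zone
helpers introduced earlier in the series):

    struct mem_cgroup_per_zone {
            struct list_head active_list;    /* per-zone active LRU (this patch) */
            struct list_head inactive_list;  /* per-zone inactive LRU (this patch) */
            unsigned long count[NR_MEM_CGROUP_ZSTAT];
    };

    struct mem_cgroup_per_node {
            struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
    };

    struct mem_cgroup_lru_info {
            struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
    };

    /* look up the per-zone data for a given cgroup, node id and zone index */
    static struct mem_cgroup_per_zone *
    mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
    {
            return &mem->info.nodeinfo[nid]->zoneinfo[zid];
    }

    /* same lookup, starting from the page_cgroup that charges a page */
    static struct mem_cgroup_per_zone *
    page_cgroup_zoneinfo(struct page_cgroup *pc)
    {
            struct mem_cgroup *mem = pc->mem_cgroup;

            return mem_cgroup_zoneinfo(mem, page_cgroup_nid(pc),
                                       page_cgroup_zid(pc));
    }

With this layout, reclaim can pick the active/inactive list for exactly the
zone it is scanning instead of walking a single global per-cgroup list and
skipping pages from other zones.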
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: Kirill Korotaev <dev@sw.ru>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Paul Menage <menage@google.com>
Cc: Pavel Emelianov <xemul@openvz.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--  mm/memcontrol.c | 86
1 file changed, 58 insertions(+), 28 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 40cdba68de34..f728d67a3267 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -89,6 +89,8 @@ enum mem_cgroup_zstat_index {
 };
 
 struct mem_cgroup_per_zone {
+        struct list_head active_list;
+        struct list_head inactive_list;
         unsigned long count[NR_MEM_CGROUP_ZSTAT];
 };
 /* Macro for accessing counter */
@@ -122,10 +124,7 @@ struct mem_cgroup {
         /*
          * Per cgroup active and inactive list, similar to the
          * per zone LRU lists.
-         * TODO: Consider making these lists per zone
          */
-        struct list_head active_list;
-        struct list_head inactive_list;
         struct mem_cgroup_lru_info info;
         /*
          * spin_lock to protect the per cgroup LRU
@@ -366,10 +365,10 @@ static void __mem_cgroup_add_list(struct page_cgroup *pc)
 
         if (!to) {
                 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1;
-                list_add(&pc->lru, &pc->mem_cgroup->inactive_list);
+                list_add(&pc->lru, &mz->inactive_list);
         } else {
                 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1;
-                list_add(&pc->lru, &pc->mem_cgroup->active_list);
+                list_add(&pc->lru, &mz->active_list);
         }
         mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, true);
 }
@@ -387,11 +386,11 @@ static void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
         if (active) {
                 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1;
                 pc->flags |= PAGE_CGROUP_FLAG_ACTIVE;
-                list_move(&pc->lru, &pc->mem_cgroup->active_list);
+                list_move(&pc->lru, &mz->active_list);
         } else {
                 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1;
                 pc->flags &= ~PAGE_CGROUP_FLAG_ACTIVE;
-                list_move(&pc->lru, &pc->mem_cgroup->inactive_list);
+                list_move(&pc->lru, &mz->inactive_list);
         }
 }
 
@@ -517,11 +516,16 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
         LIST_HEAD(pc_list);
         struct list_head *src;
         struct page_cgroup *pc, *tmp;
+        int nid = z->zone_pgdat->node_id;
+        int zid = zone_idx(z);
+        struct mem_cgroup_per_zone *mz;
 
+        mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
         if (active)
-                src = &mem_cont->active_list;
+                src = &mz->active_list;
         else
-                src = &mem_cont->inactive_list;
+                src = &mz->inactive_list;
+
 
         spin_lock(&mem_cont->lru_lock);
         scan = 0;
@@ -543,13 +547,6 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
                         continue;
                 }
 
-                /*
-                 * Reclaim, per zone
-                 * TODO: make the active/inactive lists per zone
-                 */
-                if (page_zone(page) != z)
-                        continue;
-
                 scan++;
                 list_move(&pc->lru, &pc_list);
 
@@ -826,6 +823,8 @@ mem_cgroup_force_empty_list(struct mem_cgroup *mem, struct list_head *list)
         int count;
         unsigned long flags;
 
+        if (list_empty(list))
+                return;
 retry:
         count = FORCE_UNCHARGE_BATCH;
         spin_lock_irqsave(&mem->lru_lock, flags);
@@ -859,20 +858,27 @@ retry:
 int mem_cgroup_force_empty(struct mem_cgroup *mem)
 {
         int ret = -EBUSY;
+        int node, zid;
         css_get(&mem->css);
         /*
          * page reclaim code (kswapd etc..) will move pages between
          * active_list <-> inactive_list while we don't take a lock.
          * So, we have to do loop here until all lists are empty.
          */
-        while (!(list_empty(&mem->active_list) &&
-                 list_empty(&mem->inactive_list))) {
+        while (mem->res.usage > 0) {
                 if (atomic_read(&mem->css.cgroup->count) > 0)
                         goto out;
-                /* drop all page_cgroup in active_list */
-                mem_cgroup_force_empty_list(mem, &mem->active_list);
-                /* drop all page_cgroup in inactive_list */
-                mem_cgroup_force_empty_list(mem, &mem->inactive_list);
+                for_each_node_state(node, N_POSSIBLE)
+                        for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+                                struct mem_cgroup_per_zone *mz;
+                                mz = mem_cgroup_zoneinfo(mem, node, zid);
+                                /* drop all page_cgroup in active_list */
+                                mem_cgroup_force_empty_list(mem,
+                                                &mz->active_list);
+                                /* drop all page_cgroup in inactive_list */
+                                mem_cgroup_force_empty_list(mem,
+                                                &mz->inactive_list);
+                        }
         }
         ret = 0;
 out:
@@ -1084,15 +1090,40 @@ static struct cftype mem_cgroup_files[] = {
 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
 {
         struct mem_cgroup_per_node *pn;
-
-        pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, node);
+        struct mem_cgroup_per_zone *mz;
+        int zone;
+        /*
+         * This routine is called against possible nodes.
+         * But it's BUG to call kmalloc() against offline node.
+         *
+         * TODO: this routine can waste much memory for nodes which will
+         *       never be onlined. It's better to use memory hotplug callback
+         *       function.
+         */
+        if (node_state(node, N_HIGH_MEMORY))
+                pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, node);
+        else
+                pn = kmalloc(sizeof(*pn), GFP_KERNEL);
         if (!pn)
                 return 1;
+
         mem->info.nodeinfo[node] = pn;
         memset(pn, 0, sizeof(*pn));
+
+        for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+                mz = &pn->zoneinfo[zone];
+                INIT_LIST_HEAD(&mz->active_list);
+                INIT_LIST_HEAD(&mz->inactive_list);
+        }
         return 0;
 }
 
+static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
+{
+        kfree(mem->info.nodeinfo[node]);
+}
+
+
 static struct mem_cgroup init_mem_cgroup;
 
 static struct cgroup_subsys_state *
@@ -1111,8 +1142,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
                 return NULL;
 
         res_counter_init(&mem->res);
-        INIT_LIST_HEAD(&mem->active_list);
-        INIT_LIST_HEAD(&mem->inactive_list);
+
         spin_lock_init(&mem->lru_lock);
         mem->control_type = MEM_CGROUP_TYPE_ALL;
         memset(&mem->info, 0, sizeof(mem->info));
@@ -1124,7 +1154,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
         return &mem->css;
 free_out:
         for_each_node_state(node, N_POSSIBLE)
-                kfree(mem->info.nodeinfo[node]);
+                free_mem_cgroup_per_zone_info(mem, node);
         if (cont->parent != NULL)
                 kfree(mem);
         return NULL;
@@ -1144,7 +1174,7 @@ static void mem_cgroup_destroy(struct cgroup_subsys *ss,
         struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
 
         for_each_node_state(node, N_POSSIBLE)
-                kfree(mem->info.nodeinfo[node]);
+                free_mem_cgroup_per_zone_info(mem, node);
 
         kfree(mem_cgroup_from_cont(cont));
 }