aboutsummaryrefslogtreecommitdiffstats
path: root/mm/memcontrol.c
diff options
context:
space:
mode:
authorHugh Dickins <hughd@google.com>2012-11-16 17:14:54 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2012-11-16 17:33:04 -0500
commitbea8c150a7efbc0f204e709b7274fe273f55e0d3 (patch)
treefab695d7d18c5bbc9a42bccd793456826e392754 /mm/memcontrol.c
parent18f694271b86ee279e88208550cc49fee206b544 (diff)
memcg: fix hotplugged memory zone oops
When MEMCG is configured on (even when it's disabled by boot option), when adding or removing a page to/from its lru list, the zone pointer used for stats updates is nowadays taken from the struct lruvec. (On many configurations, calculating zone from page is slower.) But we have no code to update all the lruvecs (per zone, per memcg) when a memory node is hotadded. Here's an extract from the oops which results when running numactl to bind a program to a newly onlined node: BUG: unable to handle kernel NULL pointer dereference at 0000000000000f60 IP: __mod_zone_page_state+0x9/0x60 Pid: 1219, comm: numactl Not tainted 3.6.0-rc5+ #180 Bochs Bochs Process numactl (pid: 1219, threadinfo ffff880039abc000, task ffff8800383c4ce0) Call Trace: __pagevec_lru_add_fn+0xdf/0x140 pagevec_lru_move_fn+0xb1/0x100 __pagevec_lru_add+0x1c/0x30 lru_add_drain_cpu+0xa3/0x130 lru_add_drain+0x2f/0x40 ... The natural solution might be to use a memcg callback whenever memory is hotadded; but that solution has not been scoped out, and it happens that we do have an easy location at which to update lruvec->zone. The lruvec pointer is discovered either by mem_cgroup_zone_lruvec() or by mem_cgroup_page_lruvec(), and both of those do know the right zone. So check and set lruvec->zone in those; and remove the inadequate attempt to set lruvec->zone from lruvec_init(), which is called before NODE_DATA(node) has been allocated in such cases. Ah, there was one exceptionr. For no particularly good reason, mem_cgroup_force_empty_list() has its own code for deciding lruvec. Change it to use the standard mem_cgroup_zone_lruvec() and mem_cgroup_get_lru_size() too. In fact it was already safe against such an oops (the lru lists in danger could only be empty), but we're better proofed against future changes this way. I've marked this for stable (3.6) since we introduced the problem in 3.5 (now closed to stable); but I have no idea if this is the only fix needed to get memory hotadd working with memcg in 3.6, and received no answer when I enquired twice before. Reported-by: Tang Chen <tangchen@cn.fujitsu.com> Signed-off-by: Hugh Dickins <hughd@google.com> Acked-by: Johannes Weiner <hannes@cmpxchg.org> Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Konstantin Khlebnikov <khlebnikov@openvz.org> Cc: Wen Congyang <wency@cn.fujitsu.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--mm/memcontrol.c46
1 files changed, 35 insertions, 11 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 93a7e36ded89..dd39ba000b31 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1055,12 +1055,24 @@ struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
1055 struct mem_cgroup *memcg) 1055 struct mem_cgroup *memcg)
1056{ 1056{
1057 struct mem_cgroup_per_zone *mz; 1057 struct mem_cgroup_per_zone *mz;
1058 struct lruvec *lruvec;
1058 1059
1059 if (mem_cgroup_disabled()) 1060 if (mem_cgroup_disabled()) {
1060 return &zone->lruvec; 1061 lruvec = &zone->lruvec;
1062 goto out;
1063 }
1061 1064
1062 mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone)); 1065 mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone));
1063 return &mz->lruvec; 1066 lruvec = &mz->lruvec;
1067out:
1068 /*
1069 * Since a node can be onlined after the mem_cgroup was created,
1070 * we have to be prepared to initialize lruvec->zone here;
1071 * and if offlined then reonlined, we need to reinitialize it.
1072 */
1073 if (unlikely(lruvec->zone != zone))
1074 lruvec->zone = zone;
1075 return lruvec;
1064} 1076}
1065 1077
1066/* 1078/*
@@ -1087,9 +1099,12 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
1087 struct mem_cgroup_per_zone *mz; 1099 struct mem_cgroup_per_zone *mz;
1088 struct mem_cgroup *memcg; 1100 struct mem_cgroup *memcg;
1089 struct page_cgroup *pc; 1101 struct page_cgroup *pc;
1102 struct lruvec *lruvec;
1090 1103
1091 if (mem_cgroup_disabled()) 1104 if (mem_cgroup_disabled()) {
1092 return &zone->lruvec; 1105 lruvec = &zone->lruvec;
1106 goto out;
1107 }
1093 1108
1094 pc = lookup_page_cgroup(page); 1109 pc = lookup_page_cgroup(page);
1095 memcg = pc->mem_cgroup; 1110 memcg = pc->mem_cgroup;
@@ -1107,7 +1122,16 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
1107 pc->mem_cgroup = memcg = root_mem_cgroup; 1122 pc->mem_cgroup = memcg = root_mem_cgroup;
1108 1123
1109 mz = page_cgroup_zoneinfo(memcg, page); 1124 mz = page_cgroup_zoneinfo(memcg, page);
1110 return &mz->lruvec; 1125 lruvec = &mz->lruvec;
1126out:
1127 /*
1128 * Since a node can be onlined after the mem_cgroup was created,
1129 * we have to be prepared to initialize lruvec->zone here;
1130 * and if offlined then reonlined, we need to reinitialize it.
1131 */
1132 if (unlikely(lruvec->zone != zone))
1133 lruvec->zone = zone;
1134 return lruvec;
1111} 1135}
1112 1136
1113/** 1137/**
@@ -3697,17 +3721,17 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3697static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg, 3721static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3698 int node, int zid, enum lru_list lru) 3722 int node, int zid, enum lru_list lru)
3699{ 3723{
3700 struct mem_cgroup_per_zone *mz; 3724 struct lruvec *lruvec;
3701 unsigned long flags, loop; 3725 unsigned long flags, loop;
3702 struct list_head *list; 3726 struct list_head *list;
3703 struct page *busy; 3727 struct page *busy;
3704 struct zone *zone; 3728 struct zone *zone;
3705 3729
3706 zone = &NODE_DATA(node)->node_zones[zid]; 3730 zone = &NODE_DATA(node)->node_zones[zid];
3707 mz = mem_cgroup_zoneinfo(memcg, node, zid); 3731 lruvec = mem_cgroup_zone_lruvec(zone, memcg);
3708 list = &mz->lruvec.lists[lru]; 3732 list = &lruvec->lists[lru];
3709 3733
3710 loop = mz->lru_size[lru]; 3734 loop = mem_cgroup_get_lru_size(lruvec, lru);
3711 /* give some margin against EBUSY etc...*/ 3735 /* give some margin against EBUSY etc...*/
3712 loop += 256; 3736 loop += 256;
3713 busy = NULL; 3737 busy = NULL;
@@ -4745,7 +4769,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
4745 4769
4746 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4770 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4747 mz = &pn->zoneinfo[zone]; 4771 mz = &pn->zoneinfo[zone];
4748 lruvec_init(&mz->lruvec, &NODE_DATA(node)->node_zones[zone]); 4772 lruvec_init(&mz->lruvec);
4749 mz->usage_in_excess = 0; 4773 mz->usage_in_excess = 0;
4750 mz->on_tree = false; 4774 mz->on_tree = false;
4751 mz->memcg = memcg; 4775 mz->memcg = memcg;