author		KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>	2011-06-15 18:08:42 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2011-06-15 23:04:01 -0400
commit		37573e8c718277103f61f03741bdc5606d31b07e (patch)
tree		7142211508420a8ea90b8ae01bfa5ee93ffa1a44
parent		8957712710e045044e3c44375c6a87d7ffa17d51 (diff)
memcg: fix init_page_cgroup nid with sparsemem
Commit 21a3c9646873 ("memcg: allocate memory cgroup structures in local
nodes") made page_cgroup allocation NUMA aware, but that caused a
problem: https://bugzilla.kernel.org/show_bug.cgi?id=36192
The problem was getting a NID from invalid struct pages, which were
never initialized because they were out-of-node, i.e. outside
[node_start_pfn, node_end_pfn).
With sparsemem, page_cgroup_init() scans pfns from 0 to max_pfn, so it
may visit a pfn which is not on any node and access a memmap which was
never initialized.
This makes page_cgroup_init() node aware for SPARSEMEM and removes the
code that got the nid from page->flags, so a valid NID is always used.
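
For illustration, the shape of the new per-node scan can be modeled in
plain C.  The sketch below is a userspace toy, not kernel code:
SECTION_SIZE, node_ranges and nid_of_pfn() are made-up stand-ins for
PAGES_PER_SECTION, the architecture's node map and pfn_to_nid().  It
shows the two points the fix relies on: stepping with ALIGN() lands on
section boundaries even when node_start_pfn is unaligned, and
re-checking the owning node skips sections that belong to another node
when node pfn ranges interleave (N0 | N1 | N0 | ...).

#include <stdio.h>

#define SECTION_SIZE 0x100UL	/* stand-in for PAGES_PER_SECTION */
#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

/* Hypothetical interleaved layout: N0 | N1 | N0, with node 0 starting
 * mid-section at pfn 0x080. */
struct range { unsigned long start, end; int nid; };
static const struct range node_ranges[] = {
	{ 0x080, 0x180, 0 },
	{ 0x180, 0x300, 1 },
	{ 0x300, 0x480, 0 },
};

/* Stand-in for pfn_to_nid(): which node owns this pfn? */
static int nid_of_pfn(unsigned long pfn)
{
	unsigned long i;

	for (i = 0; i < sizeof(node_ranges) / sizeof(*node_ranges); i++)
		if (pfn >= node_ranges[i].start && pfn < node_ranges[i].end)
			return node_ranges[i].nid;
	return -1;	/* hole: no node owns this pfn */
}

int main(void)
{
	int nid = 0;	/* model one pass of for_each_node_state() */
	unsigned long start_pfn = 0x080, end_pfn = 0x480;
	unsigned long pfn;

	for (pfn = start_pfn; pfn < end_pfn;
	     pfn = ALIGN(pfn + 1, SECTION_SIZE)) {
		/* A section-start pfn may belong to another node; skip
		 * it here, that node's own pass will handle it. */
		if (nid_of_pfn(pfn) != nid)
			continue;
		printf("init section containing pfn %#lx for node %d\n",
		       pfn, nid);
	}
	return 0;
}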
[akpm@linux-foundation.org: try to fix up comments]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--	mm/page_cgroup.c	71
1 file changed, 53 insertions(+), 18 deletions(-)
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 74ccff61d1be..53bffc6c293e 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -162,13 +162,13 @@ static void free_page_cgroup(void *addr)
 }
 #endif
 
-static int __meminit init_section_page_cgroup(unsigned long pfn)
+static int __meminit init_section_page_cgroup(unsigned long pfn, int nid)
 {
 	struct page_cgroup *base, *pc;
 	struct mem_section *section;
 	unsigned long table_size;
 	unsigned long nr;
-	int nid, index;
+	int index;
 
 	nr = pfn_to_section_nr(pfn);
 	section = __nr_to_section(nr);
@@ -176,7 +176,6 @@ static int __meminit init_section_page_cgroup(unsigned long pfn)
 	if (section->page_cgroup)
 		return 0;
 
-	nid = page_to_nid(pfn_to_page(pfn));
 	table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
 	base = alloc_page_cgroup(table_size, nid);
 
@@ -196,7 +195,11 @@ static int __meminit init_section_page_cgroup(unsigned long pfn)
 		pc = base + index;
 		init_page_cgroup(pc, nr);
 	}
-
+	/*
+	 * The passed "pfn" may not be aligned to SECTION. For the calculation
+	 * we need to apply a mask.
+	 */
+	pfn &= PAGE_SECTION_MASK;
 	section->page_cgroup = base - pfn;
 	total_usage += table_size;
 	return 0;
@@ -225,10 +228,20 @@ int __meminit online_page_cgroup(unsigned long start_pfn,
 	start = start_pfn & ~(PAGES_PER_SECTION - 1);
 	end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);
 
+	if (nid == -1) {
+		/*
+		 * In this case, "nid" already exists and contains valid memory.
+		 * "start_pfn" passed to us is a pfn which is an arg for
+		 * online__pages(), and start_pfn should exist.
+		 */
+		nid = pfn_to_nid(start_pfn);
+		VM_BUG_ON(!node_state(nid, N_ONLINE));
+	}
+
 	for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
 		if (!pfn_present(pfn))
 			continue;
-		fail = init_section_page_cgroup(pfn);
+		fail = init_section_page_cgroup(pfn, nid);
 	}
 	if (!fail)
 		return 0;
@@ -284,25 +297,47 @@ static int __meminit page_cgroup_callback(struct notifier_block *self,
 void __init page_cgroup_init(void)
 {
 	unsigned long pfn;
-	int fail = 0;
+	int nid;
 
 	if (mem_cgroup_disabled())
 		return;
 
-	for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) {
-		if (!pfn_present(pfn))
-			continue;
-		fail = init_section_page_cgroup(pfn);
-	}
-	if (fail) {
-		printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
-		panic("Out of memory");
-	} else {
-		hotplug_memory_notifier(page_cgroup_callback, 0);
+	for_each_node_state(nid, N_HIGH_MEMORY) {
+		unsigned long start_pfn, end_pfn;
+
+		start_pfn = node_start_pfn(nid);
+		end_pfn = node_end_pfn(nid);
+		/*
+		 * start_pfn and end_pfn may not be aligned to SECTION and the
+		 * page->flags of out of node pages are not initialized.  So we
+		 * scan [start_pfn, the biggest section's pfn < end_pfn) here.
+		 */
+		for (pfn = start_pfn;
+		     pfn < end_pfn;
+		     pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) {
+
+			if (!pfn_valid(pfn))
+				continue;
+			/*
+			 * Nodes's pfns can be overlapping.
+			 * We know some arch can have a nodes layout such as
+			 * -------------pfn-------------->
+			 *  N0 | N1 | N2 | N0 | N1 | N2|....
+			 */
+			if (pfn_to_nid(pfn) != nid)
+				continue;
+			if (init_section_page_cgroup(pfn, nid))
+				goto oom;
+		}
 	}
+	hotplug_memory_notifier(page_cgroup_callback, 0);
 	printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
-	printk(KERN_INFO "please try 'cgroup_disable=memory' option if you don't"
-	" want memory cgroups\n");
+	printk(KERN_INFO "please try 'cgroup_disable=memory' option if you "
+	"don't want memory cgroups\n");
+	return;
+oom:
+	printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
+	panic("Out of memory");
 }
 
 void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
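
For illustration of why the patch adds "pfn &= PAGE_SECTION_MASK":
section->page_cgroup stores "base - pfn_of_section_start" so that a
later lookup is simply "section->page_cgroup + pfn".  Before this patch
the boot-time caller always passed a section-aligned pfn; the node-aware
scan may now pass any pfn inside the section, so the bias must be
computed from the section-aligned value or lookups would index outside
the table.  Below is a minimal userspace sketch with an illustrative
table size and pfn; only the PAGES_PER_SECTION / PAGE_SECTION_MASK
relationship mirrors the kernel, and the biased pointer is formally out
of bounds in ISO C, the same idiom the kernel itself relies on.

#include <assert.h>
#include <stdio.h>

#define PAGES_PER_SECTION 0x100UL
#define PAGE_SECTION_MASK (~(PAGES_PER_SECTION - 1))

struct page_cgroup { unsigned long flags; };

/* One entry per page in a single section. */
static struct page_cgroup table[PAGES_PER_SECTION];

int main(void)
{
	/* Some pfn inside the section [0x1200, 0x1300). */
	unsigned long pfn = 0x1234;

	/*
	 * Bias the stored base by the section-aligned pfn so a lookup
	 * is just "base + pfn".  Without the mask, an unaligned pfn
	 * would shift the bias and later lookups would miss the table.
	 * The biased pointer is only re-offset, never dereferenced.
	 */
	unsigned long section_start = pfn & PAGE_SECTION_MASK; /* 0x1200 */
	struct page_cgroup *base = table - section_start;

	struct page_cgroup *pc = base + pfn;	/* lookup-style access */
	assert(pc == &table[pfn - section_start]);	/* slot 0x34 */
	printf("pfn %#lx -> slot %ld of the section table\n",
	       pfn, (long)(pc - table));
	return 0;
}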