diff options
author | KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> | 2008-12-01 16:13:48 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2008-12-01 22:55:24 -0500 |
commit | dc19f9db38295f811d9041bd89b113beccbd763a (patch) | |
tree | 6f1ce3a71df84981b4b5b70fd03f0d1fe20b196e | |
parent | b29acbdcf877009af3f1fc0750bcac314c51e055 (diff) |
memcg: memory hotplug fix for notifier callback
Fixes for memcg/memory hotplug.
While memory hotplug allocates/frees memmap, page_cgroup doesn't free
page_cgroup at OFFLINE when page_cgroup was allocated via bootmem.
(Because freeing bootmem requires special care.)
Then, if page_cgroup is allocated by bootmem and memmap is freed/allocated
by memory hotplug, page_cgroup->page == page is no longer true.
But the current MEM_ONLINE handler doesn't check this, and doesn't update
page_cgroup->page when it's not necessary to allocate page_cgroup. (This
was not found because memmap is not freed if SPARSEMEM_VMEMMAP is y.)
I also noticed that MEM_ONLINE can be called against only part of a section.
So, freeing page_cgroup at CANCEL_ONLINE will cause trouble (it would free a
page_cgroup that is still in use). Don't roll back at CANCEL.
One more thing: the memory hotplug notifier chain is currently stopped by slub
because it sets NOTIFY_STOP_MASK in its return value. So, page_cgroup's callback
is never called. (It has lower priority than slub now.)
I think this slub behavior is not intentional (a bug), so this patch fixes it.
Another way to be considered about page_cgroup allocation:
- free page_cgroup at OFFLINE even if it's from bootmem
and remove the special handler. But it requires more changes.
Addresses http://bugzilla.kernel.org/show_bug.cgi?id=12041
Signed-off-by: KAMEZAWA Hiruyoki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Tested-by: Badari Pulavarty <pbadari@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- | mm/page_cgroup.c | 43 | ||||
-rw-r--r-- | mm/slub.c | 6 |
2 files changed, 33 insertions, 16 deletions
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 436c00229e70..0b3cbf090a67 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c | |||
@@ -107,19 +107,29 @@ int __init_refok init_section_page_cgroup(unsigned long pfn) | |||
107 | 107 | ||
108 | section = __pfn_to_section(pfn); | 108 | section = __pfn_to_section(pfn); |
109 | 109 | ||
110 | if (section->page_cgroup) | 110 | if (!section->page_cgroup) { |
111 | return 0; | 111 | nid = page_to_nid(pfn_to_page(pfn)); |
112 | 112 | table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; | |
113 | nid = page_to_nid(pfn_to_page(pfn)); | 113 | if (slab_is_available()) { |
114 | 114 | base = kmalloc_node(table_size, GFP_KERNEL, nid); | |
115 | table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; | 115 | if (!base) |
116 | if (slab_is_available()) { | 116 | base = vmalloc_node(table_size, nid); |
117 | base = kmalloc_node(table_size, GFP_KERNEL, nid); | 117 | } else { |
118 | if (!base) | 118 | base = __alloc_bootmem_node_nopanic(NODE_DATA(nid), |
119 | base = vmalloc_node(table_size, nid); | 119 | table_size, |
120 | } else { | ||
121 | base = __alloc_bootmem_node_nopanic(NODE_DATA(nid), table_size, | ||
122 | PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); | 120 | PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); |
121 | } | ||
122 | } else { | ||
123 | /* | ||
124 | * We don't have to allocate page_cgroup again, but | ||
125 | * address of memmap may be changed. So, we have to initialize | ||
126 | * again. | ||
127 | */ | ||
128 | base = section->page_cgroup + pfn; | ||
129 | table_size = 0; | ||
130 | /* check address of memmap is changed or not. */ | ||
131 | if (base->page == pfn_to_page(pfn)) | ||
132 | return 0; | ||
123 | } | 133 | } |
124 | 134 | ||
125 | if (!base) { | 135 | if (!base) { |
@@ -208,18 +218,23 @@ static int __meminit page_cgroup_callback(struct notifier_block *self, | |||
208 | ret = online_page_cgroup(mn->start_pfn, | 218 | ret = online_page_cgroup(mn->start_pfn, |
209 | mn->nr_pages, mn->status_change_nid); | 219 | mn->nr_pages, mn->status_change_nid); |
210 | break; | 220 | break; |
211 | case MEM_CANCEL_ONLINE: | ||
212 | case MEM_OFFLINE: | 221 | case MEM_OFFLINE: |
213 | offline_page_cgroup(mn->start_pfn, | 222 | offline_page_cgroup(mn->start_pfn, |
214 | mn->nr_pages, mn->status_change_nid); | 223 | mn->nr_pages, mn->status_change_nid); |
215 | break; | 224 | break; |
225 | case MEM_CANCEL_ONLINE: | ||
216 | case MEM_GOING_OFFLINE: | 226 | case MEM_GOING_OFFLINE: |
217 | break; | 227 | break; |
218 | case MEM_ONLINE: | 228 | case MEM_ONLINE: |
219 | case MEM_CANCEL_OFFLINE: | 229 | case MEM_CANCEL_OFFLINE: |
220 | break; | 230 | break; |
221 | } | 231 | } |
222 | ret = notifier_from_errno(ret); | 232 | |
233 | if (ret) | ||
234 | ret = notifier_from_errno(ret); | ||
235 | else | ||
236 | ret = NOTIFY_OK; | ||
237 | |||
223 | return ret; | 238 | return ret; |
224 | } | 239 | } |
225 | 240 | ||
@@ -2931,8 +2931,10 @@ static int slab_memory_callback(struct notifier_block *self, | |||
2931 | case MEM_CANCEL_OFFLINE: | 2931 | case MEM_CANCEL_OFFLINE: |
2932 | break; | 2932 | break; |
2933 | } | 2933 | } |
2934 | 2934 | if (ret) | |
2935 | ret = notifier_from_errno(ret); | 2935 | ret = notifier_from_errno(ret); |
2936 | else | ||
2937 | ret = NOTIFY_OK; | ||
2936 | return ret; | 2938 | return ret; |
2937 | } | 2939 | } |
2938 | 2940 | ||