author	Joonsoo Kim <iamjoonsoo.kim@lge.com>	2014-12-12 19:55:46 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2014-12-13 15:42:48 -0500
commit	eefa864b701d78dc9753c70a3540a2e9ae192595 (patch)
tree	ac638eef9f41b5857a11191f3e6e455c64778df2
parent	2d48366b3ff745729815c15077508f8d7722ec5f (diff)
mm/page_ext: resurrect struct page extending code for debugging
When debugging, we'd like to attach some extra information to every page. For this purpose we sometimes modify struct page itself, but that has drawbacks. First, it requires a recompile, which makes people hesitate to use an otherwise powerful debug feature and slows development down. Second, rebuilding the kernel is sometimes impossible because of third-party module dependencies. Third, system behaviour can change considerably after the recompile, because enlarging struct page changes the size of a structure that is accessed by every part of the kernel; keeping it unchanged makes it easier to reproduce the erroneous situation.

This feature is intended to overcome the problems mentioned above. It allocates memory for extended per-page data in a separate place rather than in struct page itself, and that memory is accessed through the accessor functions provided by this code. During boot it checks whether the allocation of this huge chunk of memory is needed at all; if not, it avoids allocating any memory. With this advantage, the feature can be included in the kernel by default, avoiding the rebuild and its related problems.

Until now, memcg used this technique. But memcg has since decided to embed its variable in struct page itself, and its code for extending struct page has been removed. I'd like to use this code to develop debug features, so this patch resurrects it.

To make this work well, the patch introduces two callbacks for clients. One is the need callback, which is mandatory if the user wants to avoid useless memory allocation at boot time. The other, the init callback, is optional and is used to do proper initialization after the memory is allocated. A detailed explanation of the purpose of these functions is in the code comments; please refer to them. Everything else is the same as the previous extension code in memcg.

Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Dave Hansen <dave@sr71.net>
Cc: Michal Nazarewicz <mina86@mina86.com>
Cc: Jungsoo Son <jungsoo.son@lge.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
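[Editor's illustration, not part of the patch] A minimal sketch of how a client of this interface might look. Everything named "mydebug" (including the flag bit and the enable variable) is made up; the page_ext_operations callbacks and lookup_page_ext() accessor are the ones introduced by this patch, and the operations structure would have to be listed in the static page_ext_ops[] array in mm/page_ext.c at build time:

	#include <linux/mm.h>
	#include <linux/page_ext.h>

	#define MYDEBUG_FLAG	0		/* made-up bit in page_ext->flags */

	static bool mydebug_enabled;		/* e.g. set from a boot parameter */

	static bool need_mydebug(void)
	{
		/* Mandatory callback: only ask for page_ext memory when the
		 * feature is actually enabled for this boot. */
		return mydebug_enabled;
	}

	static void init_mydebug(void)
	{
		/* Optional callback: runs once page_ext storage exists, so
		 * per-page state can be brought to a sane initial value. */
	}

	struct page_ext_operations mydebug_ops = {
		.need = need_mydebug,
		.init = init_mydebug,
	};

	/* At runtime, per-page data is reached through the accessor. */
	static void mydebug_mark_page(struct page *page)
	{
		struct page_ext *page_ext = lookup_page_ext(page);

		if (!page_ext)
			return;
		__set_bit(MYDEBUG_FLAG, &page_ext->flags);
	}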
-rw-r--r--	include/linux/mmzone.h	12
-rw-r--r--	include/linux/page_ext.h	59
-rw-r--r--	init/main.c	7
-rw-r--r--	mm/Kconfig.debug	9
-rw-r--r--	mm/Makefile	1
-rw-r--r--	mm/page_alloc.c	2
-rw-r--r--	mm/page_ext.c	395
7 files changed, 485 insertions, 0 deletions
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 3879d7664dfc..2f0856d14b21 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -722,6 +722,9 @@ typedef struct pglist_data {
 	int nr_zones;
 #ifdef CONFIG_FLAT_NODE_MEM_MAP	/* means !SPARSEMEM */
 	struct page *node_mem_map;
+#ifdef CONFIG_PAGE_EXTENSION
+	struct page_ext *node_page_ext;
+#endif
 #endif
 #ifndef CONFIG_NO_BOOTMEM
 	struct bootmem_data *bdata;
@@ -1075,6 +1078,7 @@ static inline unsigned long early_pfn_to_nid(unsigned long pfn)
 #define SECTION_ALIGN_DOWN(pfn)	((pfn) & PAGE_SECTION_MASK)
 
 struct page;
+struct page_ext;
 struct mem_section {
 	/*
 	 * This is, logically, a pointer to an array of struct
@@ -1092,6 +1096,14 @@ struct mem_section {
 
 	/* See declaration of similar field in struct zone */
 	unsigned long *pageblock_flags;
+#ifdef CONFIG_PAGE_EXTENSION
+	/*
+	 * If !SPARSEMEM, pgdat doesn't have page_ext pointer. We use
+	 * section. (see page_ext.h about this.)
+	 */
+	struct page_ext *page_ext;
+	unsigned long pad;
+#endif
 	/*
 	 * WARNING: mem_section must be a power-of-2 in size for the
 	 * calculation and use of SECTION_ROOT_MASK to make sense.
diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h
new file mode 100644
index 000000000000..2ccc8b414e5c
--- /dev/null
+++ b/include/linux/page_ext.h
@@ -0,0 +1,59 @@
+#ifndef __LINUX_PAGE_EXT_H
+#define __LINUX_PAGE_EXT_H
+
+struct pglist_data;
+struct page_ext_operations {
+	bool (*need)(void);
+	void (*init)(void);
+};
+
+#ifdef CONFIG_PAGE_EXTENSION
+
+/*
+ * Page Extension can be considered as an extended mem_map.
+ * A page_ext page is associated with every page descriptor. The
+ * page_ext helps us add more information about the page.
+ * All page_ext are allocated at boot or memory hotplug event,
+ * then the page_ext for pfn always exists.
+ */
+struct page_ext {
+	unsigned long flags;
+};
+
+extern void pgdat_page_ext_init(struct pglist_data *pgdat);
+
+#ifdef CONFIG_SPARSEMEM
+static inline void page_ext_init_flatmem(void)
+{
+}
+extern void page_ext_init(void);
+#else
+extern void page_ext_init_flatmem(void);
+static inline void page_ext_init(void)
+{
+}
+#endif
+
+struct page_ext *lookup_page_ext(struct page *page);
+
+#else /* !CONFIG_PAGE_EXTENSION */
+struct page_ext;
+
+static inline void pgdat_page_ext_init(struct pglist_data *pgdat)
+{
+}
+
+static inline struct page_ext *lookup_page_ext(struct page *page)
+{
+	return NULL;
+}
+
+static inline void page_ext_init(void)
+{
+}
+
+static inline void page_ext_init_flatmem(void)
+{
+}
+#endif /* CONFIG_PAGE_EXTENSION */
+#endif /* __LINUX_PAGE_EXT_H */
diff --git a/init/main.c b/init/main.c
index ca380ec685de..ed7e7ad5fee0 100644
--- a/init/main.c
+++ b/init/main.c
@@ -51,6 +51,7 @@
 #include <linux/mempolicy.h>
 #include <linux/key.h>
 #include <linux/buffer_head.h>
+#include <linux/page_ext.h>
 #include <linux/debug_locks.h>
 #include <linux/debugobjects.h>
 #include <linux/lockdep.h>
@@ -484,6 +485,11 @@ void __init __weak thread_info_cache_init(void)
  */
 static void __init mm_init(void)
 {
+	/*
+	 * page_ext requires contiguous pages,
+	 * bigger than MAX_ORDER unless SPARSEMEM.
+	 */
+	page_ext_init_flatmem();
 	mem_init();
 	kmem_cache_init();
 	percpu_init_late();
@@ -621,6 +627,7 @@ asmlinkage __visible void __init start_kernel(void)
 		initrd_start = 0;
 	}
 #endif
+	page_ext_init();
 	debug_objects_mem_init();
 	kmemleak_init();
 	setup_per_cpu_pageset();
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 4b2443254de2..1ba81c7769f7 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -1,3 +1,12 @@
+config PAGE_EXTENSION
+	bool "Extend memmap on extra space for more information on page"
+	---help---
+	  Extend memmap on extra space for more information on page. This
+	  could be used for debugging features that need to insert extra
+	  field for every page. This extension enables us to save memory
+	  by not allocating this extra memory according to boottime
+	  configuration.
+
 config DEBUG_PAGEALLOC
 	bool "Debug page memory allocations"
 	depends on DEBUG_KERNEL
diff --git a/mm/Makefile b/mm/Makefile
index b3c6ce932c64..580cd3f392af 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -71,3 +71,4 @@ obj-$(CONFIG_ZSMALLOC) += zsmalloc.o
 obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o
 obj-$(CONFIG_CMA)	+= cma.o
 obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
+obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2e8b7f39605a..b64666cf5865 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -48,6 +48,7 @@
 #include <linux/backing-dev.h>
 #include <linux/fault-inject.h>
 #include <linux/page-isolation.h>
+#include <linux/page_ext.h>
 #include <linux/debugobjects.h>
 #include <linux/kmemleak.h>
 #include <linux/compaction.h>
@@ -4856,6 +4857,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 #endif
 	init_waitqueue_head(&pgdat->kswapd_wait);
 	init_waitqueue_head(&pgdat->pfmemalloc_wait);
+	pgdat_page_ext_init(pgdat);
 
 	for (j = 0; j < MAX_NR_ZONES; j++) {
 		struct zone *zone = pgdat->node_zones + j;
diff --git a/mm/page_ext.c b/mm/page_ext.c
new file mode 100644
index 000000000000..514a3bccd63f
--- /dev/null
+++ b/mm/page_ext.c
@@ -0,0 +1,395 @@
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/bootmem.h>
+#include <linux/page_ext.h>
+#include <linux/memory.h>
+#include <linux/vmalloc.h>
+#include <linux/kmemleak.h>
+
+/*
+ * struct page extension
+ *
+ * This is the feature to manage memory for extended data per page.
+ *
+ * Until now, we must modify struct page itself to store extra data per page.
+ * This requires rebuilding the kernel and it is really time consuming process.
+ * And, sometimes, rebuild is impossible due to third party module dependency.
+ * At last, enlarging struct page could cause un-wanted system behaviour change.
+ *
+ * This feature is intended to overcome above mentioned problems. This feature
+ * allocates memory for extended data per page in certain place rather than
+ * the struct page itself. This memory can be accessed by the accessor
+ * functions provided by this code. During the boot process, it checks whether
+ * allocation of huge chunk of memory is needed or not. If not, it avoids
+ * allocating memory at all. With this advantage, we can include this feature
+ * into the kernel in default and can avoid rebuild and solve related problems.
+ *
+ * To help these things to work well, there are two callbacks for clients. One
+ * is the need callback which is mandatory if user wants to avoid useless
+ * memory allocation at boot-time. The other is optional, init callback, which
+ * is used to do proper initialization after memory is allocated.
+ *
+ * The need callback is used to decide whether extended memory allocation is
+ * needed or not. Sometimes users want to deactivate some features in this
+ * boot and extra memory would be unneccessary. In this case, to avoid
+ * allocating huge chunk of memory, each clients represent their need of
+ * extra memory through the need callback. If one of the need callbacks
+ * returns true, it means that someone needs extra memory so that
+ * page extension core should allocates memory for page extension. If
+ * none of need callbacks return true, memory isn't needed at all in this boot
+ * and page extension core can skip to allocate memory. As result,
+ * none of memory is wasted.
+ *
+ * The init callback is used to do proper initialization after page extension
+ * is completely initialized. In sparse memory system, extra memory is
+ * allocated some time later than memmap is allocated. In other words, lifetime
+ * of memory for page extension isn't same with memmap for struct page.
+ * Therefore, clients can't store extra data until page extension is
+ * initialized, even if pages are allocated and used freely. This could
+ * cause inadequate state of extra data per page, so, to prevent it, client
+ * can utilize this callback to initialize the state of it correctly.
+ */
+
+static struct page_ext_operations *page_ext_ops[] = {
+};
+
+static unsigned long total_usage;
+
+static bool __init invoke_need_callbacks(void)
+{
+	int i;
+	int entries = ARRAY_SIZE(page_ext_ops);
+
+	for (i = 0; i < entries; i++) {
+		if (page_ext_ops[i]->need && page_ext_ops[i]->need())
+			return true;
+	}
+
+	return false;
+}
+
+static void __init invoke_init_callbacks(void)
+{
+	int i;
+	int entries = ARRAY_SIZE(page_ext_ops);
+
+	for (i = 0; i < entries; i++) {
+		if (page_ext_ops[i]->init)
+			page_ext_ops[i]->init();
+	}
+}
+
+#if !defined(CONFIG_SPARSEMEM)
+
+
+void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
+{
+	pgdat->node_page_ext = NULL;
+}
+
+struct page_ext *lookup_page_ext(struct page *page)
+{
+	unsigned long pfn = page_to_pfn(page);
+	unsigned long offset;
+	struct page_ext *base;
+
+	base = NODE_DATA(page_to_nid(page))->node_page_ext;
+#ifdef CONFIG_DEBUG_VM
+	/*
+	 * The sanity checks the page allocator does upon freeing a
+	 * page can reach here before the page_ext arrays are
+	 * allocated when feeding a range of pages to the allocator
+	 * for the first time during bootup or memory hotplug.
+	 */
+	if (unlikely(!base))
+		return NULL;
+#endif
+	offset = pfn - round_down(node_start_pfn(page_to_nid(page)),
+					MAX_ORDER_NR_PAGES);
+	return base + offset;
+}
+
+static int __init alloc_node_page_ext(int nid)
+{
+	struct page_ext *base;
+	unsigned long table_size;
+	unsigned long nr_pages;
+
+	nr_pages = NODE_DATA(nid)->node_spanned_pages;
+	if (!nr_pages)
+		return 0;
+
+	/*
+	 * Need extra space if node range is not aligned with
+	 * MAX_ORDER_NR_PAGES. When page allocator's buddy algorithm
+	 * checks buddy's status, range could be out of exact node range.
+	 */
+	if (!IS_ALIGNED(node_start_pfn(nid), MAX_ORDER_NR_PAGES) ||
+		!IS_ALIGNED(node_end_pfn(nid), MAX_ORDER_NR_PAGES))
+		nr_pages += MAX_ORDER_NR_PAGES;
+
+	table_size = sizeof(struct page_ext) * nr_pages;
+
+	base = memblock_virt_alloc_try_nid_nopanic(
+			table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
+			BOOTMEM_ALLOC_ACCESSIBLE, nid);
+	if (!base)
+		return -ENOMEM;
+	NODE_DATA(nid)->node_page_ext = base;
+	total_usage += table_size;
+	return 0;
+}
+
+void __init page_ext_init_flatmem(void)
+{
+
+	int nid, fail;
+
+	if (!invoke_need_callbacks())
+		return;
+
+	for_each_online_node(nid)  {
+		fail = alloc_node_page_ext(nid);
+		if (fail)
+			goto fail;
+	}
+	pr_info("allocated %ld bytes of page_ext\n", total_usage);
+	invoke_init_callbacks();
+	return;
+
+fail:
+	pr_crit("allocation of page_ext failed.\n");
+	panic("Out of memory");
+}
+
+#else /* CONFIG_FLAT_NODE_MEM_MAP */
+
+struct page_ext *lookup_page_ext(struct page *page)
+{
+	unsigned long pfn = page_to_pfn(page);
+	struct mem_section *section = __pfn_to_section(pfn);
+#ifdef CONFIG_DEBUG_VM
+	/*
+	 * The sanity checks the page allocator does upon freeing a
+	 * page can reach here before the page_ext arrays are
+	 * allocated when feeding a range of pages to the allocator
+	 * for the first time during bootup or memory hotplug.
+	 */
+	if (!section->page_ext)
+		return NULL;
+#endif
+	return section->page_ext + pfn;
+}
+
+static void *__meminit alloc_page_ext(size_t size, int nid)
+{
+	gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN;
+	void *addr = NULL;
+
+	addr = alloc_pages_exact_nid(nid, size, flags);
+	if (addr) {
+		kmemleak_alloc(addr, size, 1, flags);
+		return addr;
+	}
+
+	if (node_state(nid, N_HIGH_MEMORY))
+		addr = vzalloc_node(size, nid);
+	else
+		addr = vzalloc(size);
+
+	return addr;
+}
+
+static int __meminit init_section_page_ext(unsigned long pfn, int nid)
+{
+	struct mem_section *section;
+	struct page_ext *base;
+	unsigned long table_size;
+
+	section = __pfn_to_section(pfn);
+
+	if (section->page_ext)
+		return 0;
+
+	table_size = sizeof(struct page_ext) * PAGES_PER_SECTION;
+	base = alloc_page_ext(table_size, nid);
+
+	/*
+	 * The value stored in section->page_ext is (base - pfn)
+	 * and it does not point to the memory block allocated above,
+	 * causing kmemleak false positives.
+	 */
+	kmemleak_not_leak(base);
+
+	if (!base) {
+		pr_err("page ext allocation failure\n");
+		return -ENOMEM;
+	}
+
+	/*
+	 * The passed "pfn" may not be aligned to SECTION. For the calculation
+	 * we need to apply a mask.
+	 */
+	pfn &= PAGE_SECTION_MASK;
+	section->page_ext = base - pfn;
+	total_usage += table_size;
+	return 0;
+}
+#ifdef CONFIG_MEMORY_HOTPLUG
+static void free_page_ext(void *addr)
+{
+	if (is_vmalloc_addr(addr)) {
+		vfree(addr);
+	} else {
+		struct page *page = virt_to_page(addr);
+		size_t table_size;
+
+		table_size = sizeof(struct page_ext) * PAGES_PER_SECTION;
+
+		BUG_ON(PageReserved(page));
+		free_pages_exact(addr, table_size);
+	}
+}
+
+static void __free_page_ext(unsigned long pfn)
+{
+	struct mem_section *ms;
+	struct page_ext *base;
+
+	ms = __pfn_to_section(pfn);
+	if (!ms || !ms->page_ext)
+		return;
+	base = ms->page_ext + pfn;
+	free_page_ext(base);
+	ms->page_ext = NULL;
+}
+
+static int __meminit online_page_ext(unsigned long start_pfn,
+				unsigned long nr_pages,
+				int nid)
+{
+	unsigned long start, end, pfn;
+	int fail = 0;
+
+	start = SECTION_ALIGN_DOWN(start_pfn);
+	end = SECTION_ALIGN_UP(start_pfn + nr_pages);
+
+	if (nid == -1) {
+		/*
+		 * In this case, "nid" already exists and contains valid memory.
+		 * "start_pfn" passed to us is a pfn which is an arg for
+		 * online__pages(), and start_pfn should exist.
+		 */
+		nid = pfn_to_nid(start_pfn);
+		VM_BUG_ON(!node_state(nid, N_ONLINE));
+	}
+
+	for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
+		if (!pfn_present(pfn))
+			continue;
+		fail = init_section_page_ext(pfn, nid);
+	}
+	if (!fail)
+		return 0;
+
+	/* rollback */
+	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
+		__free_page_ext(pfn);
+
+	return -ENOMEM;
+}
+
+static int __meminit offline_page_ext(unsigned long start_pfn,
+				unsigned long nr_pages, int nid)
+{
+	unsigned long start, end, pfn;
+
+	start = SECTION_ALIGN_DOWN(start_pfn);
+	end = SECTION_ALIGN_UP(start_pfn + nr_pages);
+
+	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
+		__free_page_ext(pfn);
+	return 0;
+
+}
+
+static int __meminit page_ext_callback(struct notifier_block *self,
+			       unsigned long action, void *arg)
+{
+	struct memory_notify *mn = arg;
+	int ret = 0;
+
+	switch (action) {
+	case MEM_GOING_ONLINE:
+		ret = online_page_ext(mn->start_pfn,
+				   mn->nr_pages, mn->status_change_nid);
+		break;
+	case MEM_OFFLINE:
+		offline_page_ext(mn->start_pfn,
+				mn->nr_pages, mn->status_change_nid);
+		break;
+	case MEM_CANCEL_ONLINE:
+		offline_page_ext(mn->start_pfn,
+				mn->nr_pages, mn->status_change_nid);
+		break;
+	case MEM_GOING_OFFLINE:
+		break;
+	case MEM_ONLINE:
+	case MEM_CANCEL_OFFLINE:
+		break;
+	}
+
+	return notifier_from_errno(ret);
+}
+
+#endif
+
+void __init page_ext_init(void)
+{
+	unsigned long pfn;
+	int nid;
+
+	if (!invoke_need_callbacks())
+		return;
+
+	for_each_node_state(nid, N_MEMORY) {
+		unsigned long start_pfn, end_pfn;
+
+		start_pfn = node_start_pfn(nid);
+		end_pfn = node_end_pfn(nid);
+		/*
+		 * start_pfn and end_pfn may not be aligned to SECTION and the
+		 * page->flags of out of node pages are not initialized.  So we
+		 * scan [start_pfn, the biggest section's pfn < end_pfn) here.
+		 */
+		for (pfn = start_pfn; pfn < end_pfn;
+			pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) {
+
+			if (!pfn_valid(pfn))
+				continue;
+			/*
+			 * Nodes's pfns can be overlapping.
+			 * We know some arch can have a nodes layout such as
+			 * -------------pfn-------------->
+			 * N0 | N1 | N2 | N0 | N1 | N2|....
+			 */
+			if (pfn_to_nid(pfn) != nid)
+				continue;
+			if (init_section_page_ext(pfn, nid))
+				goto oom;
+		}
+	}
+	hotplug_memory_notifier(page_ext_callback, 0);
+	pr_info("allocated %ld bytes of page_ext\n", total_usage);
+	invoke_init_callbacks();
+	return;
+
+oom:
+	panic("Out of memory");
+}
+
+void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
+{
+}
+
+#endif
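[Editor's illustration, not part of the patch] The "section->page_ext = base - pfn" assignment in init_section_page_ext() and the matching "section->page_ext + pfn" in the SPARSEMEM lookup_page_ext() amount to storing a pointer biased by the section's first pfn, so lookups can index with the raw pfn and need no subtraction. A standalone userspace sketch of that indexing scheme, with made-up constants; like the kernel code it forms an out-of-range intermediate pointer, which is harmless on a flat address space but is shown here purely as a model:

	#include <assert.h>
	#include <stdlib.h>

	#define PAGES_PER_SECTION	64UL			/* made-up section size */
	#define PAGE_SECTION_MASK	(~(PAGES_PER_SECTION - 1))

	struct page_ext { unsigned long flags; };

	static struct page_ext *section_page_ext;	/* models mem_section::page_ext */

	/* Models init_section_page_ext(): one table per section, stored biased
	 * by the section's first pfn. */
	static void init_section(unsigned long pfn)
	{
		struct page_ext *base = calloc(PAGES_PER_SECTION, sizeof(*base));

		pfn &= PAGE_SECTION_MASK;		/* align down to the section start */
		section_page_ext = base - pfn;		/* stored value is (base - pfn) */
	}

	/* Models the SPARSEMEM lookup_page_ext(): index directly by pfn. */
	static struct page_ext *lookup(unsigned long pfn)
	{
		return section_page_ext + pfn;
	}

	int main(void)
	{
		init_section(130);			/* any pfn inside section [128, 192) */
		assert(lookup(130) - lookup(128) == 2);	/* consecutive pfns -> consecutive entries */
		lookup(128)->flags = 1UL;		/* first entry of the allocated table */
		return 0;
	}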