From e30825f1869a75b29a69dc8e0aaaaccc492092cf Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Fri, 12 Dec 2014 16:55:49 -0800 Subject: mm/debug-pagealloc: prepare boottime configurable on/off Until now, debug-pagealloc needs extra flags in struct page, so we need to recompile whole source code when we decide to use it. This is really painful, because it takes some time to recompile and sometimes rebuild is not possible due to third party module depending on struct page. So, we can't use this good feature in many cases. Now, we have the page extension feature that allows us to insert extra flags to outside of struct page. This gets rid of third party module issue mentioned above. And, this allows us to determine if we need extra memory for this page extension in boottime. With these property, we can avoid using debug-pagealloc in boottime with low computational overhead in the kernel built with CONFIG_DEBUG_PAGEALLOC. This will help our development process greatly. This patch is the preparation step to achive above goal. debug-pagealloc originally uses extra field of struct page, but, after this patch, it will use field of struct page_ext. Because memory for page_ext is allocated later than initialization of page allocator in CONFIG_SPARSEMEM, we should disable debug-pagealloc feature temporarily until initialization of page_ext. This patch implements this. Signed-off-by: Joonsoo Kim Cc: Mel Gorman Cc: Johannes Weiner Cc: Minchan Kim Cc: Dave Hansen Cc: Michal Nazarewicz Cc: Jungsoo Son Cc: Ingo Molnar Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 3b337efbe533..66560f1a0564 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -19,6 +19,7 @@ #include #include #include +#include struct mempolicy; struct anon_vma; @@ -2155,20 +2156,36 @@ extern void copy_user_huge_page(struct page *dst, struct page *src, unsigned int pages_per_huge_page); #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ +extern struct page_ext_operations debug_guardpage_ops; +extern struct page_ext_operations page_poisoning_ops; + #ifdef CONFIG_DEBUG_PAGEALLOC extern unsigned int _debug_guardpage_minorder; +extern bool _debug_guardpage_enabled; static inline unsigned int debug_guardpage_minorder(void) { return _debug_guardpage_minorder; } +static inline bool debug_guardpage_enabled(void) +{ + return _debug_guardpage_enabled; +} + static inline bool page_is_guard(struct page *page) { - return test_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); + struct page_ext *page_ext; + + if (!debug_guardpage_enabled()) + return false; + + page_ext = lookup_page_ext(page); + return test_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags); } #else static inline unsigned int debug_guardpage_minorder(void) { return 0; } +static inline bool debug_guardpage_enabled(void) { return false; } static inline bool page_is_guard(struct page *page) { return false; } #endif /* CONFIG_DEBUG_PAGEALLOC */ -- cgit v1.2.2 From 031bc5743f158b2d5498294f489e534a31251626 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Fri, 12 Dec 2014 16:55:52 -0800 Subject: mm/debug-pagealloc: make debug-pagealloc boottime configurable Now, we have prepared to avoid using debug-pagealloc in boottime. So introduce new kernel-parameter to disable debug-pagealloc in boottime, and makes related functions to be disabled in this case. Only non-intuitive part is change of guard page functions. Because guard page is effective only if debug-pagealloc is enabled, turning off according to debug-pagealloc is reasonable thing to do. Signed-off-by: Joonsoo Kim Cc: Mel Gorman Cc: Johannes Weiner Cc: Minchan Kim Cc: Dave Hansen Cc: Michal Nazarewicz Cc: Jungsoo Son Cc: Ingo Molnar Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 66560f1a0564..8b8d77a1532f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2061,7 +2061,22 @@ static inline void vm_stat_account(struct mm_struct *mm, #endif /* CONFIG_PROC_FS */ #ifdef CONFIG_DEBUG_PAGEALLOC -extern void kernel_map_pages(struct page *page, int numpages, int enable); +extern bool _debug_pagealloc_enabled; +extern void __kernel_map_pages(struct page *page, int numpages, int enable); + +static inline bool debug_pagealloc_enabled(void) +{ + return _debug_pagealloc_enabled; +} + +static inline void +kernel_map_pages(struct page *page, int numpages, int enable) +{ + if (!debug_pagealloc_enabled()) + return; + + __kernel_map_pages(page, numpages, enable); +} #ifdef CONFIG_HIBERNATION extern bool kernel_page_present(struct page *page); #endif /* CONFIG_HIBERNATION */ -- cgit v1.2.2 From 6b4f7799c6a5703ac6b8c0649f4c22f00fa07513 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 12 Dec 2014 16:56:13 -0800 Subject: mm: vmscan: invoke slab shrinkers from shrink_zone() The slab shrinkers are currently invoked from the zonelist walkers in kswapd, direct reclaim, and zone reclaim, all of which roughly gauge the eligible LRU pages and assemble a nodemask to pass to NUMA-aware shrinkers, which then again have to walk over the nodemask. This is redundant code, extra runtime work, and fairly inaccurate when it comes to the estimation of actually scannable LRU pages. The code duplication will only get worse when making the shrinkers cgroup-aware and requiring them to have out-of-band cgroup hierarchy walks as well. Instead, invoke the shrinkers from shrink_zone(), which is where all reclaimers end up, to avoid this duplication. Take the count for eligible LRU pages out of get_scan_count(), which considers many more factors than just the availability of swap space, like zone_reclaimable_pages() currently does. Accumulate the number over all visited lruvecs to get the per-zone value. Some nodes have multiple zones due to memory addressing restrictions. To avoid putting too much pressure on the shrinkers, only invoke them once for each such node, using the class zone of the allocation as the pivot zone. For now, this integrates the slab shrinking better into the reclaim logic and gets rid of duplicative invocations from kswapd, direct reclaim, and zone reclaim. It also prepares for cgroup-awareness, allowing memcg-capable shrinkers to be added at the lruvec level without much duplication of both code and runtime work. This changes kswapd behavior, which used to invoke the shrinkers for each zone, but with scan ratios gathered from the entire node, resulting in meaningless pressure quantities on multi-zone nodes. Zone reclaim behavior also changes. It used to shrink slabs until the same amount of pages were shrunk as were reclaimed from the LRUs. Now it merely invokes the shrinkers once with the zone's scan ratio, which makes the shrinkers go easier on caches that implement aging and would prefer feeding back pressure from recently used slab objects to unused LRU pages. [vdavydov@parallels.com: assure class zone is populated] Signed-off-by: Johannes Weiner Cc: Dave Chinner Signed-off-by: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 8b8d77a1532f..c0a67b894c4c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2110,9 +2110,9 @@ int drop_caches_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); #endif -unsigned long shrink_slab(struct shrink_control *shrink, - unsigned long nr_pages_scanned, - unsigned long lru_pages); +unsigned long shrink_node_slabs(gfp_t gfp_mask, int nid, + unsigned long nr_scanned, + unsigned long nr_eligible); #ifndef CONFIG_MMU #define randomize_va_space 0 -- cgit v1.2.2