mm/vmalloc.c: preload a CPU with one object for split purpose

Refactor the NE_FIT_TYPE split case when it comes to an allocation of one extra object. We need it in order to build a remaining space. The preload is done per CPU in non-atomic context with GFP_KERNEL flags. More permissive parameters can be beneficial for systems which suffer from high memory pressure or low memory conditions. For example, on my KVM system (4xCPUs, no swap, 256MB RAM) I can simulate the failure of page allocation with GFP_NOWAIT flags. Using the "stress-ng" tool and starting N workers spinning on fork() and exit(), I can trigger the trace below: <snip> [ 179.815161] stress-ng-fork: page allocation failure: order:0, mode:0x40800(GFP_NOWAIT|__GFP_COMP), nodemask=(null),cpuset=/,mems_allowed=0 [ 179.815168] CPU: 0 PID: 12612 Comm: stress-ng-fork Not tainted 5.2.0-rc3+ #1003 [ 179.815170] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014 [ 179.815171] Call Trace: [ 179.815178] dump_stack+0x5c/0x7b [ 179.815182] warn_alloc+0x108/0x190 [ 179.815187] __alloc_pages_slowpath+0xdc7/0xdf0 [ 179.815191] __alloc_pages_nodemask+0x2de/0x330 [ 179.815194] cache_grow_begin+0x77/0x420 [ 179.815197] fallback_alloc+0x161/0x200 [ 179.815200] kmem_cache_alloc+0x1c9/0x570 [ 179.815202] alloc_vmap_area+0x32c/0x990 [ 179.815206] __get_vm_area_node+0xb0/0x170 [ 179.815208] __vmalloc_node_range+0x6d/0x230 [ 179.815211] ? _do_fork+0xce/0x3d0 [ 179.815213] copy_process.part.46+0x850/0x1b90 [ 179.815215] ? _do_fork+0xce/0x3d0 [ 179.815219] _do_fork+0xce/0x3d0 [ 179.815226] ? __do_page_fault+0x2bf/0x4e0 [ 179.815229] do_syscall_64+0x55/0x130 [ 179.815231] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 179.815234] RIP: 0033:0x7fedec4c738b ... 
[ 179.815237] RSP: 002b:00007ffda469d730 EFLAGS: 00000246 ORIG_RAX: 0000000000000038 [ 179.815239] RAX: ffffffffffffffda RBX: 00007ffda469d730 RCX: 00007fedec4c738b [ 179.815240] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000001200011 [ 179.815241] RBP: 00007ffda469d780 R08: 00007fededd6e300 R09: 00007ffda47f50a0 [ 179.815242] R10: 00007fededd6e5d0 R11: 0000000000000246 R12: 0000000000000000 [ 179.815243] R13: 0000000000000020 R14: 0000000000000000 R15: 0000000000000000 [ 179.815245] Mem-Info: [ 179.815249] active_anon:12686 inactive_anon:14760 isolated_anon:0 active_file:502 inactive_file:61 isolated_file:70 unevictable:2 dirty:0 writeback:0 unstable:0 slab_reclaimable:2380 slab_unreclaimable:7520 mapped:15069 shmem:14813 pagetables:10833 bounce:0 free:1922 free_pcp:229 free_cma:0 <snip> Link: http://lkml.kernel.org/r/20190606120411.8298-3-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com> Cc: Hillf Danton <hdanton@sina.com> Cc: Matthew Wilcox <willy@infradead.org> Cc: Michal Hocko <mhocko@suse.com> Cc: Oleksiy Avramchenko <oleksiy.avramchenko@sonymobile.com> Cc: Roman Gushchin <guro@fb.com> Cc: Steven Rostedt <rostedt@goodmis.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
author: Uladzislau Rezki (Sony) <urezki@gmail.com> 2019-07-11 23:58:57 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2019-07-12 14:05:46 -0400
commit: 82dd23e84be3ead53b6d584d836f51852d1096e6 (patch)
tree: dfc114bbf18fcf8cdc72a84a46ce6633cf55b6da /mm
parent: cacca6baf0b0a2dfe8eb3430b5f81916f35284cc (diff)
1 files changed, 51 insertions, 4 deletions
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index b645686ef9b6..45e0dc0e09f8 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -365,6 +365,13 @@ static LIST_HEAD(free_vmap_area_list);
 */
 static struct rb_root free_vmap_area_root = RB_ROOT;
+/*
+ * Preload a CPU with one object for "no edge" split case. The
+ * aim is to get rid of allocations from the atomic context, thus
+ * to use more permissive allocation masks.
+ */
+static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node);
 static __always_inline unsigned long
 va_size(struct vmap_area *va)
 {
@@ -951,9 +958,24 @@ adjust_va_to_fit_type(struct vmap_area *va,
                 *   L V  NVA  V R
                 * |---|-------|---|
                 */
-                lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
+                lva = __this_cpu_xchg(ne_fit_preload_node, NULL);
-                if (unlikely(!lva))
+                if (unlikely(!lva)) {
-                        return -1;
+                        /*
+                         * For percpu allocator we do not do any pre-allocation
+                         * and leave it as it is. The reason is it most likely
+                         * never ends up with NE_FIT_TYPE splitting. In case of
+                         * percpu allocations offsets and sizes are aligned to
+                         * fixed align request, i.e. RE_FIT_TYPE and FL_FIT_TYPE
+                         * are its main fitting cases.
+                         *
+                         * There are a few exceptions though, as an example it is
+                         * a first allocation (early boot up) when we have "one"
+                         * big free space that has to be split.
+                         */
+                        lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
+                        if (!lva)
+                                return -1;
+                }
                /*
                 * Build the remainder.
@@ -1032,7 +1054,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
                                unsigned long vstart, unsigned long vend,
                                int node, gfp_t gfp_mask)
 {
-        struct vmap_area *va;
+        struct vmap_area *va, *pva;
        unsigned long addr;
        int purged = 0;
@@ -1057,7 +1079,32 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
        kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask & GFP_RECLAIM_MASK);
 retry:
+        /*
+         * Preload this CPU with one extra vmap_area object to ensure
+         * that we have it available when fit type of free area is
+         * NE_FIT_TYPE.
+         *
+         * The preload is done in non-atomic context, thus it allows us
+         * to use more permissive allocation masks to be more stable under
+         * low memory condition and high memory pressure.
+         *
+         * Even if it fails we do not really care about that. Just proceed
+         * as it is. "overflow" path will refill the cache we allocate from.
+         */
+        preempt_disable();
+        if (!__this_cpu_read(ne_fit_preload_node)) {
+                preempt_enable();
+                pva = kmem_cache_alloc_node(vmap_area_cachep, GFP_KERNEL, node);
+                preempt_disable();
+                if (__this_cpu_cmpxchg(ne_fit_preload_node, NULL, pva)) {
+                        if (pva)
+                                kmem_cache_free(vmap_area_cachep, pva);
+                }
+        }
        spin_lock(&vmap_area_lock);
+        preempt_enable();
        /*
         * If an allocation fails, the "vend" address is
author	Uladzislau Rezki (Sony) <urezki@gmail.com>	2019-07-11 23:58:57 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2019-07-12 14:05:46 -0400
commit	82dd23e84be3ead53b6d584d836f51852d1096e6 (patch)
tree	dfc114bbf18fcf8cdc72a84a46ce6633cf55b6da /mm
parent	cacca6baf0b0a2dfe8eb3430b5f81916f35284cc (diff)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c index b645686ef9b6..45e0dc0e09f8 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c
@@ -365,6 +365,13 @@ static LIST_HEAD(free_vmap_area_list);
365	*/	365	*/
366	static struct rb_root free_vmap_area_root = RB_ROOT;	366	static struct rb_root free_vmap_area_root = RB_ROOT;
367		367
		368	/*
		369	* Preload a CPU with one object for "no edge" split case. The
		370	* aim is to get rid of allocations from the atomic context, thus
		371	* to use more permissive allocation masks.
		372	*/
		373	static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node);
		374
368	static __always_inline unsigned long	375	static __always_inline unsigned long
369	va_size(struct vmap_area *va)	376	va_size(struct vmap_area *va)
370	{	377	{
@@ -951,9 +958,24 @@ adjust_va_to_fit_type(struct vmap_area *va,
951	* L V NVA V R	958	* L V NVA V R
952	* |---|-------|---|	959	* |---|-------|---|
953	*/	960	*/
954	lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);	961	lva = __this_cpu_xchg(ne_fit_preload_node, NULL);
955	if (unlikely(!lva))	962	if (unlikely(!lva)) {
956	return -1;	963	/*
		964	* For percpu allocator we do not do any pre-allocation
		965	* and leave it as it is. The reason is it most likely
		966	* never ends up with NE_FIT_TYPE splitting. In case of
		967	* percpu allocations offsets and sizes are aligned to
		968	* fixed align request, i.e. RE_FIT_TYPE and FL_FIT_TYPE
		969	* are its main fitting cases.
		970	*
		971	* There are a few exceptions though, as an example it is
		972	* a first allocation (early boot up) when we have "one"
		973	* big free space that has to be split.
		974	*/
		975	lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
		976	if (!lva)
		977	return -1;
		978	}
957		979
958	/*	980	/*
959	* Build the remainder.	981	* Build the remainder.
@@ -1032,7 +1054,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
1032	unsigned long vstart, unsigned long vend,	1054	unsigned long vstart, unsigned long vend,
1033	int node, gfp_t gfp_mask)	1055	int node, gfp_t gfp_mask)
1034	{	1056	{
1035	struct vmap_area *va;	1057	struct vmap_area *va, *pva;
1036	unsigned long addr;	1058	unsigned long addr;
1037	int purged = 0;	1059	int purged = 0;
1038		1060
@@ -1057,7 +1079,32 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
1057	kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask & GFP_RECLAIM_MASK);	1079	kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask & GFP_RECLAIM_MASK);
1058		1080
1059	retry:	1081	retry:
		1082	/*
		1083	* Preload this CPU with one extra vmap_area object to ensure
		1084	* that we have it available when fit type of free area is
		1085	* NE_FIT_TYPE.
		1086	*
		1087	* The preload is done in non-atomic context, thus it allows us
		1088	* to use more permissive allocation masks to be more stable under
		1089	* low memory condition and high memory pressure.
		1090	*
		1091	* Even if it fails we do not really care about that. Just proceed
		1092	* as it is. "overflow" path will refill the cache we allocate from.
		1093	*/
		1094	preempt_disable();
		1095	if (!__this_cpu_read(ne_fit_preload_node)) {
		1096	preempt_enable();
		1097	pva = kmem_cache_alloc_node(vmap_area_cachep, GFP_KERNEL, node);
		1098	preempt_disable();
		1099
		1100	if (__this_cpu_cmpxchg(ne_fit_preload_node, NULL, pva)) {
		1101	if (pva)
		1102	kmem_cache_free(vmap_area_cachep, pva);
		1103	}
		1104	}
		1105
1060	spin_lock(&vmap_area_lock);	1106	spin_lock(&vmap_area_lock);
		1107	preempt_enable();
1061		1108
1062	/*	1109	/*
1063	* If an allocation fails, the "vend" address is	1110	* If an allocation fails, the "vend" address is