summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorUladzislau Rezki (Sony) <urezki@gmail.com>2019-07-11 23:58:57 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2019-07-12 14:05:46 -0400
commit82dd23e84be3ead53b6d584d836f51852d1096e6 (patch)
treedfc114bbf18fcf8cdc72a84a46ce6633cf55b6da /mm
parentcacca6baf0b0a2dfe8eb3430b5f81916f35284cc (diff)
mm/vmalloc.c: preload a CPU with one object for split purpose
Refactor the NE_FIT_TYPE split case when it comes to an allocation of one extra object. We need it in order to build a remaining space. The preload is done per CPU in non-atomic context with GFP_KERNEL flags. More permissive parameters can be beneficial for systems which suffer from high memory pressure or low memory conditions. For example, on my KVM system (4xCPUs, no swap, 256MB RAM) I can simulate the failure of page allocation with GFP_NOWAIT flags. Using the "stress-ng" tool and starting N workers spinning on fork() and exit(), I can trigger the trace below: <snip> [ 179.815161] stress-ng-fork: page allocation failure: order:0, mode:0x40800(GFP_NOWAIT|__GFP_COMP), nodemask=(null),cpuset=/,mems_allowed=0 [ 179.815168] CPU: 0 PID: 12612 Comm: stress-ng-fork Not tainted 5.2.0-rc3+ #1003 [ 179.815170] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014 [ 179.815171] Call Trace: [ 179.815178] dump_stack+0x5c/0x7b [ 179.815182] warn_alloc+0x108/0x190 [ 179.815187] __alloc_pages_slowpath+0xdc7/0xdf0 [ 179.815191] __alloc_pages_nodemask+0x2de/0x330 [ 179.815194] cache_grow_begin+0x77/0x420 [ 179.815197] fallback_alloc+0x161/0x200 [ 179.815200] kmem_cache_alloc+0x1c9/0x570 [ 179.815202] alloc_vmap_area+0x32c/0x990 [ 179.815206] __get_vm_area_node+0xb0/0x170 [ 179.815208] __vmalloc_node_range+0x6d/0x230 [ 179.815211] ? _do_fork+0xce/0x3d0 [ 179.815213] copy_process.part.46+0x850/0x1b90 [ 179.815215] ? _do_fork+0xce/0x3d0 [ 179.815219] _do_fork+0xce/0x3d0 [ 179.815226] ? __do_page_fault+0x2bf/0x4e0 [ 179.815229] do_syscall_64+0x55/0x130 [ 179.815231] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 179.815234] RIP: 0033:0x7fedec4c738b ... 
[ 179.815237] RSP: 002b:00007ffda469d730 EFLAGS: 00000246 ORIG_RAX: 0000000000000038 [ 179.815239] RAX: ffffffffffffffda RBX: 00007ffda469d730 RCX: 00007fedec4c738b [ 179.815240] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000001200011 [ 179.815241] RBP: 00007ffda469d780 R08: 00007fededd6e300 R09: 00007ffda47f50a0 [ 179.815242] R10: 00007fededd6e5d0 R11: 0000000000000246 R12: 0000000000000000 [ 179.815243] R13: 0000000000000020 R14: 0000000000000000 R15: 0000000000000000 [ 179.815245] Mem-Info: [ 179.815249] active_anon:12686 inactive_anon:14760 isolated_anon:0 active_file:502 inactive_file:61 isolated_file:70 unevictable:2 dirty:0 writeback:0 unstable:0 slab_reclaimable:2380 slab_unreclaimable:7520 mapped:15069 shmem:14813 pagetables:10833 bounce:0 free:1922 free_pcp:229 free_cma:0 <snip> Link: http://lkml.kernel.org/r/20190606120411.8298-3-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com> Cc: Hillf Danton <hdanton@sina.com> Cc: Matthew Wilcox <willy@infradead.org> Cc: Michal Hocko <mhocko@suse.com> Cc: Oleksiy Avramchenko <oleksiy.avramchenko@sonymobile.com> Cc: Roman Gushchin <guro@fb.com> Cc: Steven Rostedt <rostedt@goodmis.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r--mm/vmalloc.c55
1 files changed, 51 insertions, 4 deletions
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index b645686ef9b6..45e0dc0e09f8 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -365,6 +365,13 @@ static LIST_HEAD(free_vmap_area_list);
365 */ 365 */
366static struct rb_root free_vmap_area_root = RB_ROOT; 366static struct rb_root free_vmap_area_root = RB_ROOT;
367 367
368/*
369 * Preload a CPU with one object for "no edge" split case. The
370 * aim is to get rid of allocations from the atomic context, thus
371 * to use more permissive allocation masks.
372 */
373static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node);
374
368static __always_inline unsigned long 375static __always_inline unsigned long
369va_size(struct vmap_area *va) 376va_size(struct vmap_area *va)
370{ 377{
@@ -951,9 +958,24 @@ adjust_va_to_fit_type(struct vmap_area *va,
951 * L V NVA V R 958 * L V NVA V R
952 * |---|-------|---| 959 * |---|-------|---|
953 */ 960 */
954 lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT); 961 lva = __this_cpu_xchg(ne_fit_preload_node, NULL);
955 if (unlikely(!lva)) 962 if (unlikely(!lva)) {
956 return -1; 963 /*
964 * For percpu allocator we do not do any pre-allocation
965 * and leave it as it is. The reason is it most likely
966 * never ends up with NE_FIT_TYPE splitting. In case of
967 * percpu allocations offsets and sizes are aligned to
968 * fixed align request, i.e. RE_FIT_TYPE and FL_FIT_TYPE
969 * are its main fitting cases.
970 *
971 * There are a few exceptions though, as an example it is
972 * a first allocation (early boot up) when we have "one"
973 * big free space that has to be split.
974 */
975 lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
976 if (!lva)
977 return -1;
978 }
957 979
958 /* 980 /*
959 * Build the remainder. 981 * Build the remainder.
@@ -1032,7 +1054,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
1032 unsigned long vstart, unsigned long vend, 1054 unsigned long vstart, unsigned long vend,
1033 int node, gfp_t gfp_mask) 1055 int node, gfp_t gfp_mask)
1034{ 1056{
1035 struct vmap_area *va; 1057 struct vmap_area *va, *pva;
1036 unsigned long addr; 1058 unsigned long addr;
1037 int purged = 0; 1059 int purged = 0;
1038 1060
@@ -1057,7 +1079,32 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
1057 kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask & GFP_RECLAIM_MASK); 1079 kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask & GFP_RECLAIM_MASK);
1058 1080
1059retry: 1081retry:
1082 /*
1083 * Preload this CPU with one extra vmap_area object to ensure
1084 * that we have it available when fit type of free area is
1085 * NE_FIT_TYPE.
1086 *
1087 * The preload is done in non-atomic context, thus it allows us
1088 * to use more permissive allocation masks to be more stable under
1089 * low memory condition and high memory pressure.
1090 *
1091 * Even if it fails we do not really care about that. Just proceed
1092 * as it is. "overflow" path will refill the cache we allocate from.
1093 */
1094 preempt_disable();
1095 if (!__this_cpu_read(ne_fit_preload_node)) {
1096 preempt_enable();
1097 pva = kmem_cache_alloc_node(vmap_area_cachep, GFP_KERNEL, node);
1098 preempt_disable();
1099
1100 if (__this_cpu_cmpxchg(ne_fit_preload_node, NULL, pva)) {
1101 if (pva)
1102 kmem_cache_free(vmap_area_cachep, pva);
1103 }
1104 }
1105
1060 spin_lock(&vmap_area_lock); 1106 spin_lock(&vmap_area_lock);
1107 preempt_enable();
1061 1108
1062 /* 1109 /*
1063 * If an allocation fails, the "vend" address is 1110 * If an allocation fails, the "vend" address is