path: root/kernel/bpf/syscall.c
author    Martynas Pumputis <m@lambda.lt>          2019-03-18 11:10:26 -0400
committer Daniel Borkmann <daniel@iogearbox.net>   2019-03-18 11:48:25 -0400
commit    f01a7dbe98ae4265023fa5d3af0f076f0b18a647
tree      1a3ed6ee305519403b394ceebde3c87013571f86 /kernel/bpf/syscall.c
parent    ea239314fe42ace880bdd834256834679346c80e
bpf: Try harder when allocating memory for large maps
It has been observed that sometimes a higher order memory allocation for BPF
maps fails when there is no obvious memory pressure in a system. E.g. the map
(BPF_MAP_TYPE_LRU_HASH, key=38, value=56, max_elems=524288) could not be
created due to vmalloc unable to allocate 75497472B, when the system's memory
consumption (in MB) was the following:

    Total: 3942  Used: 837 (21.24%)  Free: 138  Buffers: 239  Cached: 2727

Later analysis [1] by Michal Hocko showed that the vmalloc was not trying to
reclaim memory from the page cache and was failing prematurely due to
__GFP_NORETRY.

Considering dcda9b0471 ("mm, tree wide: replace __GFP_REPEAT by
__GFP_RETRY_MAYFAIL with more useful semantic") and [1], we can replace
__GFP_NORETRY with __GFP_RETRY_MAYFAIL, as it won't invoke the OOM killer
and will try harder to fulfil allocation requests.

Unfortunately, replacing the body of the BPF map memory allocation function
with the kvmalloc_node helper function is not an option at this point in
time, given 1) kmalloc is non-optional for higher order allocations, and
2) passing __GFP_RETRY_MAYFAIL to the kmalloc would stress the slab
allocator too much for large requests.

The change has been tested with the workloads mentioned above and by
observing the oom_kill value from /proc/vmstat.

[1]: https://lore.kernel.org/bpf/20190310071318.GW5232@dhcp22.suse.cz/

Signed-off-by: Martynas Pumputis <m@lambda.lt>
Acked-by: Yonghong Song <yhs@fb.com>
Cc: Michal Hocko <mhocko@suse.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20190318153940.GL8924@dhcp22.suse.cz/
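For context, the failing map from the commit message can be requested from
userspace via the raw bpf(2) syscall, which ends up in the
bpf_map_area_alloc() path patched below. This is a minimal sketch, not part
of the patch; it assumes a kernel with BPF_MAP_TYPE_LRU_HASH support and
sufficient privileges (root/CAP_SYS_ADMIN on typical configs):

/* Sketch (not from the patch): create the LRU hash map described in the
 * commit message. On pre-patch kernels this could fail with ENOMEM even
 * without real memory pressure, due to __GFP_NORETRY on the vmalloc path.
 */
#include <linux/bpf.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
        union bpf_attr attr;
        int fd;

        memset(&attr, 0, sizeof(attr));
        attr.map_type    = BPF_MAP_TYPE_LRU_HASH;
        attr.key_size    = 38;
        attr.value_size  = 56;
        attr.max_entries = 524288;

        fd = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
        if (fd < 0) {
                perror("BPF_MAP_CREATE");       /* pre-patch: possible ENOMEM */
                return 1;
        }
        printf("created map, fd=%d\n", fd);
        close(fd);
        return 0;
}

Run on an affected kernel with a large page cache, this should reproduce the
failure described above; with the patch applied, the allocation should
succeed by reclaiming from the page cache instead of bailing out early.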
Diffstat (limited to 'kernel/bpf/syscall.c')
-rw-r--r--  kernel/bpf/syscall.c | 22 +++++++++++++++-------
1 file changed, 15 insertions(+), 7 deletions(-)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 62f6bced3a3c..afca36f53c49 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -136,21 +136,29 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
 
 void *bpf_map_area_alloc(size_t size, int numa_node)
 {
-	/* We definitely need __GFP_NORETRY, so OOM killer doesn't
-	 * trigger under memory pressure as we really just want to
-	 * fail instead.
+	/* We really just want to fail instead of triggering OOM killer
+	 * under memory pressure, therefore we set __GFP_NORETRY to kmalloc,
+	 * which is used for lower order allocation requests.
+	 *
+	 * It has been observed that higher order allocation requests done by
+	 * vmalloc with __GFP_NORETRY being set might fail due to not trying
+	 * to reclaim memory from the page cache, thus we set
+	 * __GFP_RETRY_MAYFAIL to avoid such situations.
 	 */
-	const gfp_t flags = __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO;
+
+	const gfp_t flags = __GFP_NOWARN | __GFP_ZERO;
 	void *area;
 
 	if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
-		area = kmalloc_node(size, GFP_USER | flags, numa_node);
+		area = kmalloc_node(size, GFP_USER | __GFP_NORETRY | flags,
+				    numa_node);
 		if (area != NULL)
 			return area;
 	}
 
-	return __vmalloc_node_flags_caller(size, numa_node, GFP_KERNEL | flags,
-					   __builtin_return_address(0));
+	return __vmalloc_node_flags_caller(size, numa_node,
+					   GFP_KERNEL | __GFP_RETRY_MAYFAIL |
+					   flags, __builtin_return_address(0));
 }
 
 void bpf_map_area_free(void *area)
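For readers wondering where the size cut-off sits: PAGE_ALLOC_COSTLY_ORDER is
3 (include/linux/mmzone.h), so with 4 KiB pages the kmalloc branch handles
requests up to 32768 bytes, and the 75497472 B request from the commit
message goes straight to the vmalloc path. Below is a small userspace mimic
of that decision, illustrative only; the constants are assumptions for a
typical x86-64 config, and unlike the real function, which falls through to
vmalloc when kmalloc_node() fails, this only reports the first attempt:

/* Userspace mimic (assumptions: 4 KiB pages, PAGE_ALLOC_COSTLY_ORDER=3 as
 * in v5.0-era kernels) of which allocator bpf_map_area_alloc() tries first
 * after this patch.
 */
#include <stdio.h>

#define PAGE_SIZE               4096UL  /* assumption: x86-64 default */
#define PAGE_ALLOC_COSTLY_ORDER 3       /* include/linux/mmzone.h */

static const char *first_try(unsigned long size)
{
        if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER))     /* <= 32 KiB */
                return "kmalloc_node(GFP_USER | __GFP_NORETRY | ...)";
        return "vmalloc(GFP_KERNEL | __GFP_RETRY_MAYFAIL | ...)";
}

int main(void)
{
        printf("%8lu B -> %s\n", 16384UL, first_try(16384UL));
        printf("%8lu B -> %s\n", 75497472UL, first_try(75497472UL));
        return 0;
}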