author	Alexei Starovoitov <ast@fb.com>	2016-03-08 00:57:17 -0500
committer	David S. Miller <davem@davemloft.net>	2016-03-08 15:28:31 -0500
commit	557c0c6e7df8e14a46bd7560d193fa5bbc00a858
tree	d771d9c8b211cb25c33e8bf55fcf4609f8b8b783
parent	823707b68d6e6c4b1be619b039c7045fef1740e6
bpf: convert stackmap to pre-allocation
It was observed that calling bpf_get_stackid() from a kprobe inside slub
or from spin_unlock causes a similar deadlock to the one seen with hashmap,
therefore convert stackmap to use pre-allocated memory.

call_rcu is no longer a feasible mechanism, since delayed freeing causes
bpf_get_stackid() to fail unpredictably when the number of actual stacks is
significantly less than the user-requested max_entries.

Since elements are no longer freed into slub, we can push elements into the
freelist immediately and let them be recycled. However, a very unlikely race
between user space map_lookup() and program-side recycling is possible:

     cpu0                               cpu1
     ----                               ----
     user does lookup(stackidX)
     starts copying ips into buffer
                                        delete(stackidX)
                                        calls bpf_get_stackid()
                                        which recycles the element and
                                        overwrites it with a new stack trace

To avoid user space seeing a partial stack trace consisting of two merged
stack traces, do bucket = xchg(, NULL); copy; xchg(, bucket); to preserve
consistent stack trace delivery to user space. Now we can move the memset(,0)
of the left-over element value from the critical path of bpf_get_stackid()
into the slow path of the user space lookup. Also disallow lookup() from bpf
programs, since it is useless and a program should not be messing with the
collected stack trace.

Note that a similar race between user space lookup and kernel-side updates is
also present in hashmap, but it is not a new race: bpf programs have always
been allowed to modify hash and array map elements while user space is
copying them.

Fixes: d5a3b1f69186 ("bpf: introduce BPF_MAP_TYPE_STACK_TRACE")
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
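For illustration only, here is a minimal user-space sketch of the
detach/copy/re-attach pattern described above. The bucket struct, the single
slot variable and copy_bucket() are hypothetical names that do not exist in
the kernel sources, and GCC's __atomic_exchange_n stands in for the kernel's
xchg():

/* minimal sketch: a user-space model of the xchg-based consistent copy */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct bucket {
	uint32_t nr;		/* number of valid entries in ip[] */
	uint64_t ip[8];		/* stored stack trace */
};

/* one map slot; the real map has n_buckets of these */
static struct bucket *slot;

/* user-space lookup: detach the bucket, copy it, then put it back.
 * A concurrent writer that finds the slot empty pops a fresh element
 * from the freelist instead of recycling this one, so the copy never
 * observes a half-overwritten stack trace. */
static int copy_bucket(uint64_t *value, size_t value_size)
{
	struct bucket *b = __atomic_exchange_n(&slot, NULL, __ATOMIC_SEQ_CST);
	size_t trace_len;

	if (!b)
		return -1;	/* the kernel returns -ENOENT here */

	trace_len = b->nr * sizeof(uint64_t);
	memcpy(value, b->ip, trace_len);
	memset((char *)value + trace_len, 0, value_size - trace_len);

	/* re-attach; if a writer installed a new bucket meanwhile, the
	 * kernel code pushes the detached one back onto the freelist */
	__atomic_exchange_n(&slot, b, __ATOMIC_SEQ_CST);
	return 0;
}

int main(void)
{
	static struct bucket b = { .nr = 2, .ip = { 0x1234, 0x5678 } };
	uint64_t value[8];

	slot = &b;
	if (copy_bucket(value, sizeof(value)) == 0)
		printf("copied %u entries, ip[0]=0x%llx\n",
		       b.nr, (unsigned long long)value[0]);
	return 0;
}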
Diffstat (limited to 'kernel/bpf')
-rw-r--r--	kernel/bpf/stackmap.c	86
-rw-r--r--	kernel/bpf/syscall.c	2
2 files changed, 70 insertions, 18 deletions
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index f0a02c344358..499d9e933f8e 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -10,9 +10,10 @@
 #include <linux/vmalloc.h>
 #include <linux/stacktrace.h>
 #include <linux/perf_event.h>
+#include "percpu_freelist.h"
 
 struct stack_map_bucket {
-	struct rcu_head rcu;
+	struct pcpu_freelist_node fnode;
 	u32 hash;
 	u32 nr;
 	u64 ip[];
@@ -20,10 +21,34 @@ struct stack_map_bucket {
 
 struct bpf_stack_map {
 	struct bpf_map map;
+	void *elems;
+	struct pcpu_freelist freelist;
 	u32 n_buckets;
-	struct stack_map_bucket __rcu *buckets[];
+	struct stack_map_bucket *buckets[];
 };
 
+static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
+{
+	u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size;
+	int err;
+
+	smap->elems = vzalloc(elem_size * smap->map.max_entries);
+	if (!smap->elems)
+		return -ENOMEM;
+
+	err = pcpu_freelist_init(&smap->freelist);
+	if (err)
+		goto free_elems;
+
+	pcpu_freelist_populate(&smap->freelist, smap->elems, elem_size,
+			       smap->map.max_entries);
+	return 0;
+
+free_elems:
+	vfree(smap->elems);
+	return err;
+}
+
 /* Called from syscall */
 static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 {
@@ -70,12 +95,22 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 	smap->n_buckets = n_buckets;
 	smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
 
+	err = bpf_map_precharge_memlock(smap->map.pages);
+	if (err)
+		goto free_smap;
+
 	err = get_callchain_buffers();
 	if (err)
 		goto free_smap;
 
+	err = prealloc_elems_and_freelist(smap);
+	if (err)
+		goto put_buffers;
+
 	return &smap->map;
 
+put_buffers:
+	put_callchain_buffers();
 free_smap:
 	kvfree(smap);
 	return ERR_PTR(err);
@@ -121,7 +156,7 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
 	ips = trace->ip + skip + init_nr;
 	hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0);
 	id = hash & (smap->n_buckets - 1);
-	bucket = rcu_dereference(smap->buckets[id]);
+	bucket = READ_ONCE(smap->buckets[id]);
 
 	if (bucket && bucket->hash == hash) {
 		if (flags & BPF_F_FAST_STACK_CMP)
@@ -135,19 +170,18 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
 	if (bucket && !(flags & BPF_F_REUSE_STACKID))
 		return -EEXIST;
 
-	new_bucket = kmalloc(sizeof(struct stack_map_bucket) + map->value_size,
-			     GFP_ATOMIC | __GFP_NOWARN);
+	new_bucket = (struct stack_map_bucket *)
+		pcpu_freelist_pop(&smap->freelist);
 	if (unlikely(!new_bucket))
 		return -ENOMEM;
 
 	memcpy(new_bucket->ip, ips, trace_len);
-	memset(new_bucket->ip + trace_len / 8, 0, map->value_size - trace_len);
 	new_bucket->hash = hash;
 	new_bucket->nr = trace_nr;
 
 	old_bucket = xchg(&smap->buckets[id], new_bucket);
 	if (old_bucket)
-		kfree_rcu(old_bucket, rcu);
+		pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
 	return id;
 }
 
@@ -160,17 +194,34 @@ const struct bpf_func_proto bpf_get_stackid_proto = {
 	.arg3_type = ARG_ANYTHING,
 };
 
-/* Called from syscall or from eBPF program */
+/* Called from eBPF program */
 static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
 {
+	return NULL;
+}
+
+/* Called from syscall */
+int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
+{
 	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
-	struct stack_map_bucket *bucket;
-	u32 id = *(u32 *)key;
+	struct stack_map_bucket *bucket, *old_bucket;
+	u32 id = *(u32 *)key, trace_len;
 
 	if (unlikely(id >= smap->n_buckets))
-		return NULL;
-	bucket = rcu_dereference(smap->buckets[id]);
-	return bucket ? bucket->ip : NULL;
+		return -ENOENT;
+
+	bucket = xchg(&smap->buckets[id], NULL);
+	if (!bucket)
+		return -ENOENT;
+
+	trace_len = bucket->nr * sizeof(u64);
+	memcpy(value, bucket->ip, trace_len);
+	memset(value + trace_len, 0, map->value_size - trace_len);
+
+	old_bucket = xchg(&smap->buckets[id], bucket);
+	if (old_bucket)
+		pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
+	return 0;
 }
 
 static int stack_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
@@ -196,7 +247,7 @@ static int stack_map_delete_elem(struct bpf_map *map, void *key)
 
 	old_bucket = xchg(&smap->buckets[id], NULL);
 	if (old_bucket) {
-		kfree_rcu(old_bucket, rcu);
+		pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
 		return 0;
 	} else {
 		return -ENOENT;
@@ -207,13 +258,12 @@ static int stack_map_delete_elem(struct bpf_map *map, void *key)
 static void stack_map_free(struct bpf_map *map)
 {
 	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
-	int i;
 
+	/* wait for bpf programs to complete before freeing stack map */
 	synchronize_rcu();
 
-	for (i = 0; i < smap->n_buckets; i++)
-		if (smap->buckets[i])
-			kfree_rcu(smap->buckets[i], rcu);
+	vfree(smap->elems);
+	pcpu_freelist_destroy(&smap->freelist);
 	kvfree(smap);
 	put_callchain_buffers();
 }
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index cbd94b2144ff..2978d0d08869 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -290,6 +290,8 @@ static int map_lookup_elem(union bpf_attr *attr)
 		err = bpf_percpu_hash_copy(map, key, value);
 	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
 		err = bpf_percpu_array_copy(map, key, value);
+	} else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
+		err = bpf_stackmap_copy(map, key, value);
 	} else {
 		rcu_read_lock();
 		ptr = map->ops->map_lookup_elem(map, key);