author	Song Liu <songliubraving@fb.com>	2018-03-14 13:23:21 -0400
committer	Daniel Borkmann <daniel@iogearbox.net>	2018-03-14 20:09:28 -0400
commit	615755a77b2461ed78dfafb8a6649456201949c7 (patch)
tree	0b3144d3c705b3cc1128641df6ae65121b136e47 /kernel/bpf/stackmap.c
parent	6d8cb045cde681e64a5ed80a2ab70be831a7f9b0 (diff)
bpf: extend stackmap to save binary_build_id+offset instead of address
Currently, a bpf stackmap stores an address for each entry in the call trace. To map these addresses to user space files, it is necessary to maintain a mapping from these virtual addresses to symbols in the binary. Usually, the user space profiler (such as perf) has to scan /proc/pid/maps at the beginning of profiling and monitor mmap2() calls afterwards. Given the cost of maintaining the address map, this solution is not practical for system-wide profiling that is always on.

This patch tries to solve the problem with a variation of stackmap, enabled by the flag BPF_F_STACK_BUILD_ID. Instead of storing addresses, this variation stores ELF file build_id + offset.

Build ID is a 20-byte unique identifier for ELF files. The following command shows the Build ID of /bin/bash:

  [user@]$ readelf -n /bin/bash
  ...
  Build ID: XXXXXXXXXX
  ...

With BPF_F_STACK_BUILD_ID, bpf_get_stackid() tries to parse the Build ID for each entry in the call trace and translate it into the following struct:

  struct bpf_stack_build_id_offset {
          __s32           status;
          unsigned char   build_id[BPF_BUILD_ID_SIZE];
          union {
                  __u64   offset;
                  __u64   ip;
          };
  };

The search for the build_id is limited to the first page of the file, and this page should be in the page cache. Otherwise, we fall back to storing the ip for this entry (the ip field in struct bpf_stack_build_id_offset). This requires the build_id to be stored in the first page. A quick survey of binary and dynamic library files on a few different systems shows that almost all of them have the build_id in the first page.

The build_id is only meaningful for user stacks. If a kernel stack is added to a stackmap with BPF_F_STACK_BUILD_ID, it automatically falls back to storing only the ip (status == BPF_STACK_BUILD_ID_IP). Similarly, if the build_id lookup fails for some reason, it also falls back to storing the ip.

User space can access struct bpf_stack_build_id_offset with the bpf syscall BPF_MAP_LOOKUP_ELEM. It is necessary for user space to maintain a mapping from build_id to binary files. This mostly static mapping is much easier to maintain than per-process address maps.

Note: a stackmap with build_id only works in non-nmi context at this time. This is because we need to take mm->mmap_sem for find_vma(). If this changes, we would like to allow build_id lookup in nmi context.

Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
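To make the new interface concrete, below is a minimal user space sketch, not part of this patch. It only assumes the uapi additions from this series (BPF_F_STACK_BUILD_ID, the struct that appears as struct bpf_stack_build_id in the kernel code of this diff, and the BPF_STACK_BUILD_ID_* status values). It creates a stack trace map with the flag set and reads one bucket back with BPF_MAP_LOOKUP_ELEM, distinguishing valid build_id+offset entries from ip fallbacks; the key would normally come from a BPF program calling bpf_get_stackid() on this map.

/*
 * Minimal userspace sketch (not part of this patch): create a stackmap
 * with BPF_F_STACK_BUILD_ID and read one bucket back with
 * BPF_MAP_LOOKUP_ELEM. Assumes uapi headers that already contain the
 * definitions added by this series.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

#define MAX_DEPTH 127	/* must not exceed sysctl_perf_event_max_stack */

static int sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr, unsigned int size)
{
	return syscall(__NR_bpf, cmd, attr, size);
}

int main(void)
{
	union bpf_attr attr;
	struct bpf_stack_build_id stack[MAX_DEPTH];
	__u32 key = 0;	/* stack id returned by bpf_get_stackid() in a prog */
	int map_fd, i;

	/* value_size must be a multiple of sizeof(struct bpf_stack_build_id) */
	memset(&attr, 0, sizeof(attr));
	attr.map_type = BPF_MAP_TYPE_STACK_TRACE;
	attr.key_size = sizeof(__u32);
	attr.value_size = sizeof(struct bpf_stack_build_id) * MAX_DEPTH;
	attr.max_entries = 1024;
	attr.map_flags = BPF_F_STACK_BUILD_ID;

	map_fd = sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
	if (map_fd < 0) {
		perror("BPF_MAP_CREATE");
		return 1;
	}

	/* ... attach a BPF program that calls bpf_get_stackid() ... */

	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key = (__u64)(unsigned long)&key;
	attr.value = (__u64)(unsigned long)stack;
	if (sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr)) == 0) {
		for (i = 0; i < MAX_DEPTH; i++) {
			if (stack[i].status == BPF_STACK_BUILD_ID_EMPTY)
				break;	/* end of trace */
			if (stack[i].status == BPF_STACK_BUILD_ID_VALID)
				printf("build_id entry, offset 0x%llx\n",
				       (unsigned long long)stack[i].offset);
			else	/* BPF_STACK_BUILD_ID_IP: fallback */
				printf("ip 0x%llx\n",
				       (unsigned long long)stack[i].ip);
		}
	}

	close(map_fd);
	return 0;
}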
Diffstat (limited to 'kernel/bpf/stackmap.c')
-rw-r--r--	kernel/bpf/stackmap.c	257
1 file changed, 235 insertions(+), 22 deletions(-)
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index b0ecf43f5894..57eeb1234b67 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -9,16 +9,19 @@
 #include <linux/filter.h>
 #include <linux/stacktrace.h>
 #include <linux/perf_event.h>
+#include <linux/elf.h>
+#include <linux/pagemap.h>
 #include "percpu_freelist.h"
 
 #define STACK_CREATE_FLAG_MASK \
-	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY | \
+	 BPF_F_STACK_BUILD_ID)
 
 struct stack_map_bucket {
 	struct pcpu_freelist_node fnode;
 	u32 hash;
 	u32 nr;
-	u64 ip[];
+	u64 data[];
 };
 
 struct bpf_stack_map {
@@ -29,6 +32,17 @@ struct bpf_stack_map {
 	struct stack_map_bucket *buckets[];
 };
 
+static inline bool stack_map_use_build_id(struct bpf_map *map)
+{
+	return (map->map_flags & BPF_F_STACK_BUILD_ID);
+}
+
+static inline int stack_map_data_size(struct bpf_map *map)
+{
+	return stack_map_use_build_id(map) ?
+		sizeof(struct bpf_stack_build_id) : sizeof(u64);
+}
+
 static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
 {
 	u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size;
@@ -68,8 +82,16 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
-	    value_size < 8 || value_size % 8 ||
-	    value_size / 8 > sysctl_perf_event_max_stack)
+	    value_size < 8 || value_size % 8)
+		return ERR_PTR(-EINVAL);
+
+	BUILD_BUG_ON(sizeof(struct bpf_stack_build_id) % sizeof(u64));
+	if (attr->map_flags & BPF_F_STACK_BUILD_ID) {
+		if (value_size % sizeof(struct bpf_stack_build_id) ||
+		    value_size / sizeof(struct bpf_stack_build_id)
+		    > sysctl_perf_event_max_stack)
+			return ERR_PTR(-EINVAL);
+	} else if (value_size / 8 > sysctl_perf_event_max_stack)
 		return ERR_PTR(-EINVAL);
 
 	/* hash table size must be power of 2 */
@@ -114,13 +136,184 @@ free_smap:
 	return ERR_PTR(err);
 }
 
+#define BPF_BUILD_ID 3
+/*
+ * Parse build id from the note segment. This logic can be shared between
+ * 32-bit and 64-bit system, because Elf32_Nhdr and Elf64_Nhdr are
+ * identical.
+ */
+static inline int stack_map_parse_build_id(void *page_addr,
+					   unsigned char *build_id,
+					   void *note_start,
+					   Elf32_Word note_size)
+{
+	Elf32_Word note_offs = 0, new_offs;
+
+	/* check for overflow */
+	if (note_start < page_addr || note_start + note_size < note_start)
+		return -EINVAL;
+
+	/* only supports note that fits in the first page */
+	if (note_start + note_size > page_addr + PAGE_SIZE)
+		return -EINVAL;
+
+	while (note_offs + sizeof(Elf32_Nhdr) < note_size) {
+		Elf32_Nhdr *nhdr = (Elf32_Nhdr *)(note_start + note_offs);
+
+		if (nhdr->n_type == BPF_BUILD_ID &&
+		    nhdr->n_namesz == sizeof("GNU") &&
+		    nhdr->n_descsz == BPF_BUILD_ID_SIZE) {
+			memcpy(build_id,
+			       note_start + note_offs +
+			       ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr),
+			       BPF_BUILD_ID_SIZE);
+			return 0;
+		}
+		new_offs = note_offs + sizeof(Elf32_Nhdr) +
+			ALIGN(nhdr->n_namesz, 4) + ALIGN(nhdr->n_descsz, 4);
+		if (new_offs <= note_offs)	/* overflow */
+			break;
+		note_offs = new_offs;
+	}
+	return -EINVAL;
+}
+
+/* Parse build ID from 32-bit ELF */
+static int stack_map_get_build_id_32(void *page_addr,
+				     unsigned char *build_id)
+{
+	Elf32_Ehdr *ehdr = (Elf32_Ehdr *)page_addr;
+	Elf32_Phdr *phdr;
+	int i;
+
+	/* only supports phdr that fits in one page */
+	if (ehdr->e_phnum >
+	    (PAGE_SIZE - sizeof(Elf32_Ehdr)) / sizeof(Elf32_Phdr))
+		return -EINVAL;
+
+	phdr = (Elf32_Phdr *)(page_addr + sizeof(Elf32_Ehdr));
+
+	for (i = 0; i < ehdr->e_phnum; ++i)
+		if (phdr[i].p_type == PT_NOTE)
+			return stack_map_parse_build_id(page_addr, build_id,
+					page_addr + phdr[i].p_offset,
+					phdr[i].p_filesz);
+	return -EINVAL;
+}
+
+/* Parse build ID from 64-bit ELF */
+static int stack_map_get_build_id_64(void *page_addr,
+				     unsigned char *build_id)
+{
+	Elf64_Ehdr *ehdr = (Elf64_Ehdr *)page_addr;
+	Elf64_Phdr *phdr;
+	int i;
+
+	/* only supports phdr that fits in one page */
+	if (ehdr->e_phnum >
+	    (PAGE_SIZE - sizeof(Elf64_Ehdr)) / sizeof(Elf64_Phdr))
+		return -EINVAL;
+
+	phdr = (Elf64_Phdr *)(page_addr + sizeof(Elf64_Ehdr));
+
+	for (i = 0; i < ehdr->e_phnum; ++i)
+		if (phdr[i].p_type == PT_NOTE)
+			return stack_map_parse_build_id(page_addr, build_id,
+					page_addr + phdr[i].p_offset,
+					phdr[i].p_filesz);
+	return -EINVAL;
+}
+
+/* Parse build ID of ELF file mapped to vma */
+static int stack_map_get_build_id(struct vm_area_struct *vma,
+				  unsigned char *build_id)
+{
+	Elf32_Ehdr *ehdr;
+	struct page *page;
+	void *page_addr;
+	int ret;
+
+	/* only works for page backed storage */
+	if (!vma->vm_file)
+		return -EINVAL;
+
+	page = find_get_page(vma->vm_file->f_mapping, 0);
+	if (!page)
+		return -EFAULT;	/* page not mapped */
+
+	ret = -EINVAL;
+	page_addr = page_address(page);
+	ehdr = (Elf32_Ehdr *)page_addr;
+
+	/* compare magic x7f "ELF" */
+	if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0)
+		goto out;
+
+	/* only support executable file and shared object file */
+	if (ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN)
+		goto out;
+
+	if (ehdr->e_ident[EI_CLASS] == ELFCLASS32)
+		ret = stack_map_get_build_id_32(page_addr, build_id);
+	else if (ehdr->e_ident[EI_CLASS] == ELFCLASS64)
+		ret = stack_map_get_build_id_64(page_addr, build_id);
+out:
+	put_page(page);
+	return ret;
+}
+
+static void stack_map_get_build_id_offset(struct bpf_map *map,
+					  struct stack_map_bucket *bucket,
+					  u64 *ips, u32 trace_nr, bool user)
+{
+	int i;
+	struct vm_area_struct *vma;
+	struct bpf_stack_build_id *id_offs;
+
+	bucket->nr = trace_nr;
+	id_offs = (struct bpf_stack_build_id *)bucket->data;
+
+	/*
+	 * We cannot do up_read() in nmi context, so build_id lookup is
+	 * only supported for non-nmi events. If at some point, it is
+	 * possible to run find_vma() without taking the semaphore, we
+	 * would like to allow build_id lookup in nmi context.
+	 *
+	 * Same fallback is used for kernel stack (!user) on a stackmap
+	 * with build_id.
+	 */
+	if (!user || !current || !current->mm || in_nmi() ||
+	    down_read_trylock(&current->mm->mmap_sem) == 0) {
+		/* cannot access current->mm, fall back to ips */
+		for (i = 0; i < trace_nr; i++) {
+			id_offs[i].status = BPF_STACK_BUILD_ID_IP;
+			id_offs[i].ip = ips[i];
+		}
+		return;
+	}
+
+	for (i = 0; i < trace_nr; i++) {
+		vma = find_vma(current->mm, ips[i]);
+		if (!vma || stack_map_get_build_id(vma, id_offs[i].build_id)) {
+			/* per entry fall back to ips */
+			id_offs[i].status = BPF_STACK_BUILD_ID_IP;
+			id_offs[i].ip = ips[i];
+			continue;
+		}
+		id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ips[i]
+			- vma->vm_start;
+		id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
+	}
+	up_read(&current->mm->mmap_sem);
+}
+
 BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
 	   u64, flags)
 {
 	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
 	struct perf_callchain_entry *trace;
 	struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
-	u32 max_depth = map->value_size / 8;
+	u32 max_depth = map->value_size / stack_map_data_size(map);
 	/* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
 	u32 init_nr = sysctl_perf_event_max_stack - max_depth;
 	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
@@ -128,6 +321,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
 	bool user = flags & BPF_F_USER_STACK;
 	bool kernel = !user;
 	u64 *ips;
+	bool hash_matches;
 
 	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
 			       BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
@@ -156,24 +350,43 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
 	id = hash & (smap->n_buckets - 1);
 	bucket = READ_ONCE(smap->buckets[id]);
 
-	if (bucket && bucket->hash == hash) {
-		if (flags & BPF_F_FAST_STACK_CMP)
+	hash_matches = bucket && bucket->hash == hash;
+	/* fast cmp */
+	if (hash_matches && flags & BPF_F_FAST_STACK_CMP)
+		return id;
+
+	if (stack_map_use_build_id(map)) {
+		/* for build_id+offset, pop a bucket before slow cmp */
+		new_bucket = (struct stack_map_bucket *)
+			pcpu_freelist_pop(&smap->freelist);
+		if (unlikely(!new_bucket))
+			return -ENOMEM;
+		stack_map_get_build_id_offset(map, new_bucket, ips,
+					      trace_nr, user);
+		trace_len = trace_nr * sizeof(struct bpf_stack_build_id);
+		if (hash_matches && bucket->nr == trace_nr &&
+		    memcmp(bucket->data, new_bucket->data, trace_len) == 0) {
+			pcpu_freelist_push(&smap->freelist, &new_bucket->fnode);
 			return id;
-		if (bucket->nr == trace_nr &&
-		    memcmp(bucket->ip, ips, trace_len) == 0)
+		}
+		if (bucket && !(flags & BPF_F_REUSE_STACKID)) {
+			pcpu_freelist_push(&smap->freelist, &new_bucket->fnode);
+			return -EEXIST;
+		}
+	} else {
+		if (hash_matches && bucket->nr == trace_nr &&
+		    memcmp(bucket->data, ips, trace_len) == 0)
 			return id;
+		if (bucket && !(flags & BPF_F_REUSE_STACKID))
+			return -EEXIST;
+
+		new_bucket = (struct stack_map_bucket *)
+			pcpu_freelist_pop(&smap->freelist);
+		if (unlikely(!new_bucket))
+			return -ENOMEM;
+		memcpy(new_bucket->data, ips, trace_len);
 	}
 
-	/* this call stack is not in the map, try to add it */
-	if (bucket && !(flags & BPF_F_REUSE_STACKID))
-		return -EEXIST;
-
-	new_bucket = (struct stack_map_bucket *)
-		pcpu_freelist_pop(&smap->freelist);
-	if (unlikely(!new_bucket))
-		return -ENOMEM;
-
-	memcpy(new_bucket->ip, ips, trace_len);
 	new_bucket->hash = hash;
 	new_bucket->nr = trace_nr;
 
@@ -212,8 +425,8 @@ int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
 	if (!bucket)
 		return -ENOENT;
 
-	trace_len = bucket->nr * sizeof(u64);
-	memcpy(value, bucket->ip, trace_len);
+	trace_len = bucket->nr * stack_map_data_size(map);
+	memcpy(value, bucket->data, trace_len);
 	memset(value + trace_len, 0, map->value_size - trace_len);
 
 	old_bucket = xchg(&smap->buckets[id], bucket);
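For entries marked BPF_STACK_BUILD_ID_VALID, the stored offset is a file offset in the mapped ELF, computed above as (vma->vm_pgoff << PAGE_SHIFT) + ips[i] - vma->vm_start, so user space can symbolize against the on-disk binary it locates via the build_id. Below is a small sketch of that last step; the /usr/lib/debug/.build-id/ path convention is an assumption about the host's debuginfo layout, not something this patch mandates.

/*
 * Sketch (not part of this patch): turn one BPF_STACK_BUILD_ID_VALID
 * entry into something a symbolizer can use. The 20-byte build_id is
 * printed as hex and mapped to a file via the common debuginfo layout
 * (/usr/lib/debug/.build-id/xx/yyyy...); distro layouts may differ.
 */
#include <stdio.h>
#include <linux/bpf.h>

void print_debug_path(const struct bpf_stack_build_id *ent)
{
	char hex[2 * BPF_BUILD_ID_SIZE + 1];
	int i;

	for (i = 0; i < BPF_BUILD_ID_SIZE; i++)
		sprintf(hex + 2 * i, "%02x", ent->build_id[i]);

	/* first byte becomes the directory, the rest the file name */
	printf("/usr/lib/debug/.build-id/%.2s/%s.debug +0x%llx\n",
	       hex, hex + 2, (unsigned long long)ent->offset);
}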