author	Song Liu <songliubraving@fb.com>	2018-03-14 13:23:21 -0400
committer	Daniel Borkmann <daniel@iogearbox.net>	2018-03-14 20:09:28 -0400
commit	615755a77b2461ed78dfafb8a6649456201949c7 (patch)
tree	0b3144d3c705b3cc1128641df6ae65121b136e47 /kernel/bpf/stackmap.c
parent	6d8cb045cde681e64a5ed80a2ab70be831a7f9b0 (diff)
bpf: extend stackmap to save binary_build_id+offset instead of address
Currently, a bpf stackmap stores an address for each entry in the call trace. To map these addresses to user space files, it is necessary to maintain a mapping from these virtual addresses to symbols in the binary. Usually, the user space profiler (such as perf) has to scan /proc/pid/maps at the beginning of profiling and monitor mmap2() calls afterwards. Given the cost of maintaining the address map, this solution is not practical for system-wide profiling that is always on.

This patch tries to solve the problem with a variation of stackmap, enabled by the flag BPF_F_STACK_BUILD_ID. Instead of storing addresses, this variation stores ELF file build_id + offset.

Build ID is a 20-byte unique identifier for ELF files. The following command shows the Build ID of /bin/bash:

  [user@]$ readelf -n /bin/bash
  ...
  Build ID: XXXXXXXXXX
  ...

With BPF_F_STACK_BUILD_ID, bpf_get_stackid() tries to parse the Build ID for each entry in the call trace and translate it into the following struct:

  struct bpf_stack_build_id_offset {
          __s32           status;
          unsigned char   build_id[BPF_BUILD_ID_SIZE];
          union {
                  __u64   offset;
                  __u64   ip;
          };
  };

The search for the build_id is limited to the first page of the file, and this page should be in the page cache. Otherwise, we fall back to storing the ip for this entry (the ip field in struct bpf_stack_build_id_offset). This requires the build_id to be stored in the first page. A quick survey of binary and dynamic library files on a few different systems shows that almost all of them have the build_id in the first page.

The build_id is only meaningful for user stacks. If a kernel stack is added to a stackmap with BPF_F_STACK_BUILD_ID, it automatically falls back to storing only the ip (status == BPF_STACK_BUILD_ID_IP). Similarly, if the build_id lookup fails for some reason, it also falls back to storing the ip.

User space can access struct bpf_stack_build_id_offset with the bpf syscall BPF_MAP_LOOKUP_ELEM. It is necessary for user space to maintain a mapping from build_id to binary files. This mostly static mapping is much easier to maintain than per-process address maps.

Note: a stackmap with build_id only works in non-nmi context at this time. This is because we need to take mm->mmap_sem for find_vma(). If this changes, we would like to allow build_id lookup in nmi context.

Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
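To make the new interface concrete, below is a minimal user space sketch, not part of this patch. It only assumes the uapi additions from this series (BPF_F_STACK_BUILD_ID, the struct that appears as struct bpf_stack_build_id in the kernel code of this diff, and the BPF_STACK_BUILD_ID_* status values). It creates a stack trace map with the flag set and reads one bucket back with BPF_MAP_LOOKUP_ELEM, distinguishing valid build_id+offset entries from ip fallbacks; the key would normally come from a BPF program calling bpf_get_stackid() on this map.

/*
 * Minimal userspace sketch (not part of this patch): create a stackmap
 * with BPF_F_STACK_BUILD_ID and read one bucket back with
 * BPF_MAP_LOOKUP_ELEM. Assumes uapi headers that already contain the
 * definitions added by this series.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

#define MAX_DEPTH 127	/* must not exceed sysctl_perf_event_max_stack */

static int sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr, unsigned int size)
{
	return syscall(__NR_bpf, cmd, attr, size);
}

int main(void)
{
	union bpf_attr attr;
	struct bpf_stack_build_id stack[MAX_DEPTH];
	__u32 key = 0;	/* stack id returned by bpf_get_stackid() in a prog */
	int map_fd, i;

	/* value_size must be a multiple of sizeof(struct bpf_stack_build_id) */
	memset(&attr, 0, sizeof(attr));
	attr.map_type = BPF_MAP_TYPE_STACK_TRACE;
	attr.key_size = sizeof(__u32);
	attr.value_size = sizeof(struct bpf_stack_build_id) * MAX_DEPTH;
	attr.max_entries = 1024;
	attr.map_flags = BPF_F_STACK_BUILD_ID;

	map_fd = sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
	if (map_fd < 0) {
		perror("BPF_MAP_CREATE");
		return 1;
	}

	/* ... attach a BPF program that calls bpf_get_stackid() ... */

	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key = (__u64)(unsigned long)&key;
	attr.value = (__u64)(unsigned long)stack;
	if (sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr)) == 0) {
		for (i = 0; i < MAX_DEPTH; i++) {
			if (stack[i].status == BPF_STACK_BUILD_ID_EMPTY)
				break;	/* end of trace */
			if (stack[i].status == BPF_STACK_BUILD_ID_VALID)
				printf("build_id entry, offset 0x%llx\n",
				       (unsigned long long)stack[i].offset);
			else	/* BPF_STACK_BUILD_ID_IP: fallback */
				printf("ip 0x%llx\n",
				       (unsigned long long)stack[i].ip);
		}
	}

	close(map_fd);
	return 0;
}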
Diffstat (limited to 'kernel/bpf/stackmap.c')
-rw-r--r--	kernel/bpf/stackmap.c	257
1 file changed, 235 insertions(+), 22 deletions(-)
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index b0ecf43f5894..57eeb1234b67 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -9,16 +9,19 @@
 #include <linux/filter.h>
 #include <linux/stacktrace.h>
 #include <linux/perf_event.h>
+#include <linux/elf.h>
+#include <linux/pagemap.h>
 #include "percpu_freelist.h"
 
 #define STACK_CREATE_FLAG_MASK \
-	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY | \
+	 BPF_F_STACK_BUILD_ID)
 
 struct stack_map_bucket {
 	struct pcpu_freelist_node fnode;
 	u32 hash;
 	u32 nr;
-	u64 ip[];
+	u64 data[];
 };
 
 struct bpf_stack_map {
@@ -29,6 +32,17 @@ struct bpf_stack_map {
 	struct stack_map_bucket *buckets[];
 };
 
+static inline bool stack_map_use_build_id(struct bpf_map *map)
+{
+	return (map->map_flags & BPF_F_STACK_BUILD_ID);
+}
+
+static inline int stack_map_data_size(struct bpf_map *map)
+{
+	return stack_map_use_build_id(map) ?
+		sizeof(struct bpf_stack_build_id) : sizeof(u64);
+}
+
 static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
 {
 	u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size;
@@ -68,8 +82,16 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
-	    value_size < 8 || value_size % 8 ||
-	    value_size / 8 > sysctl_perf_event_max_stack)
+	    value_size < 8 || value_size % 8)
+		return ERR_PTR(-EINVAL);
+
+	BUILD_BUG_ON(sizeof(struct bpf_stack_build_id) % sizeof(u64));
+	if (attr->map_flags & BPF_F_STACK_BUILD_ID) {
+		if (value_size % sizeof(struct bpf_stack_build_id) ||
+		    value_size / sizeof(struct bpf_stack_build_id)
+		    > sysctl_perf_event_max_stack)
+			return ERR_PTR(-EINVAL);
+	} else if (value_size / 8 > sysctl_perf_event_max_stack)
 		return ERR_PTR(-EINVAL);
 
 	/* hash table size must be power of 2 */
@@ -114,13 +136,184 @@ free_smap:
 	return ERR_PTR(err);
 }
 
+#define BPF_BUILD_ID 3
+/*
+ * Parse build id from the note segment. This logic can be shared between
+ * 32-bit and 64-bit system, because Elf32_Nhdr and Elf64_Nhdr are
+ * identical.
+ */
+static inline int stack_map_parse_build_id(void *page_addr,
+					   unsigned char *build_id,
+					   void *note_start,
+					   Elf32_Word note_size)
+{
+	Elf32_Word note_offs = 0, new_offs;
+
+	/* check for overflow */
+	if (note_start < page_addr || note_start + note_size < note_start)
+		return -EINVAL;
+
+	/* only supports note that fits in the first page */
+	if (note_start + note_size > page_addr + PAGE_SIZE)
+		return -EINVAL;
+
+	while (note_offs + sizeof(Elf32_Nhdr) < note_size) {
+		Elf32_Nhdr *nhdr = (Elf32_Nhdr *)(note_start + note_offs);
+
+		if (nhdr->n_type == BPF_BUILD_ID &&
+		    nhdr->n_namesz == sizeof("GNU") &&
+		    nhdr->n_descsz == BPF_BUILD_ID_SIZE) {
+			memcpy(build_id,
+			       note_start + note_offs +
+			       ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr),
+			       BPF_BUILD_ID_SIZE);
+			return 0;
+		}
+		new_offs = note_offs + sizeof(Elf32_Nhdr) +
+			ALIGN(nhdr->n_namesz, 4) + ALIGN(nhdr->n_descsz, 4);
+		if (new_offs <= note_offs)	/* overflow */
+			break;
+		note_offs = new_offs;
+	}
+	return -EINVAL;
+}
+
+/* Parse build ID from 32-bit ELF */
+static int stack_map_get_build_id_32(void *page_addr,
+				     unsigned char *build_id)
+{
+	Elf32_Ehdr *ehdr = (Elf32_Ehdr *)page_addr;
+	Elf32_Phdr *phdr;
+	int i;
+
+	/* only supports phdr that fits in one page */
+	if (ehdr->e_phnum >
+	    (PAGE_SIZE - sizeof(Elf32_Ehdr)) / sizeof(Elf32_Phdr))
+		return -EINVAL;
+
+	phdr = (Elf32_Phdr *)(page_addr + sizeof(Elf32_Ehdr));
+
+	for (i = 0; i < ehdr->e_phnum; ++i)
+		if (phdr[i].p_type == PT_NOTE)
+			return stack_map_parse_build_id(page_addr, build_id,
+					page_addr + phdr[i].p_offset,
+					phdr[i].p_filesz);
+	return -EINVAL;
+}
+
+/* Parse build ID from 64-bit ELF */
+static int stack_map_get_build_id_64(void *page_addr,
+				     unsigned char *build_id)
+{
+	Elf64_Ehdr *ehdr = (Elf64_Ehdr *)page_addr;
+	Elf64_Phdr *phdr;
+	int i;
+
+	/* only supports phdr that fits in one page */
+	if (ehdr->e_phnum >
+	    (PAGE_SIZE - sizeof(Elf64_Ehdr)) / sizeof(Elf64_Phdr))
+		return -EINVAL;
+
+	phdr = (Elf64_Phdr *)(page_addr + sizeof(Elf64_Ehdr));
+
+	for (i = 0; i < ehdr->e_phnum; ++i)
+		if (phdr[i].p_type == PT_NOTE)
+			return stack_map_parse_build_id(page_addr, build_id,
+					page_addr + phdr[i].p_offset,
+					phdr[i].p_filesz);
+	return -EINVAL;
+}
+
+/* Parse build ID of ELF file mapped to vma */
+static int stack_map_get_build_id(struct vm_area_struct *vma,
+				  unsigned char *build_id)
+{
+	Elf32_Ehdr *ehdr;
+	struct page *page;
+	void *page_addr;
+	int ret;
+
+	/* only works for page backed storage */
+	if (!vma->vm_file)
+		return -EINVAL;
+
+	page = find_get_page(vma->vm_file->f_mapping, 0);
+	if (!page)
+		return -EFAULT;	/* page not mapped */
+
+	ret = -EINVAL;
+	page_addr = page_address(page);
+	ehdr = (Elf32_Ehdr *)page_addr;
+
+	/* compare magic x7f "ELF" */
+	if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0)
+		goto out;
+
+	/* only support executable file and shared object file */
+	if (ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN)
+		goto out;
+
+	if (ehdr->e_ident[EI_CLASS] == ELFCLASS32)
+		ret = stack_map_get_build_id_32(page_addr, build_id);
+	else if (ehdr->e_ident[EI_CLASS] == ELFCLASS64)
+		ret = stack_map_get_build_id_64(page_addr, build_id);
+out:
+	put_page(page);
+	return ret;
+}
+
+static void stack_map_get_build_id_offset(struct bpf_map *map,
+					  struct stack_map_bucket *bucket,
+					  u64 *ips, u32 trace_nr, bool user)
+{
+	int i;
+	struct vm_area_struct *vma;
+	struct bpf_stack_build_id *id_offs;
+
+	bucket->nr = trace_nr;
+	id_offs = (struct bpf_stack_build_id *)bucket->data;
+
+	/*
+	 * We cannot do up_read() in nmi context, so build_id lookup is
+	 * only supported for non-nmi events. If at some point, it is
+	 * possible to run find_vma() without taking the semaphore, we
+	 * would like to allow build_id lookup in nmi context.
+	 *
+	 * Same fallback is used for kernel stack (!user) on a stackmap
+	 * with build_id.
+	 */
+	if (!user || !current || !current->mm || in_nmi() ||
+	    down_read_trylock(&current->mm->mmap_sem) == 0) {
+		/* cannot access current->mm, fall back to ips */
+		for (i = 0; i < trace_nr; i++) {
+			id_offs[i].status = BPF_STACK_BUILD_ID_IP;
+			id_offs[i].ip = ips[i];
+		}
+		return;
+	}
+
+	for (i = 0; i < trace_nr; i++) {
+		vma = find_vma(current->mm, ips[i]);
+		if (!vma || stack_map_get_build_id(vma, id_offs[i].build_id)) {
+			/* per entry fall back to ips */
+			id_offs[i].status = BPF_STACK_BUILD_ID_IP;
+			id_offs[i].ip = ips[i];
+			continue;
+		}
+		id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ips[i]
+			- vma->vm_start;
+		id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
+	}
+	up_read(&current->mm->mmap_sem);
+}
+
 BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
 	   u64, flags)
 {
 	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
 	struct perf_callchain_entry *trace;
 	struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
-	u32 max_depth = map->value_size / 8;
+	u32 max_depth = map->value_size / stack_map_data_size(map);
 	/* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
 	u32 init_nr = sysctl_perf_event_max_stack - max_depth;
 	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
@@ -128,6 +321,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
 	bool user = flags & BPF_F_USER_STACK;
 	bool kernel = !user;
 	u64 *ips;
+	bool hash_matches;
 
 	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
 			       BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
@@ -156,24 +350,43 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
 	id = hash & (smap->n_buckets - 1);
 	bucket = READ_ONCE(smap->buckets[id]);
 
-	if (bucket && bucket->hash == hash) {
-		if (flags & BPF_F_FAST_STACK_CMP)
+	hash_matches = bucket && bucket->hash == hash;
+	/* fast cmp */
+	if (hash_matches && flags & BPF_F_FAST_STACK_CMP)
+		return id;
+
+	if (stack_map_use_build_id(map)) {
+		/* for build_id+offset, pop a bucket before slow cmp */
+		new_bucket = (struct stack_map_bucket *)
+			pcpu_freelist_pop(&smap->freelist);
+		if (unlikely(!new_bucket))
+			return -ENOMEM;
+		stack_map_get_build_id_offset(map, new_bucket, ips,
+					      trace_nr, user);
+		trace_len = trace_nr * sizeof(struct bpf_stack_build_id);
+		if (hash_matches && bucket->nr == trace_nr &&
+		    memcmp(bucket->data, new_bucket->data, trace_len) == 0) {
+			pcpu_freelist_push(&smap->freelist, &new_bucket->fnode);
 			return id;
-		if (bucket->nr == trace_nr &&
-		    memcmp(bucket->ip, ips, trace_len) == 0)
+		}
+		if (bucket && !(flags & BPF_F_REUSE_STACKID)) {
+			pcpu_freelist_push(&smap->freelist, &new_bucket->fnode);
+			return -EEXIST;
+		}
+	} else {
+		if (hash_matches && bucket->nr == trace_nr &&
+		    memcmp(bucket->data, ips, trace_len) == 0)
 			return id;
+		if (bucket && !(flags & BPF_F_REUSE_STACKID))
+			return -EEXIST;
+
+		new_bucket = (struct stack_map_bucket *)
+			pcpu_freelist_pop(&smap->freelist);
+		if (unlikely(!new_bucket))
+			return -ENOMEM;
+		memcpy(new_bucket->data, ips, trace_len);
 	}
 
-	/* this call stack is not in the map, try to add it */
-	if (bucket && !(flags & BPF_F_REUSE_STACKID))
-		return -EEXIST;
-
-	new_bucket = (struct stack_map_bucket *)
-		pcpu_freelist_pop(&smap->freelist);
-	if (unlikely(!new_bucket))
-		return -ENOMEM;
-
-	memcpy(new_bucket->ip, ips, trace_len);
 	new_bucket->hash = hash;
 	new_bucket->nr = trace_nr;
 
@@ -212,8 +425,8 @@ int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
 	if (!bucket)
 		return -ENOENT;
 
-	trace_len = bucket->nr * sizeof(u64);
-	memcpy(value, bucket->ip, trace_len);
+	trace_len = bucket->nr * stack_map_data_size(map);
+	memcpy(value, bucket->data, trace_len);
 	memset(value + trace_len, 0, map->value_size - trace_len);
 
 	old_bucket = xchg(&smap->buckets[id], bucket);
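For entries marked BPF_STACK_BUILD_ID_VALID, the stored offset is a file offset in the mapped ELF, computed above as (vma->vm_pgoff << PAGE_SHIFT) + ips[i] - vma->vm_start, so user space can symbolize against the on-disk binary it locates via the build_id. Below is a small sketch of that last step; the /usr/lib/debug/.build-id/ path convention is an assumption about the host's debuginfo layout, not something this patch mandates.

/*
 * Sketch (not part of this patch): turn one BPF_STACK_BUILD_ID_VALID
 * entry into something a symbolizer can use. The 20-byte build_id is
 * printed as hex and mapped to a file via the common debuginfo layout
 * (/usr/lib/debug/.build-id/xx/yyyy...); distro layouts may differ.
 */
#include <stdio.h>
#include <linux/bpf.h>

void print_debug_path(const struct bpf_stack_build_id *ent)
{
	char hex[2 * BPF_BUILD_ID_SIZE + 1];
	int i;

	for (i = 0; i < BPF_BUILD_ID_SIZE; i++)
		sprintf(hex + 2 * i, "%02x", ent->build_id[i]);

	/* first byte becomes the directory, the rest the file name */
	printf("/usr/lib/debug/.build-id/%.2s/%s.debug +0x%llx\n",
	       hex, hex + 2, (unsigned long long)ent->offset);
}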