aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
author: Daniel Borkmann <daniel@iogearbox.net> 2019-04-09 17:20:03 -0400
committer: Alexei Starovoitov <ast@kernel.org> 2019-04-09 20:05:46 -0400
commit: d8eca5bbb2be9bc7546f9e733786fa2f1a594c67 (patch)
tree: 2849428915f4a9604fe11b4c9422627d6b127716 /kernel
parent: ff466b58055f2d28d8ddc1388af312e87a693efe (diff)
bpf: implement lookup-free direct value access for maps
This generic extension to BPF maps allows for directly loading an address residing inside a BPF map value with a single BPF ldimm64 instruction! The idea is similar to what BPF_PSEUDO_MAP_FD does today, which is a special src_reg flag for the ldimm64 instruction that indicates that the first part of the double insn's imm field holds a file descriptor, which the verifier then replaces with the full 64-bit address of the map, spread across both imm parts. For the newly added BPF_PSEUDO_MAP_VALUE src_reg flag, the idea is the following: the first part of the double insn's imm field is again a file descriptor corresponding to the map, and the second part of the imm field is an offset into the value. The verifier will then replace both imm parts with an address that points into the BPF map value at the given value offset for maps that support this operation. Currently supported is an array map with a single entry. It is possible to support more than just a single map element by reusing both 16-bit off fields of the insns as a map index, so a full array map lookup could be expressed that way. It hasn't been implemented here due to lack of a concrete use case, but could easily be done in the future in a compatible way, since both off fields right now have to be 0 and would correctly denote a map index of 0. BPF_PSEUDO_MAP_VALUE is a distinct flag because otherwise, with BPF_PSEUDO_MAP_FD alone, we could not distinguish a load of the map pointer from a load of the map's value at offset 0, and changing BPF_PSEUDO_MAP_FD's encoding to be off by one in order to distinguish a regular map pointer from a map value pointer would add unnecessary complexity and increase the barrier for debuggability, and is thus less suitable. Using the second part of the imm field as an offset into the value does /not/ come with limitations since the maximum possible value size is in the u32 universe anyway.
This optimization allows for efficiently retrieving an address to a map value memory area without having to issue a helper call which needs to prepare registers according to the calling convention, etc., without needing the extra NULL test, and without having to add the offset in an additional instruction to the value base pointer. The verifier then treats the destination register as PTR_TO_MAP_VALUE with constant reg->off taken from the user-passed offset in the second imm field, and guarantees that this is within bounds of the map value. Any subsequent operations are treated as typical map value handling without anything extra needed from the verification side. The two map operations for direct value access have been added to the array map for now. In the future, other types could be supported as well depending on the use case. The main use case for this commit is to allow for BPF loader support for global variables that reside in .data/.rodata/.bss sections such that we can directly load their address with minimal additional infrastructure required. Loader support has been added in subsequent commits for the libbpf library. Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Diffstat (limited to 'kernel')
-rw-r--r-- kernel/bpf/arraymap.c 32
-rw-r--r-- kernel/bpf/core.c 3
-rw-r--r-- kernel/bpf/disasm.c 5
-rw-r--r-- kernel/bpf/syscall.c 28
-rw-r--r-- kernel/bpf/verifier.c 86
5 files changed, 124 insertions, 30 deletions
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index c72e0d8e1e65..1a6e9861d554 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -160,6 +160,36 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key)
160 return array->value + array->elem_size * (index & array->index_mask); 160 return array->value + array->elem_size * (index & array->index_mask);
161} 161}
162 162
163static int array_map_direct_value_addr(const struct bpf_map *map, u64 *imm,
164 u32 off)
165{
166 struct bpf_array *array = container_of(map, struct bpf_array, map);
167
168 if (map->max_entries != 1)
169 return -ENOTSUPP;
170 if (off >= map->value_size)
171 return -EINVAL;
172
173 *imm = (unsigned long)array->value;
174 return 0;
175}
176
177static int array_map_direct_value_meta(const struct bpf_map *map, u64 imm,
178 u32 *off)
179{
180 struct bpf_array *array = container_of(map, struct bpf_array, map);
181 u64 base = (unsigned long)array->value;
182 u64 range = array->elem_size;
183
184 if (map->max_entries != 1)
185 return -ENOTSUPP;
186 if (imm < base || imm >= base + range)
187 return -ENOENT;
188
189 *off = imm - base;
190 return 0;
191}
192
163/* emit BPF instructions equivalent to C code of array_map_lookup_elem() */ 193/* emit BPF instructions equivalent to C code of array_map_lookup_elem() */
164static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) 194static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
165{ 195{
@@ -419,6 +449,8 @@ const struct bpf_map_ops array_map_ops = {
419 .map_update_elem = array_map_update_elem, 449 .map_update_elem = array_map_update_elem,
420 .map_delete_elem = array_map_delete_elem, 450 .map_delete_elem = array_map_delete_elem,
421 .map_gen_lookup = array_map_gen_lookup, 451 .map_gen_lookup = array_map_gen_lookup,
452 .map_direct_value_addr = array_map_direct_value_addr,
453 .map_direct_value_meta = array_map_direct_value_meta,
422 .map_seq_show_elem = array_map_seq_show_elem, 454 .map_seq_show_elem = array_map_seq_show_elem,
423 .map_check_btf = array_map_check_btf, 455 .map_check_btf = array_map_check_btf,
424}; 456};
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 2966cb368bf4..ace8c22c8b0e 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -292,7 +292,8 @@ int bpf_prog_calc_tag(struct bpf_prog *fp)
292 dst[i] = fp->insnsi[i]; 292 dst[i] = fp->insnsi[i];
293 if (!was_ld_map && 293 if (!was_ld_map &&
294 dst[i].code == (BPF_LD | BPF_IMM | BPF_DW) && 294 dst[i].code == (BPF_LD | BPF_IMM | BPF_DW) &&
295 dst[i].src_reg == BPF_PSEUDO_MAP_FD) { 295 (dst[i].src_reg == BPF_PSEUDO_MAP_FD ||
296 dst[i].src_reg == BPF_PSEUDO_MAP_VALUE)) {
296 was_ld_map = true; 297 was_ld_map = true;
297 dst[i].imm = 0; 298 dst[i].imm = 0;
298 } else if (was_ld_map && 299 } else if (was_ld_map &&
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
index de73f55e42fd..d9ce383c0f9c 100644
--- a/kernel/bpf/disasm.c
+++ b/kernel/bpf/disasm.c
@@ -205,10 +205,11 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
205 * part of the ldimm64 insn is accessible. 205 * part of the ldimm64 insn is accessible.
206 */ 206 */
207 u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; 207 u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
208 bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD; 208 bool is_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD ||
209 insn->src_reg == BPF_PSEUDO_MAP_VALUE;
209 char tmp[64]; 210 char tmp[64];
210 211
211 if (map_ptr && !allow_ptr_leaks) 212 if (is_ptr && !allow_ptr_leaks)
212 imm = 0; 213 imm = 0;
213 214
214 verbose(cbs->private_data, "(%02x) r%d = %s\n", 215 verbose(cbs->private_data, "(%02x) r%d = %s\n",
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 1d65e56594db..828518bb947b 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2072,13 +2072,26 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
2072} 2072}
2073 2073
2074static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog, 2074static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog,
2075 unsigned long addr) 2075 unsigned long addr, u32 *off,
2076 u32 *type)
2076{ 2077{
2078 const struct bpf_map *map;
2077 int i; 2079 int i;
2078 2080
2079 for (i = 0; i < prog->aux->used_map_cnt; i++) 2081 for (i = 0, *off = 0; i < prog->aux->used_map_cnt; i++) {
2080 if (prog->aux->used_maps[i] == (void *)addr) 2082 map = prog->aux->used_maps[i];
2081 return prog->aux->used_maps[i]; 2083 if (map == (void *)addr) {
2084 *type = BPF_PSEUDO_MAP_FD;
2085 return map;
2086 }
2087 if (!map->ops->map_direct_value_meta)
2088 continue;
2089 if (!map->ops->map_direct_value_meta(map, addr, off)) {
2090 *type = BPF_PSEUDO_MAP_VALUE;
2091 return map;
2092 }
2093 }
2094
2082 return NULL; 2095 return NULL;
2083} 2096}
2084 2097
@@ -2086,6 +2099,7 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog)
2086{ 2099{
2087 const struct bpf_map *map; 2100 const struct bpf_map *map;
2088 struct bpf_insn *insns; 2101 struct bpf_insn *insns;
2102 u32 off, type;
2089 u64 imm; 2103 u64 imm;
2090 int i; 2104 int i;
2091 2105
@@ -2113,11 +2127,11 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog)
2113 continue; 2127 continue;
2114 2128
2115 imm = ((u64)insns[i + 1].imm << 32) | (u32)insns[i].imm; 2129 imm = ((u64)insns[i + 1].imm << 32) | (u32)insns[i].imm;
2116 map = bpf_map_from_imm(prog, imm); 2130 map = bpf_map_from_imm(prog, imm, &off, &type);
2117 if (map) { 2131 if (map) {
2118 insns[i].src_reg = BPF_PSEUDO_MAP_FD; 2132 insns[i].src_reg = type;
2119 insns[i].imm = map->id; 2133 insns[i].imm = map->id;
2120 insns[i + 1].imm = 0; 2134 insns[i + 1].imm = off;
2121 continue; 2135 continue;
2122 } 2136 }
2123 } 2137 }
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 48718e1da16d..6ab7a23fc924 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5056,18 +5056,12 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
5056 return 0; 5056 return 0;
5057} 5057}
5058 5058
5059/* return the map pointer stored inside BPF_LD_IMM64 instruction */
5060static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn)
5061{
5062 u64 imm64 = ((u64) (u32) insn[0].imm) | ((u64) (u32) insn[1].imm) << 32;
5063
5064 return (struct bpf_map *) (unsigned long) imm64;
5065}
5066
5067/* verify BPF_LD_IMM64 instruction */ 5059/* verify BPF_LD_IMM64 instruction */
5068static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) 5060static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
5069{ 5061{
5062 struct bpf_insn_aux_data *aux = cur_aux(env);
5070 struct bpf_reg_state *regs = cur_regs(env); 5063 struct bpf_reg_state *regs = cur_regs(env);
5064 struct bpf_map *map;
5071 int err; 5065 int err;
5072 5066
5073 if (BPF_SIZE(insn->code) != BPF_DW) { 5067 if (BPF_SIZE(insn->code) != BPF_DW) {
@@ -5091,11 +5085,22 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
5091 return 0; 5085 return 0;
5092 } 5086 }
5093 5087
5094 /* replace_map_fd_with_map_ptr() should have caught bad ld_imm64 */ 5088 map = env->used_maps[aux->map_index];
5095 BUG_ON(insn->src_reg != BPF_PSEUDO_MAP_FD); 5089 mark_reg_known_zero(env, regs, insn->dst_reg);
5090 regs[insn->dst_reg].map_ptr = map;
5091
5092 if (insn->src_reg == BPF_PSEUDO_MAP_VALUE) {
5093 regs[insn->dst_reg].type = PTR_TO_MAP_VALUE;
5094 regs[insn->dst_reg].off = aux->map_off;
5095 if (map_value_has_spin_lock(map))
5096 regs[insn->dst_reg].id = ++env->id_gen;
5097 } else if (insn->src_reg == BPF_PSEUDO_MAP_FD) {
5098 regs[insn->dst_reg].type = CONST_PTR_TO_MAP;
5099 } else {
5100 verbose(env, "bpf verifier is misconfigured\n");
5101 return -EINVAL;
5102 }
5096 5103
5097 regs[insn->dst_reg].type = CONST_PTR_TO_MAP;
5098 regs[insn->dst_reg].map_ptr = ld_imm64_to_map_ptr(insn);
5099 return 0; 5104 return 0;
5100} 5105}
5101 5106
@@ -6803,8 +6808,10 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
6803 } 6808 }
6804 6809
6805 if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) { 6810 if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) {
6811 struct bpf_insn_aux_data *aux;
6806 struct bpf_map *map; 6812 struct bpf_map *map;
6807 struct fd f; 6813 struct fd f;
6814 u64 addr;
6808 6815
6809 if (i == insn_cnt - 1 || insn[1].code != 0 || 6816 if (i == insn_cnt - 1 || insn[1].code != 0 ||
6810 insn[1].dst_reg != 0 || insn[1].src_reg != 0 || 6817 insn[1].dst_reg != 0 || insn[1].src_reg != 0 ||
@@ -6813,13 +6820,19 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
6813 return -EINVAL; 6820 return -EINVAL;
6814 } 6821 }
6815 6822
6816 if (insn->src_reg == 0) 6823 if (insn[0].src_reg == 0)
6817 /* valid generic load 64-bit imm */ 6824 /* valid generic load 64-bit imm */
6818 goto next_insn; 6825 goto next_insn;
6819 6826
6820 if (insn[0].src_reg != BPF_PSEUDO_MAP_FD || 6827 /* In final convert_pseudo_ld_imm64() step, this is
6821 insn[1].imm != 0) { 6828 * converted into regular 64-bit imm load insn.
6822 verbose(env, "unrecognized bpf_ld_imm64 insn\n"); 6829 */
6830 if ((insn[0].src_reg != BPF_PSEUDO_MAP_FD &&
6831 insn[0].src_reg != BPF_PSEUDO_MAP_VALUE) ||
6832 (insn[0].src_reg == BPF_PSEUDO_MAP_FD &&
6833 insn[1].imm != 0)) {
6834 verbose(env,
6835 "unrecognized bpf_ld_imm64 insn\n");
6823 return -EINVAL; 6836 return -EINVAL;
6824 } 6837 }
6825 6838
@@ -6837,16 +6850,47 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
6837 return err; 6850 return err;
6838 } 6851 }
6839 6852
6840 /* store map pointer inside BPF_LD_IMM64 instruction */ 6853 aux = &env->insn_aux_data[i];
6841 insn[0].imm = (u32) (unsigned long) map; 6854 if (insn->src_reg == BPF_PSEUDO_MAP_FD) {
6842 insn[1].imm = ((u64) (unsigned long) map) >> 32; 6855 addr = (unsigned long)map;
6856 } else {
6857 u32 off = insn[1].imm;
6858
6859 if (off >= BPF_MAX_VAR_OFF) {
6860 verbose(env, "direct value offset of %u is not allowed\n", off);
6861 fdput(f);
6862 return -EINVAL;
6863 }
6864
6865 if (!map->ops->map_direct_value_addr) {
6866 verbose(env, "no direct value access support for this map type\n");
6867 fdput(f);
6868 return -EINVAL;
6869 }
6870
6871 err = map->ops->map_direct_value_addr(map, &addr, off);
6872 if (err) {
6873 verbose(env, "invalid access to map value pointer, value_size=%u off=%u\n",
6874 map->value_size, off);
6875 fdput(f);
6876 return err;
6877 }
6878
6879 aux->map_off = off;
6880 addr += off;
6881 }
6882
6883 insn[0].imm = (u32)addr;
6884 insn[1].imm = addr >> 32;
6843 6885
6844 /* check whether we recorded this map already */ 6886 /* check whether we recorded this map already */
6845 for (j = 0; j < env->used_map_cnt; j++) 6887 for (j = 0; j < env->used_map_cnt; j++) {
6846 if (env->used_maps[j] == map) { 6888 if (env->used_maps[j] == map) {
6889 aux->map_index = j;
6847 fdput(f); 6890 fdput(f);
6848 goto next_insn; 6891 goto next_insn;
6849 } 6892 }
6893 }
6850 6894
6851 if (env->used_map_cnt >= MAX_USED_MAPS) { 6895 if (env->used_map_cnt >= MAX_USED_MAPS) {
6852 fdput(f); 6896 fdput(f);
@@ -6863,6 +6907,8 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
6863 fdput(f); 6907 fdput(f);
6864 return PTR_ERR(map); 6908 return PTR_ERR(map);
6865 } 6909 }
6910
6911 aux->map_index = env->used_map_cnt;
6866 env->used_maps[env->used_map_cnt++] = map; 6912 env->used_maps[env->used_map_cnt++] = map;
6867 6913
6868 if (bpf_map_is_cgroup_storage(map) && 6914 if (bpf_map_is_cgroup_storage(map) &&