diff options
| author | Alexei Starovoitov <ast@kernel.org> | 2019-01-31 18:40:04 -0500 |
|---|---|---|
| committer | Daniel Borkmann <daniel@iogearbox.net> | 2019-02-01 14:55:38 -0500 |
| commit | d83525ca62cf8ebe3271d14c36fb900c294274a2 (patch) | |
| tree | 14c11f7a76bf1d9778eaa29a37d734818f02e2e0 /kernel | |
| parent | 1832f4ef5867fd3898d8a6c6c1978b75d76fc246 (diff) | |
bpf: introduce bpf_spin_lock
Introduce 'struct bpf_spin_lock' and bpf_spin_lock/unlock() helpers to let
bpf program serialize access to other variables.
Example:
struct hash_elem {
int cnt;
struct bpf_spin_lock lock;
};
struct hash_elem * val = bpf_map_lookup_elem(&hash_map, &key);
if (val) {
bpf_spin_lock(&val->lock);
val->cnt++;
bpf_spin_unlock(&val->lock);
}
Restrictions and safety checks:
- bpf_spin_lock is only allowed inside HASH and ARRAY maps.
- BTF description of the map is mandatory for safety analysis.
- bpf program can take one bpf_spin_lock at a time, since two or more can
cause deadlocks.
- only one 'struct bpf_spin_lock' is allowed per map element.
It drastically simplifies implementation yet allows bpf program to use
any number of bpf_spin_locks.
- when bpf_spin_lock is taken the calls (either bpf2bpf or helpers) are not allowed.
- bpf program must bpf_spin_unlock() before return.
- bpf program can access 'struct bpf_spin_lock' only via
bpf_spin_lock()/bpf_spin_unlock() helpers.
- load/store into 'struct bpf_spin_lock lock;' field is not allowed.
- to use bpf_spin_lock() helper the BTF description of map value must be
a struct and have 'struct bpf_spin_lock anyname;' field at the top level.
Nested lock inside another struct is not allowed.
- syscall map_lookup doesn't copy bpf_spin_lock field to user space.
- syscall map_update and program map_update do not update bpf_spin_lock field.
- bpf_spin_lock cannot be on the stack or inside networking packet.
bpf_spin_lock can only be inside HASH or ARRAY map value.
- bpf_spin_lock is available to root only and to all program types.
- bpf_spin_lock is not allowed in inner maps of map-in-map.
- ld_abs is not allowed inside spin_lock-ed region.
- tracing progs and socket filter progs cannot use bpf_spin_lock due to
insufficient preemption checks.
Implementation details:
- cgroup-bpf class of programs can nest with xdp/tc programs.
Hence bpf_spin_lock is equivalent to spin_lock_irqsave.
Other solutions to avoid nested bpf_spin_lock are possible.
Like making sure that all networking progs run with softirq disabled.
spin_lock_irqsave is the simplest and doesn't add overhead to the
programs that don't use it.
- arch_spinlock_t is used when it's implemented as queued_spin_lock
- archs can force their own arch_spinlock_t
- on architectures where queued_spin_lock is not available and
sizeof(arch_spinlock_t) != sizeof(__u32) trivial lock is used.
- presence of bpf_spin_lock inside map value could have been indicated via
extra flag during map_create, but specifying it via BTF is cleaner.
It provides introspection for map key/value and reduces user mistakes.
Next steps:
- allow bpf_spin_lock in other map types (like cgroup local storage)
- introduce BPF_F_LOCK flag for bpf_map_update() syscall and helper
to request kernel to grab bpf_spin_lock before rewriting the value.
That will serialize access to map elements.
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Diffstat (limited to 'kernel')
| -rw-r--r-- | kernel/Kconfig.locks | 3 | ||||
| -rw-r--r-- | kernel/bpf/arraymap.c | 7 | ||||
| -rw-r--r-- | kernel/bpf/btf.c | 42 | ||||
| -rw-r--r-- | kernel/bpf/core.c | 2 | ||||
| -rw-r--r-- | kernel/bpf/hashtab.c | 21 | ||||
| -rw-r--r-- | kernel/bpf/helpers.c | 80 | ||||
| -rw-r--r-- | kernel/bpf/map_in_map.c | 5 | ||||
| -rw-r--r-- | kernel/bpf/syscall.c | 21 | ||||
| -rw-r--r-- | kernel/bpf/verifier.c | 169 |
9 files changed, 331 insertions, 19 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index 84d882f3e299..fbba478ae522 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks | |||
| @@ -242,6 +242,9 @@ config QUEUED_SPINLOCKS | |||
| 242 | def_bool y if ARCH_USE_QUEUED_SPINLOCKS | 242 | def_bool y if ARCH_USE_QUEUED_SPINLOCKS |
| 243 | depends on SMP | 243 | depends on SMP |
| 244 | 244 | ||
| 245 | config BPF_ARCH_SPINLOCK | ||
| 246 | bool | ||
| 247 | |||
| 245 | config ARCH_USE_QUEUED_RWLOCKS | 248 | config ARCH_USE_QUEUED_RWLOCKS |
| 246 | bool | 249 | bool |
| 247 | 250 | ||
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 25632a75d630..d6d979910a2a 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c | |||
| @@ -270,9 +270,10 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, | |||
| 270 | memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]), | 270 | memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]), |
| 271 | value, map->value_size); | 271 | value, map->value_size); |
| 272 | else | 272 | else |
| 273 | memcpy(array->value + | 273 | copy_map_value(map, |
| 274 | array->elem_size * (index & array->index_mask), | 274 | array->value + |
| 275 | value, map->value_size); | 275 | array->elem_size * (index & array->index_mask), |
| 276 | value); | ||
| 276 | return 0; | 277 | return 0; |
| 277 | } | 278 | } |
| 278 | 279 | ||
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 3d661f0606fe..7019c1f05cab 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c | |||
| @@ -355,6 +355,11 @@ static bool btf_type_is_struct(const struct btf_type *t) | |||
| 355 | return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION; | 355 | return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION; |
| 356 | } | 356 | } |
| 357 | 357 | ||
| 358 | static bool __btf_type_is_struct(const struct btf_type *t) | ||
| 359 | { | ||
| 360 | return BTF_INFO_KIND(t->info) == BTF_KIND_STRUCT; | ||
| 361 | } | ||
| 362 | |||
| 358 | static bool btf_type_is_array(const struct btf_type *t) | 363 | static bool btf_type_is_array(const struct btf_type *t) |
| 359 | { | 364 | { |
| 360 | return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY; | 365 | return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY; |
| @@ -2045,6 +2050,43 @@ static void btf_struct_log(struct btf_verifier_env *env, | |||
| 2045 | btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); | 2050 | btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); |
| 2046 | } | 2051 | } |
| 2047 | 2052 | ||
| 2053 | /* find 'struct bpf_spin_lock' in map value. | ||
| 2054 | * return >= 0 offset if found | ||
| 2055 | * and < 0 in case of error | ||
| 2056 | */ | ||
| 2057 | int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t) | ||
| 2058 | { | ||
| 2059 | const struct btf_member *member; | ||
| 2060 | u32 i, off = -ENOENT; | ||
| 2061 | |||
| 2062 | if (!__btf_type_is_struct(t)) | ||
| 2063 | return -EINVAL; | ||
| 2064 | |||
| 2065 | for_each_member(i, t, member) { | ||
| 2066 | const struct btf_type *member_type = btf_type_by_id(btf, | ||
| 2067 | member->type); | ||
| 2068 | if (!__btf_type_is_struct(member_type)) | ||
| 2069 | continue; | ||
| 2070 | if (member_type->size != sizeof(struct bpf_spin_lock)) | ||
| 2071 | continue; | ||
| 2072 | if (strcmp(__btf_name_by_offset(btf, member_type->name_off), | ||
| 2073 | "bpf_spin_lock")) | ||
| 2074 | continue; | ||
| 2075 | if (off != -ENOENT) | ||
| 2076 | /* only one 'struct bpf_spin_lock' is allowed */ | ||
| 2077 | return -E2BIG; | ||
| 2078 | off = btf_member_bit_offset(t, member); | ||
| 2079 | if (off % 8) | ||
| 2080 | /* valid C code cannot generate such BTF */ | ||
| 2081 | return -EINVAL; | ||
| 2082 | off /= 8; | ||
| 2083 | if (off % __alignof__(struct bpf_spin_lock)) | ||
| 2084 | /* valid struct bpf_spin_lock will be 4 byte aligned */ | ||
| 2085 | return -EINVAL; | ||
| 2086 | } | ||
| 2087 | return off; | ||
| 2088 | } | ||
| 2089 | |||
| 2048 | static void btf_struct_seq_show(const struct btf *btf, const struct btf_type *t, | 2090 | static void btf_struct_seq_show(const struct btf *btf, const struct btf_type *t, |
| 2049 | u32 type_id, void *data, u8 bits_offset, | 2091 | u32 type_id, void *data, u8 bits_offset, |
| 2050 | struct seq_file *m) | 2092 | struct seq_file *m) |
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index f13c543b7b36..ef88b167959d 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c | |||
| @@ -2002,6 +2002,8 @@ const struct bpf_func_proto bpf_map_delete_elem_proto __weak; | |||
| 2002 | const struct bpf_func_proto bpf_map_push_elem_proto __weak; | 2002 | const struct bpf_func_proto bpf_map_push_elem_proto __weak; |
| 2003 | const struct bpf_func_proto bpf_map_pop_elem_proto __weak; | 2003 | const struct bpf_func_proto bpf_map_pop_elem_proto __weak; |
| 2004 | const struct bpf_func_proto bpf_map_peek_elem_proto __weak; | 2004 | const struct bpf_func_proto bpf_map_peek_elem_proto __weak; |
| 2005 | const struct bpf_func_proto bpf_spin_lock_proto __weak; | ||
| 2006 | const struct bpf_func_proto bpf_spin_unlock_proto __weak; | ||
| 2005 | 2007 | ||
| 2006 | const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; | 2008 | const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; |
| 2007 | const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; | 2009 | const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; |
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 4b7c76765d9d..6d3b22c5ad68 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c | |||
| @@ -718,21 +718,12 @@ static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab) | |||
| 718 | BITS_PER_LONG == 64; | 718 | BITS_PER_LONG == 64; |
| 719 | } | 719 | } |
| 720 | 720 | ||
| 721 | static u32 htab_size_value(const struct bpf_htab *htab, bool percpu) | ||
| 722 | { | ||
| 723 | u32 size = htab->map.value_size; | ||
| 724 | |||
| 725 | if (percpu || fd_htab_map_needs_adjust(htab)) | ||
| 726 | size = round_up(size, 8); | ||
| 727 | return size; | ||
| 728 | } | ||
| 729 | |||
| 730 | static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, | 721 | static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, |
| 731 | void *value, u32 key_size, u32 hash, | 722 | void *value, u32 key_size, u32 hash, |
| 732 | bool percpu, bool onallcpus, | 723 | bool percpu, bool onallcpus, |
| 733 | struct htab_elem *old_elem) | 724 | struct htab_elem *old_elem) |
| 734 | { | 725 | { |
| 735 | u32 size = htab_size_value(htab, percpu); | 726 | u32 size = htab->map.value_size; |
| 736 | bool prealloc = htab_is_prealloc(htab); | 727 | bool prealloc = htab_is_prealloc(htab); |
| 737 | struct htab_elem *l_new, **pl_new; | 728 | struct htab_elem *l_new, **pl_new; |
| 738 | void __percpu *pptr; | 729 | void __percpu *pptr; |
| @@ -770,10 +761,13 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, | |||
| 770 | l_new = ERR_PTR(-ENOMEM); | 761 | l_new = ERR_PTR(-ENOMEM); |
| 771 | goto dec_count; | 762 | goto dec_count; |
| 772 | } | 763 | } |
| 764 | check_and_init_map_lock(&htab->map, | ||
| 765 | l_new->key + round_up(key_size, 8)); | ||
| 773 | } | 766 | } |
| 774 | 767 | ||
| 775 | memcpy(l_new->key, key, key_size); | 768 | memcpy(l_new->key, key, key_size); |
| 776 | if (percpu) { | 769 | if (percpu) { |
| 770 | size = round_up(size, 8); | ||
| 777 | if (prealloc) { | 771 | if (prealloc) { |
| 778 | pptr = htab_elem_get_ptr(l_new, key_size); | 772 | pptr = htab_elem_get_ptr(l_new, key_size); |
| 779 | } else { | 773 | } else { |
| @@ -791,8 +785,13 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, | |||
| 791 | 785 | ||
| 792 | if (!prealloc) | 786 | if (!prealloc) |
| 793 | htab_elem_set_ptr(l_new, key_size, pptr); | 787 | htab_elem_set_ptr(l_new, key_size, pptr); |
| 794 | } else { | 788 | } else if (fd_htab_map_needs_adjust(htab)) { |
| 789 | size = round_up(size, 8); | ||
| 795 | memcpy(l_new->key + round_up(key_size, 8), value, size); | 790 | memcpy(l_new->key + round_up(key_size, 8), value, size); |
| 791 | } else { | ||
| 792 | copy_map_value(&htab->map, | ||
| 793 | l_new->key + round_up(key_size, 8), | ||
| 794 | value); | ||
| 796 | } | 795 | } |
| 797 | 796 | ||
| 798 | l_new->hash = hash; | 797 | l_new->hash = hash; |
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index a74972b07e74..fbe544761628 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c | |||
| @@ -221,6 +221,86 @@ const struct bpf_func_proto bpf_get_current_comm_proto = { | |||
| 221 | .arg2_type = ARG_CONST_SIZE, | 221 | .arg2_type = ARG_CONST_SIZE, |
| 222 | }; | 222 | }; |
| 223 | 223 | ||
| 224 | #if defined(CONFIG_QUEUED_SPINLOCKS) || defined(CONFIG_BPF_ARCH_SPINLOCK) | ||
| 225 | |||
| 226 | static inline void __bpf_spin_lock(struct bpf_spin_lock *lock) | ||
| 227 | { | ||
| 228 | arch_spinlock_t *l = (void *)lock; | ||
| 229 | union { | ||
| 230 | __u32 val; | ||
| 231 | arch_spinlock_t lock; | ||
| 232 | } u = { .lock = __ARCH_SPIN_LOCK_UNLOCKED }; | ||
| 233 | |||
| 234 | compiletime_assert(u.val == 0, "__ARCH_SPIN_LOCK_UNLOCKED not 0"); | ||
| 235 | BUILD_BUG_ON(sizeof(*l) != sizeof(__u32)); | ||
| 236 | BUILD_BUG_ON(sizeof(*lock) != sizeof(__u32)); | ||
| 237 | arch_spin_lock(l); | ||
| 238 | } | ||
| 239 | |||
| 240 | static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock) | ||
| 241 | { | ||
| 242 | arch_spinlock_t *l = (void *)lock; | ||
| 243 | |||
| 244 | arch_spin_unlock(l); | ||
| 245 | } | ||
| 246 | |||
| 247 | #else | ||
| 248 | |||
| 249 | static inline void __bpf_spin_lock(struct bpf_spin_lock *lock) | ||
| 250 | { | ||
| 251 | atomic_t *l = (void *)lock; | ||
| 252 | |||
| 253 | BUILD_BUG_ON(sizeof(*l) != sizeof(*lock)); | ||
| 254 | do { | ||
| 255 | atomic_cond_read_relaxed(l, !VAL); | ||
| 256 | } while (atomic_xchg(l, 1)); | ||
| 257 | } | ||
| 258 | |||
| 259 | static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock) | ||
| 260 | { | ||
| 261 | atomic_t *l = (void *)lock; | ||
| 262 | |||
| 263 | atomic_set_release(l, 0); | ||
| 264 | } | ||
| 265 | |||
| 266 | #endif | ||
| 267 | |||
| 268 | static DEFINE_PER_CPU(unsigned long, irqsave_flags); | ||
| 269 | |||
| 270 | notrace BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock) | ||
| 271 | { | ||
| 272 | unsigned long flags; | ||
| 273 | |||
| 274 | local_irq_save(flags); | ||
| 275 | __bpf_spin_lock(lock); | ||
| 276 | __this_cpu_write(irqsave_flags, flags); | ||
| 277 | return 0; | ||
| 278 | } | ||
| 279 | |||
| 280 | const struct bpf_func_proto bpf_spin_lock_proto = { | ||
| 281 | .func = bpf_spin_lock, | ||
| 282 | .gpl_only = false, | ||
| 283 | .ret_type = RET_VOID, | ||
| 284 | .arg1_type = ARG_PTR_TO_SPIN_LOCK, | ||
| 285 | }; | ||
| 286 | |||
| 287 | notrace BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock) | ||
| 288 | { | ||
| 289 | unsigned long flags; | ||
| 290 | |||
| 291 | flags = __this_cpu_read(irqsave_flags); | ||
| 292 | __bpf_spin_unlock(lock); | ||
| 293 | local_irq_restore(flags); | ||
| 294 | return 0; | ||
| 295 | } | ||
| 296 | |||
| 297 | const struct bpf_func_proto bpf_spin_unlock_proto = { | ||
| 298 | .func = bpf_spin_unlock, | ||
| 299 | .gpl_only = false, | ||
| 300 | .ret_type = RET_VOID, | ||
| 301 | .arg1_type = ARG_PTR_TO_SPIN_LOCK, | ||
| 302 | }; | ||
| 303 | |||
| 224 | #ifdef CONFIG_CGROUPS | 304 | #ifdef CONFIG_CGROUPS |
| 225 | BPF_CALL_0(bpf_get_current_cgroup_id) | 305 | BPF_CALL_0(bpf_get_current_cgroup_id) |
| 226 | { | 306 | { |
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 52378d3e34b3..583346a0ab29 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c | |||
| @@ -37,6 +37,11 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) | |||
| 37 | return ERR_PTR(-EINVAL); | 37 | return ERR_PTR(-EINVAL); |
| 38 | } | 38 | } |
| 39 | 39 | ||
| 40 | if (map_value_has_spin_lock(inner_map)) { | ||
| 41 | fdput(f); | ||
| 42 | return ERR_PTR(-ENOTSUPP); | ||
| 43 | } | ||
| 44 | |||
| 40 | inner_map_meta_size = sizeof(*inner_map_meta); | 45 | inner_map_meta_size = sizeof(*inner_map_meta); |
| 41 | /* In some cases verifier needs to access beyond just base map. */ | 46 | /* In some cases verifier needs to access beyond just base map. */ |
| 42 | if (inner_map->ops == &array_map_ops) | 47 | if (inner_map->ops == &array_map_ops) |
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b155cd17c1bd..ebf0a673cb83 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c | |||
| @@ -463,7 +463,7 @@ int map_check_no_btf(const struct bpf_map *map, | |||
| 463 | return -ENOTSUPP; | 463 | return -ENOTSUPP; |
| 464 | } | 464 | } |
| 465 | 465 | ||
| 466 | static int map_check_btf(const struct bpf_map *map, const struct btf *btf, | 466 | static int map_check_btf(struct bpf_map *map, const struct btf *btf, |
| 467 | u32 btf_key_id, u32 btf_value_id) | 467 | u32 btf_key_id, u32 btf_value_id) |
| 468 | { | 468 | { |
| 469 | const struct btf_type *key_type, *value_type; | 469 | const struct btf_type *key_type, *value_type; |
| @@ -478,6 +478,21 @@ static int map_check_btf(const struct bpf_map *map, const struct btf *btf, | |||
| 478 | if (!value_type || value_size != map->value_size) | 478 | if (!value_type || value_size != map->value_size) |
| 479 | return -EINVAL; | 479 | return -EINVAL; |
| 480 | 480 | ||
| 481 | map->spin_lock_off = btf_find_spin_lock(btf, value_type); | ||
| 482 | |||
| 483 | if (map_value_has_spin_lock(map)) { | ||
| 484 | if (map->map_type != BPF_MAP_TYPE_HASH && | ||
| 485 | map->map_type != BPF_MAP_TYPE_ARRAY) | ||
| 486 | return -ENOTSUPP; | ||
| 487 | if (map->spin_lock_off + sizeof(struct bpf_spin_lock) > | ||
| 488 | map->value_size) { | ||
| 489 | WARN_ONCE(1, | ||
| 490 | "verifier bug spin_lock_off %d value_size %d\n", | ||
| 491 | map->spin_lock_off, map->value_size); | ||
| 492 | return -EFAULT; | ||
| 493 | } | ||
| 494 | } | ||
| 495 | |||
| 481 | if (map->ops->map_check_btf) | 496 | if (map->ops->map_check_btf) |
| 482 | ret = map->ops->map_check_btf(map, btf, key_type, value_type); | 497 | ret = map->ops->map_check_btf(map, btf, key_type, value_type); |
| 483 | 498 | ||
| @@ -542,6 +557,8 @@ static int map_create(union bpf_attr *attr) | |||
| 542 | map->btf = btf; | 557 | map->btf = btf; |
| 543 | map->btf_key_type_id = attr->btf_key_type_id; | 558 | map->btf_key_type_id = attr->btf_key_type_id; |
| 544 | map->btf_value_type_id = attr->btf_value_type_id; | 559 | map->btf_value_type_id = attr->btf_value_type_id; |
| 560 | } else { | ||
| 561 | map->spin_lock_off = -EINVAL; | ||
| 545 | } | 562 | } |
| 546 | 563 | ||
| 547 | err = security_bpf_map_alloc(map); | 564 | err = security_bpf_map_alloc(map); |
| @@ -740,7 +757,7 @@ static int map_lookup_elem(union bpf_attr *attr) | |||
| 740 | err = -ENOENT; | 757 | err = -ENOENT; |
| 741 | } else { | 758 | } else { |
| 742 | err = 0; | 759 | err = 0; |
| 743 | memcpy(value, ptr, value_size); | 760 | copy_map_value(map, value, ptr); |
| 744 | } | 761 | } |
| 745 | rcu_read_unlock(); | 762 | rcu_read_unlock(); |
| 746 | } | 763 | } |
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8c1c21cd50b4..38892bdee651 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c | |||
| @@ -213,6 +213,7 @@ struct bpf_call_arg_meta { | |||
| 213 | s64 msize_smax_value; | 213 | s64 msize_smax_value; |
| 214 | u64 msize_umax_value; | 214 | u64 msize_umax_value; |
| 215 | int ptr_id; | 215 | int ptr_id; |
| 216 | int func_id; | ||
| 216 | }; | 217 | }; |
| 217 | 218 | ||
| 218 | static DEFINE_MUTEX(bpf_verifier_lock); | 219 | static DEFINE_MUTEX(bpf_verifier_lock); |
| @@ -351,6 +352,12 @@ static bool reg_is_refcounted(const struct bpf_reg_state *reg) | |||
| 351 | return type_is_refcounted(reg->type); | 352 | return type_is_refcounted(reg->type); |
| 352 | } | 353 | } |
| 353 | 354 | ||
| 355 | static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) | ||
| 356 | { | ||
| 357 | return reg->type == PTR_TO_MAP_VALUE && | ||
| 358 | map_value_has_spin_lock(reg->map_ptr); | ||
| 359 | } | ||
| 360 | |||
| 354 | static bool reg_is_refcounted_or_null(const struct bpf_reg_state *reg) | 361 | static bool reg_is_refcounted_or_null(const struct bpf_reg_state *reg) |
| 355 | { | 362 | { |
| 356 | return type_is_refcounted_or_null(reg->type); | 363 | return type_is_refcounted_or_null(reg->type); |
| @@ -712,6 +719,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, | |||
| 712 | } | 719 | } |
| 713 | dst_state->speculative = src->speculative; | 720 | dst_state->speculative = src->speculative; |
| 714 | dst_state->curframe = src->curframe; | 721 | dst_state->curframe = src->curframe; |
| 722 | dst_state->active_spin_lock = src->active_spin_lock; | ||
| 715 | for (i = 0; i <= src->curframe; i++) { | 723 | for (i = 0; i <= src->curframe; i++) { |
| 716 | dst = dst_state->frame[i]; | 724 | dst = dst_state->frame[i]; |
| 717 | if (!dst) { | 725 | if (!dst) { |
| @@ -1483,6 +1491,21 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, | |||
| 1483 | if (err) | 1491 | if (err) |
| 1484 | verbose(env, "R%d max value is outside of the array range\n", | 1492 | verbose(env, "R%d max value is outside of the array range\n", |
| 1485 | regno); | 1493 | regno); |
| 1494 | |||
| 1495 | if (map_value_has_spin_lock(reg->map_ptr)) { | ||
| 1496 | u32 lock = reg->map_ptr->spin_lock_off; | ||
| 1497 | |||
| 1498 | /* if any part of struct bpf_spin_lock can be touched by | ||
| 1499 | * load/store reject this program. | ||
| 1500 | * To check that [x1, x2) overlaps with [y1, y2) | ||
| 1501 | * it is sufficient to check x1 < y2 && y1 < x2. | ||
| 1502 | */ | ||
| 1503 | if (reg->smin_value + off < lock + sizeof(struct bpf_spin_lock) && | ||
| 1504 | lock < reg->umax_value + off + size) { | ||
| 1505 | verbose(env, "bpf_spin_lock cannot be accessed directly by load/store\n"); | ||
| 1506 | return -EACCES; | ||
| 1507 | } | ||
| 1508 | } | ||
| 1486 | return err; | 1509 | return err; |
| 1487 | } | 1510 | } |
| 1488 | 1511 | ||
| @@ -2192,6 +2215,91 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, | |||
| 2192 | } | 2215 | } |
| 2193 | } | 2216 | } |
| 2194 | 2217 | ||
| 2218 | /* Implementation details: | ||
| 2219 | * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL | ||
| 2220 | * Two bpf_map_lookups (even with the same key) will have different reg->id. | ||
| 2221 | * For traditional PTR_TO_MAP_VALUE the verifier clears reg->id after | ||
| 2222 | * value_or_null->value transition, since the verifier only cares about | ||
| 2223 | * the range of access to valid map value pointer and doesn't care about actual | ||
| 2224 | * address of the map element. | ||
| 2225 | * For maps with 'struct bpf_spin_lock' inside map value the verifier keeps | ||
| 2226 | * reg->id > 0 after value_or_null->value transition. By doing so | ||
| 2227 | * two bpf_map_lookups will be considered two different pointers that | ||
| 2228 | * point to different bpf_spin_locks. | ||
| 2229 | * The verifier allows taking only one bpf_spin_lock at a time to avoid | ||
| 2230 | * dead-locks. | ||
| 2231 | * Since only one bpf_spin_lock is allowed the checks are simpler than | ||
| 2232 | * reg_is_refcounted() logic. The verifier needs to remember only | ||
| 2233 | * one spin_lock instead of array of acquired_refs. | ||
| 2234 | * cur_state->active_spin_lock remembers which map value element got locked | ||
| 2235 | * and clears it after bpf_spin_unlock. | ||
| 2236 | */ | ||
| 2237 | static int process_spin_lock(struct bpf_verifier_env *env, int regno, | ||
| 2238 | bool is_lock) | ||
| 2239 | { | ||
| 2240 | struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; | ||
| 2241 | struct bpf_verifier_state *cur = env->cur_state; | ||
| 2242 | bool is_const = tnum_is_const(reg->var_off); | ||
| 2243 | struct bpf_map *map = reg->map_ptr; | ||
| 2244 | u64 val = reg->var_off.value; | ||
| 2245 | |||
| 2246 | if (reg->type != PTR_TO_MAP_VALUE) { | ||
| 2247 | verbose(env, "R%d is not a pointer to map_value\n", regno); | ||
| 2248 | return -EINVAL; | ||
| 2249 | } | ||
| 2250 | if (!is_const) { | ||
| 2251 | verbose(env, | ||
| 2252 | "R%d doesn't have constant offset. bpf_spin_lock has to be at the constant offset\n", | ||
| 2253 | regno); | ||
| 2254 | return -EINVAL; | ||
| 2255 | } | ||
| 2256 | if (!map->btf) { | ||
| 2257 | verbose(env, | ||
| 2258 | "map '%s' has to have BTF in order to use bpf_spin_lock\n", | ||
| 2259 | map->name); | ||
| 2260 | return -EINVAL; | ||
| 2261 | } | ||
| 2262 | if (!map_value_has_spin_lock(map)) { | ||
| 2263 | if (map->spin_lock_off == -E2BIG) | ||
| 2264 | verbose(env, | ||
| 2265 | "map '%s' has more than one 'struct bpf_spin_lock'\n", | ||
| 2266 | map->name); | ||
| 2267 | else if (map->spin_lock_off == -ENOENT) | ||
| 2268 | verbose(env, | ||
| 2269 | "map '%s' doesn't have 'struct bpf_spin_lock'\n", | ||
| 2270 | map->name); | ||
| 2271 | else | ||
| 2272 | verbose(env, | ||
| 2273 | "map '%s' is not a struct type or bpf_spin_lock is mangled\n", | ||
| 2274 | map->name); | ||
| 2275 | return -EINVAL; | ||
| 2276 | } | ||
| 2277 | if (map->spin_lock_off != val + reg->off) { | ||
| 2278 | verbose(env, "off %lld doesn't point to 'struct bpf_spin_lock'\n", | ||
| 2279 | val + reg->off); | ||
| 2280 | return -EINVAL; | ||
| 2281 | } | ||
| 2282 | if (is_lock) { | ||
| 2283 | if (cur->active_spin_lock) { | ||
| 2284 | verbose(env, | ||
| 2285 | "Locking two bpf_spin_locks are not allowed\n"); | ||
| 2286 | return -EINVAL; | ||
| 2287 | } | ||
| 2288 | cur->active_spin_lock = reg->id; | ||
| 2289 | } else { | ||
| 2290 | if (!cur->active_spin_lock) { | ||
| 2291 | verbose(env, "bpf_spin_unlock without taking a lock\n"); | ||
| 2292 | return -EINVAL; | ||
| 2293 | } | ||
| 2294 | if (cur->active_spin_lock != reg->id) { | ||
| 2295 | verbose(env, "bpf_spin_unlock of different lock\n"); | ||
| 2296 | return -EINVAL; | ||
| 2297 | } | ||
| 2298 | cur->active_spin_lock = 0; | ||
| 2299 | } | ||
| 2300 | return 0; | ||
| 2301 | } | ||
| 2302 | |||
| 2195 | static bool arg_type_is_mem_ptr(enum bpf_arg_type type) | 2303 | static bool arg_type_is_mem_ptr(enum bpf_arg_type type) |
| 2196 | { | 2304 | { |
| 2197 | return type == ARG_PTR_TO_MEM || | 2305 | return type == ARG_PTR_TO_MEM || |
| @@ -2268,6 +2376,17 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, | |||
| 2268 | return -EFAULT; | 2376 | return -EFAULT; |
| 2269 | } | 2377 | } |
| 2270 | meta->ptr_id = reg->id; | 2378 | meta->ptr_id = reg->id; |
| 2379 | } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) { | ||
| 2380 | if (meta->func_id == BPF_FUNC_spin_lock) { | ||
| 2381 | if (process_spin_lock(env, regno, true)) | ||
| 2382 | return -EACCES; | ||
| 2383 | } else if (meta->func_id == BPF_FUNC_spin_unlock) { | ||
| 2384 | if (process_spin_lock(env, regno, false)) | ||
| 2385 | return -EACCES; | ||
| 2386 | } else { | ||
| 2387 | verbose(env, "verifier internal error\n"); | ||
| 2388 | return -EFAULT; | ||
| 2389 | } | ||
| 2271 | } else if (arg_type_is_mem_ptr(arg_type)) { | 2390 | } else if (arg_type_is_mem_ptr(arg_type)) { |
| 2272 | expected_type = PTR_TO_STACK; | 2391 | expected_type = PTR_TO_STACK; |
| 2273 | /* One exception here. In case function allows for NULL to be | 2392 | /* One exception here. In case function allows for NULL to be |
| @@ -2887,6 +3006,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn | |||
| 2887 | return err; | 3006 | return err; |
| 2888 | } | 3007 | } |
| 2889 | 3008 | ||
| 3009 | meta.func_id = func_id; | ||
| 2890 | /* check args */ | 3010 | /* check args */ |
| 2891 | err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &meta); | 3011 | err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &meta); |
| 2892 | if (err) | 3012 | if (err) |
| @@ -4473,7 +4593,8 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, | |||
| 4473 | } else if (reg->type == PTR_TO_SOCKET_OR_NULL) { | 4593 | } else if (reg->type == PTR_TO_SOCKET_OR_NULL) { |
| 4474 | reg->type = PTR_TO_SOCKET; | 4594 | reg->type = PTR_TO_SOCKET; |
| 4475 | } | 4595 | } |
| 4476 | if (is_null || !reg_is_refcounted(reg)) { | 4596 | if (is_null || !(reg_is_refcounted(reg) || |
| 4597 | reg_may_point_to_spin_lock(reg))) { | ||
| 4477 | /* We don't need id from this point onwards anymore, | 4598 | /* We don't need id from this point onwards anymore, |
| 4478 | * thus we should better reset it, so that state | 4599 | * thus we should better reset it, so that state |
| 4479 | * pruning has chances to take effect. | 4600 | * pruning has chances to take effect. |
| @@ -4871,6 +4992,11 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) | |||
| 4871 | return err; | 4992 | return err; |
| 4872 | } | 4993 | } |
| 4873 | 4994 | ||
| 4995 | if (env->cur_state->active_spin_lock) { | ||
| 4996 | verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_spin_lock-ed region\n"); | ||
| 4997 | return -EINVAL; | ||
| 4998 | } | ||
| 4999 | |||
| 4874 | if (regs[BPF_REG_6].type != PTR_TO_CTX) { | 5000 | if (regs[BPF_REG_6].type != PTR_TO_CTX) { |
| 4875 | verbose(env, | 5001 | verbose(env, |
| 4876 | "at the time of BPF_LD_ABS|IND R6 != pointer to skb\n"); | 5002 | "at the time of BPF_LD_ABS|IND R6 != pointer to skb\n"); |
| @@ -5607,8 +5733,11 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, | |||
| 5607 | case PTR_TO_MAP_VALUE: | 5733 | case PTR_TO_MAP_VALUE: |
| 5608 | /* If the new min/max/var_off satisfy the old ones and | 5734 | /* If the new min/max/var_off satisfy the old ones and |
| 5609 | * everything else matches, we are OK. | 5735 | * everything else matches, we are OK. |
| 5610 | * We don't care about the 'id' value, because nothing | 5736 | * 'id' is not compared, since it's only used for maps with |
| 5611 | * uses it for PTR_TO_MAP_VALUE (only for ..._OR_NULL) | 5737 | * bpf_spin_lock inside map element and in such cases if |
| 5738 | * the rest of the prog is valid for one map element then | ||
| 5739 | * it's valid for all map elements regardless of the key | ||
| 5740 | * used in bpf_map_lookup() | ||
| 5612 | */ | 5741 | */ |
| 5613 | return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 && | 5742 | return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 && |
| 5614 | range_within(rold, rcur) && | 5743 | range_within(rold, rcur) && |
| @@ -5811,6 +5940,9 @@ static bool states_equal(struct bpf_verifier_env *env, | |||
| 5811 | if (old->speculative && !cur->speculative) | 5940 | if (old->speculative && !cur->speculative) |
| 5812 | return false; | 5941 | return false; |
| 5813 | 5942 | ||
| 5943 | if (old->active_spin_lock != cur->active_spin_lock) | ||
| 5944 | return false; | ||
| 5945 | |||
| 5814 | /* for states to be equal callsites have to be the same | 5946 | /* for states to be equal callsites have to be the same |
| 5815 | * and all frame states need to be equivalent | 5947 | * and all frame states need to be equivalent |
| 5816 | */ | 5948 | */ |
| @@ -6229,6 +6361,12 @@ static int do_check(struct bpf_verifier_env *env) | |||
| 6229 | return -EINVAL; | 6361 | return -EINVAL; |
| 6230 | } | 6362 | } |
| 6231 | 6363 | ||
| 6364 | if (env->cur_state->active_spin_lock && | ||
| 6365 | (insn->src_reg == BPF_PSEUDO_CALL || | ||
| 6366 | insn->imm != BPF_FUNC_spin_unlock)) { | ||
| 6367 | verbose(env, "function calls are not allowed while holding a lock\n"); | ||
| 6368 | return -EINVAL; | ||
| 6369 | } | ||
| 6232 | if (insn->src_reg == BPF_PSEUDO_CALL) | 6370 | if (insn->src_reg == BPF_PSEUDO_CALL) |
| 6233 | err = check_func_call(env, insn, &env->insn_idx); | 6371 | err = check_func_call(env, insn, &env->insn_idx); |
| 6234 | else | 6372 | else |
| @@ -6259,6 +6397,11 @@ static int do_check(struct bpf_verifier_env *env) | |||
| 6259 | return -EINVAL; | 6397 | return -EINVAL; |
| 6260 | } | 6398 | } |
| 6261 | 6399 | ||
| 6400 | if (env->cur_state->active_spin_lock) { | ||
| 6401 | verbose(env, "bpf_spin_unlock is missing\n"); | ||
| 6402 | return -EINVAL; | ||
| 6403 | } | ||
| 6404 | |||
| 6262 | if (state->curframe) { | 6405 | if (state->curframe) { |
| 6263 | /* exit from nested function */ | 6406 | /* exit from nested function */ |
| 6264 | env->prev_insn_idx = env->insn_idx; | 6407 | env->prev_insn_idx = env->insn_idx; |
| @@ -6356,6 +6499,19 @@ static int check_map_prealloc(struct bpf_map *map) | |||
| 6356 | !(map->map_flags & BPF_F_NO_PREALLOC); | 6499 | !(map->map_flags & BPF_F_NO_PREALLOC); |
| 6357 | } | 6500 | } |
| 6358 | 6501 | ||
| 6502 | static bool is_tracing_prog_type(enum bpf_prog_type type) | ||
| 6503 | { | ||
| 6504 | switch (type) { | ||
| 6505 | case BPF_PROG_TYPE_KPROBE: | ||
| 6506 | case BPF_PROG_TYPE_TRACEPOINT: | ||
| 6507 | case BPF_PROG_TYPE_PERF_EVENT: | ||
| 6508 | case BPF_PROG_TYPE_RAW_TRACEPOINT: | ||
| 6509 | return true; | ||
| 6510 | default: | ||
| 6511 | return false; | ||
| 6512 | } | ||
| 6513 | } | ||
| 6514 | |||
| 6359 | static int check_map_prog_compatibility(struct bpf_verifier_env *env, | 6515 | static int check_map_prog_compatibility(struct bpf_verifier_env *env, |
| 6360 | struct bpf_map *map, | 6516 | struct bpf_map *map, |
| 6361 | struct bpf_prog *prog) | 6517 | struct bpf_prog *prog) |
| @@ -6378,6 +6534,13 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, | |||
| 6378 | } | 6534 | } |
| 6379 | } | 6535 | } |
| 6380 | 6536 | ||
| 6537 | if ((is_tracing_prog_type(prog->type) || | ||
| 6538 | prog->type == BPF_PROG_TYPE_SOCKET_FILTER) && | ||
| 6539 | map_value_has_spin_lock(map)) { | ||
| 6540 | verbose(env, "tracing progs cannot use bpf_spin_lock yet\n"); | ||
| 6541 | return -EINVAL; | ||
| 6542 | } | ||
| 6543 | |||
| 6381 | if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) && | 6544 | if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) && |
| 6382 | !bpf_offload_prog_map_match(prog, map)) { | 6545 | !bpf_offload_prog_map_match(prog, map)) { |
| 6383 | verbose(env, "offload device mismatch between prog and map\n"); | 6546 | verbose(env, "offload device mismatch between prog and map\n"); |
