| author | Ingo Molnar <mingo@kernel.org> | 2018-02-06 15:12:31 -0500 |
|---|---|---|
| committer | Ingo Molnar <mingo@kernel.org> | 2018-02-06 15:12:31 -0500 |
| commit | 82845079160817cc6ac64e5321bbd935e0a47b3a (patch) | |
| tree | 0886d1d52428e9db14536cae4b37db896e7c360a /kernel | |
| parent | 32e839dda3ba576943365f0f5817ce5c843137dc (diff) | |
| parent | 68c5735eaa5e680e701c9a2d1e3c7880bdf5ab66 (diff) | |
Merge branch 'linus' into sched/urgent, to resolve conflicts
Conflicts:
arch/arm64/kernel/entry.S
arch/x86/Kconfig
include/linux/sched/mm.h
kernel/fork.c
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel')
58 files changed, 4082 insertions, 1225 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 172d151d429c..f85ae5dfa474 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
| @@ -81,6 +81,7 @@ obj-$(CONFIG_AUDIT_TREE) += audit_tree.o | |||
| 81 | obj-$(CONFIG_GCOV_KERNEL) += gcov/ | 81 | obj-$(CONFIG_GCOV_KERNEL) += gcov/ |
| 82 | obj-$(CONFIG_KCOV) += kcov.o | 82 | obj-$(CONFIG_KCOV) += kcov.o |
| 83 | obj-$(CONFIG_KPROBES) += kprobes.o | 83 | obj-$(CONFIG_KPROBES) += kprobes.o |
| 84 | obj-$(CONFIG_FAIL_FUNCTION) += fail_function.o | ||
| 84 | obj-$(CONFIG_KGDB) += debug/ | 85 | obj-$(CONFIG_KGDB) += debug/ |
| 85 | obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o | 86 | obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o |
| 86 | obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o | 87 | obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o |
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index e691da0b3bab..a713fd23ec88 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
| @@ -9,9 +9,11 @@ obj-$(CONFIG_BPF_SYSCALL) += devmap.o | |||
| 9 | obj-$(CONFIG_BPF_SYSCALL) += cpumap.o | 9 | obj-$(CONFIG_BPF_SYSCALL) += cpumap.o |
| 10 | obj-$(CONFIG_BPF_SYSCALL) += offload.o | 10 | obj-$(CONFIG_BPF_SYSCALL) += offload.o |
| 11 | ifeq ($(CONFIG_STREAM_PARSER),y) | 11 | ifeq ($(CONFIG_STREAM_PARSER),y) |
| 12 | ifeq ($(CONFIG_INET),y) | ||
| 12 | obj-$(CONFIG_BPF_SYSCALL) += sockmap.o | 13 | obj-$(CONFIG_BPF_SYSCALL) += sockmap.o |
| 13 | endif | 14 | endif |
| 14 | endif | 15 | endif |
| 16 | endif | ||
| 15 | ifeq ($(CONFIG_PERF_EVENTS),y) | 17 | ifeq ($(CONFIG_PERF_EVENTS),y) |
| 16 | obj-$(CONFIG_BPF_SYSCALL) += stackmap.o | 18 | obj-$(CONFIG_BPF_SYSCALL) += stackmap.o |
| 17 | endif | 19 | endif |
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index ab94d304a634..b1f66480135b 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
| @@ -49,27 +49,35 @@ static int bpf_array_alloc_percpu(struct bpf_array *array) | |||
| 49 | } | 49 | } |
| 50 | 50 | ||
| 51 | /* Called from syscall */ | 51 | /* Called from syscall */ |
| 52 | static struct bpf_map *array_map_alloc(union bpf_attr *attr) | 52 | static int array_map_alloc_check(union bpf_attr *attr) |
| 53 | { | 53 | { |
| 54 | bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; | 54 | bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; |
| 55 | int numa_node = bpf_map_attr_numa_node(attr); | 55 | int numa_node = bpf_map_attr_numa_node(attr); |
| 56 | u32 elem_size, index_mask, max_entries; | ||
| 57 | bool unpriv = !capable(CAP_SYS_ADMIN); | ||
| 58 | struct bpf_array *array; | ||
| 59 | u64 array_size, mask64; | ||
| 60 | 56 | ||
| 61 | /* check sanity of attributes */ | 57 | /* check sanity of attributes */ |
| 62 | if (attr->max_entries == 0 || attr->key_size != 4 || | 58 | if (attr->max_entries == 0 || attr->key_size != 4 || |
| 63 | attr->value_size == 0 || | 59 | attr->value_size == 0 || |
| 64 | attr->map_flags & ~ARRAY_CREATE_FLAG_MASK || | 60 | attr->map_flags & ~ARRAY_CREATE_FLAG_MASK || |
| 65 | (percpu && numa_node != NUMA_NO_NODE)) | 61 | (percpu && numa_node != NUMA_NO_NODE)) |
| 66 | return ERR_PTR(-EINVAL); | 62 | return -EINVAL; |
| 67 | 63 | ||
| 68 | if (attr->value_size > KMALLOC_MAX_SIZE) | 64 | if (attr->value_size > KMALLOC_MAX_SIZE) |
| 69 | /* if value_size is bigger, the user space won't be able to | 65 | /* if value_size is bigger, the user space won't be able to |
| 70 | * access the elements. | 66 | * access the elements. |
| 71 | */ | 67 | */ |
| 72 | return ERR_PTR(-E2BIG); | 68 | return -E2BIG; |
| 69 | |||
| 70 | return 0; | ||
| 71 | } | ||
| 72 | |||
| 73 | static struct bpf_map *array_map_alloc(union bpf_attr *attr) | ||
| 74 | { | ||
| 75 | bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; | ||
| 76 | int numa_node = bpf_map_attr_numa_node(attr); | ||
| 77 | u32 elem_size, index_mask, max_entries; | ||
| 78 | bool unpriv = !capable(CAP_SYS_ADMIN); | ||
| 79 | struct bpf_array *array; | ||
| 80 | u64 array_size, mask64; | ||
| 73 | 81 | ||
| 74 | elem_size = round_up(attr->value_size, 8); | 82 | elem_size = round_up(attr->value_size, 8); |
| 75 | 83 | ||
| @@ -112,12 +120,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) | |||
| 112 | array->map.unpriv_array = unpriv; | 120 | array->map.unpriv_array = unpriv; |
| 113 | 121 | ||
| 114 | /* copy mandatory map attributes */ | 122 | /* copy mandatory map attributes */ |
| 115 | array->map.map_type = attr->map_type; | 123 | bpf_map_init_from_attr(&array->map, attr); |
| 116 | array->map.key_size = attr->key_size; | ||
| 117 | array->map.value_size = attr->value_size; | ||
| 118 | array->map.max_entries = attr->max_entries; | ||
| 119 | array->map.map_flags = attr->map_flags; | ||
| 120 | array->map.numa_node = numa_node; | ||
| 121 | array->elem_size = elem_size; | 124 | array->elem_size = elem_size; |
| 122 | 125 | ||
| 123 | if (!percpu) | 126 | if (!percpu) |
| @@ -327,6 +330,7 @@ static void array_map_free(struct bpf_map *map) | |||
| 327 | } | 330 | } |
| 328 | 331 | ||
| 329 | const struct bpf_map_ops array_map_ops = { | 332 | const struct bpf_map_ops array_map_ops = { |
| 333 | .map_alloc_check = array_map_alloc_check, | ||
| 330 | .map_alloc = array_map_alloc, | 334 | .map_alloc = array_map_alloc, |
| 331 | .map_free = array_map_free, | 335 | .map_free = array_map_free, |
| 332 | .map_get_next_key = array_map_get_next_key, | 336 | .map_get_next_key = array_map_get_next_key, |
| @@ -337,6 +341,7 @@ const struct bpf_map_ops array_map_ops = { | |||
| 337 | }; | 341 | }; |
| 338 | 342 | ||
| 339 | const struct bpf_map_ops percpu_array_map_ops = { | 343 | const struct bpf_map_ops percpu_array_map_ops = { |
| 344 | .map_alloc_check = array_map_alloc_check, | ||
| 340 | .map_alloc = array_map_alloc, | 345 | .map_alloc = array_map_alloc, |
| 341 | .map_free = array_map_free, | 346 | .map_free = array_map_free, |
| 342 | .map_get_next_key = array_map_get_next_key, | 347 | .map_get_next_key = array_map_get_next_key, |
| @@ -345,12 +350,12 @@ const struct bpf_map_ops percpu_array_map_ops = { | |||
| 345 | .map_delete_elem = array_map_delete_elem, | 350 | .map_delete_elem = array_map_delete_elem, |
| 346 | }; | 351 | }; |
| 347 | 352 | ||
| 348 | static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr) | 353 | static int fd_array_map_alloc_check(union bpf_attr *attr) |
| 349 | { | 354 | { |
| 350 | /* only file descriptors can be stored in this type of map */ | 355 | /* only file descriptors can be stored in this type of map */ |
| 351 | if (attr->value_size != sizeof(u32)) | 356 | if (attr->value_size != sizeof(u32)) |
| 352 | return ERR_PTR(-EINVAL); | 357 | return -EINVAL; |
| 353 | return array_map_alloc(attr); | 358 | return array_map_alloc_check(attr); |
| 354 | } | 359 | } |
| 355 | 360 | ||
| 356 | static void fd_array_map_free(struct bpf_map *map) | 361 | static void fd_array_map_free(struct bpf_map *map) |
| @@ -474,7 +479,8 @@ void bpf_fd_array_map_clear(struct bpf_map *map) | |||
| 474 | } | 479 | } |
| 475 | 480 | ||
| 476 | const struct bpf_map_ops prog_array_map_ops = { | 481 | const struct bpf_map_ops prog_array_map_ops = { |
| 477 | .map_alloc = fd_array_map_alloc, | 482 | .map_alloc_check = fd_array_map_alloc_check, |
| 483 | .map_alloc = array_map_alloc, | ||
| 478 | .map_free = fd_array_map_free, | 484 | .map_free = fd_array_map_free, |
| 479 | .map_get_next_key = array_map_get_next_key, | 485 | .map_get_next_key = array_map_get_next_key, |
| 480 | .map_lookup_elem = fd_array_map_lookup_elem, | 486 | .map_lookup_elem = fd_array_map_lookup_elem, |
| @@ -561,7 +567,8 @@ static void perf_event_fd_array_release(struct bpf_map *map, | |||
| 561 | } | 567 | } |
| 562 | 568 | ||
| 563 | const struct bpf_map_ops perf_event_array_map_ops = { | 569 | const struct bpf_map_ops perf_event_array_map_ops = { |
| 564 | .map_alloc = fd_array_map_alloc, | 570 | .map_alloc_check = fd_array_map_alloc_check, |
| 571 | .map_alloc = array_map_alloc, | ||
| 565 | .map_free = fd_array_map_free, | 572 | .map_free = fd_array_map_free, |
| 566 | .map_get_next_key = array_map_get_next_key, | 573 | .map_get_next_key = array_map_get_next_key, |
| 567 | .map_lookup_elem = fd_array_map_lookup_elem, | 574 | .map_lookup_elem = fd_array_map_lookup_elem, |
| @@ -592,7 +599,8 @@ static void cgroup_fd_array_free(struct bpf_map *map) | |||
| 592 | } | 599 | } |
| 593 | 600 | ||
| 594 | const struct bpf_map_ops cgroup_array_map_ops = { | 601 | const struct bpf_map_ops cgroup_array_map_ops = { |
| 595 | .map_alloc = fd_array_map_alloc, | 602 | .map_alloc_check = fd_array_map_alloc_check, |
| 603 | .map_alloc = array_map_alloc, | ||
| 596 | .map_free = cgroup_fd_array_free, | 604 | .map_free = cgroup_fd_array_free, |
| 597 | .map_get_next_key = array_map_get_next_key, | 605 | .map_get_next_key = array_map_get_next_key, |
| 598 | .map_lookup_elem = fd_array_map_lookup_elem, | 606 | .map_lookup_elem = fd_array_map_lookup_elem, |
| @@ -610,7 +618,7 @@ static struct bpf_map *array_of_map_alloc(union bpf_attr *attr) | |||
| 610 | if (IS_ERR(inner_map_meta)) | 618 | if (IS_ERR(inner_map_meta)) |
| 611 | return inner_map_meta; | 619 | return inner_map_meta; |
| 612 | 620 | ||
| 613 | map = fd_array_map_alloc(attr); | 621 | map = array_map_alloc(attr); |
| 614 | if (IS_ERR(map)) { | 622 | if (IS_ERR(map)) { |
| 615 | bpf_map_meta_free(inner_map_meta); | 623 | bpf_map_meta_free(inner_map_meta); |
| 616 | return map; | 624 | return map; |
| @@ -673,6 +681,7 @@ static u32 array_of_map_gen_lookup(struct bpf_map *map, | |||
| 673 | } | 681 | } |
| 674 | 682 | ||
| 675 | const struct bpf_map_ops array_of_maps_map_ops = { | 683 | const struct bpf_map_ops array_of_maps_map_ops = { |
| 684 | .map_alloc_check = fd_array_map_alloc_check, | ||
| 676 | .map_alloc = array_of_map_alloc, | 685 | .map_alloc = array_of_map_alloc, |
| 677 | .map_free = array_of_map_free, | 686 | .map_free = array_of_map_free, |
| 678 | .map_get_next_key = array_map_get_next_key, | 687 | .map_get_next_key = array_map_get_next_key, |
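Note on the arraymap changes above: attribute validation moves out of array_map_alloc() into a separate array_map_alloc_check(), wired up through the new .map_alloc_check callback in bpf_map_ops, and the fd-array map types now share array_map_alloc() while overriding only the check. The snippet below is a minimal user-space sketch of that check-then-allocate split; the types and the map_create() wrapper are illustrative stand-ins, not the kernel's syscall path.

```c
/* Sketch of the check/alloc split introduced above: validation runs in a
 * side-effect-free ->map_alloc_check() before ->map_alloc() commits memory.
 * Types and names are illustrative, not the kernel's.
 */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct map_attr { unsigned int key_size, value_size, max_entries; };
struct map      { struct map_attr attr; };

struct map_ops {
	int         (*map_alloc_check)(const struct map_attr *attr);
	struct map *(*map_alloc)(const struct map_attr *attr);
};

static int array_alloc_check(const struct map_attr *attr)
{
	/* mirrors the spirit of array_map_alloc_check(): reject bad
	 * attributes before anything is allocated */
	if (!attr->max_entries || attr->key_size != 4 || !attr->value_size)
		return -EINVAL;
	return 0;
}

static struct map *array_alloc(const struct map_attr *attr)
{
	struct map *m = calloc(1, sizeof(*m));

	if (m)
		m->attr = *attr;
	return m;
}

static const struct map_ops array_ops = {
	.map_alloc_check = array_alloc_check,
	.map_alloc       = array_alloc,
};

/* generic creation path: presumably what the syscall layer does with the
 * new callback, i.e. check first, allocate only if the check passed */
static struct map *map_create(const struct map_ops *ops,
			      const struct map_attr *attr)
{
	int err = ops->map_alloc_check ? ops->map_alloc_check(attr) : 0;

	if (err) {
		errno = -err;
		return NULL;
	}
	return ops->map_alloc(attr);
}

int main(void)
{
	struct map_attr bad  = { .key_size = 8, .value_size = 4, .max_entries = 1 };
	struct map_attr good = { .key_size = 4, .value_size = 4, .max_entries = 1 };

	printf("bad:  %p\n", (void *)map_create(&array_ops, &bad));
	printf("good: %p\n", (void *)map_create(&array_ops, &good));
	return 0;
}
```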
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index b789ab78d28f..c1c0b60d3f2f 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
| @@ -568,6 +568,8 @@ static bool cgroup_dev_is_valid_access(int off, int size, | |||
| 568 | enum bpf_access_type type, | 568 | enum bpf_access_type type, |
| 569 | struct bpf_insn_access_aux *info) | 569 | struct bpf_insn_access_aux *info) |
| 570 | { | 570 | { |
| 571 | const int size_default = sizeof(__u32); | ||
| 572 | |||
| 571 | if (type == BPF_WRITE) | 573 | if (type == BPF_WRITE) |
| 572 | return false; | 574 | return false; |
| 573 | 575 | ||
| @@ -576,8 +578,17 @@ static bool cgroup_dev_is_valid_access(int off, int size, | |||
| 576 | /* The verifier guarantees that size > 0. */ | 578 | /* The verifier guarantees that size > 0. */ |
| 577 | if (off % size != 0) | 579 | if (off % size != 0) |
| 578 | return false; | 580 | return false; |
| 579 | if (size != sizeof(__u32)) | 581 | |
| 580 | return false; | 582 | switch (off) { |
| 583 | case bpf_ctx_range(struct bpf_cgroup_dev_ctx, access_type): | ||
| 584 | bpf_ctx_record_field_size(info, size_default); | ||
| 585 | if (!bpf_ctx_narrow_access_ok(off, size, size_default)) | ||
| 586 | return false; | ||
| 587 | break; | ||
| 588 | default: | ||
| 589 | if (size != size_default) | ||
| 590 | return false; | ||
| 591 | } | ||
| 581 | 592 | ||
| 582 | return true; | 593 | return true; |
| 583 | } | 594 | } |
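The hunk above relaxes cgroup_dev_is_valid_access() from requiring full 4-byte context loads to also accepting narrow loads of the access_type field, delegating the size/offset policy to bpf_ctx_record_field_size() and bpf_ctx_narrow_access_ok(). The following is a rough, self-contained model of what such a narrow-access check amounts to; it is an approximation for illustration, not the kernel helpers themselves.

```c
/* Rough model of a "narrow access" check: a load of 1, 2 or 4 bytes is fine
 * as long as it is no wider than the 4-byte field, a power of two, and
 * naturally aligned.  Approximation only; the real helpers in the kernel
 * headers are the authority.
 */
#include <stdbool.h>
#include <stdio.h>

static bool narrow_access_ok(unsigned int off, unsigned int size,
			     unsigned int size_default)
{
	return size <= size_default &&		/* not wider than the field */
	       size && !(size & (size - 1)) &&	/* power-of-two access size */
	       !(off % size);			/* naturally aligned offset */
}

int main(void)
{
	const unsigned int size_default = 4;	/* sizeof(__u32) */

	printf("%d %d %d\n",
	       narrow_access_ok(0, 1, size_default),   /* 1-byte read: ok */
	       narrow_access_ok(2, 2, size_default),   /* high half: ok   */
	       narrow_access_ok(1, 2, size_default));  /* misaligned: no  */
	return 0;
}
```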
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 7949e8b8f94e..5f35f93dcab2 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
| @@ -94,6 +94,7 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) | |||
| 94 | fp->pages = size / PAGE_SIZE; | 94 | fp->pages = size / PAGE_SIZE; |
| 95 | fp->aux = aux; | 95 | fp->aux = aux; |
| 96 | fp->aux->prog = fp; | 96 | fp->aux->prog = fp; |
| 97 | fp->jit_requested = ebpf_jit_enabled(); | ||
| 97 | 98 | ||
| 98 | INIT_LIST_HEAD_RCU(&fp->aux->ksym_lnode); | 99 | INIT_LIST_HEAD_RCU(&fp->aux->ksym_lnode); |
| 99 | 100 | ||
| @@ -217,30 +218,40 @@ int bpf_prog_calc_tag(struct bpf_prog *fp) | |||
| 217 | return 0; | 218 | return 0; |
| 218 | } | 219 | } |
| 219 | 220 | ||
| 220 | static bool bpf_is_jmp_and_has_target(const struct bpf_insn *insn) | ||
| 221 | { | ||
| 222 | return BPF_CLASS(insn->code) == BPF_JMP && | ||
| 223 | /* Call and Exit are both special jumps with no | ||
| 224 | * target inside the BPF instruction image. | ||
| 225 | */ | ||
| 226 | BPF_OP(insn->code) != BPF_CALL && | ||
| 227 | BPF_OP(insn->code) != BPF_EXIT; | ||
| 228 | } | ||
| 229 | |||
| 230 | static void bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta) | 221 | static void bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta) |
| 231 | { | 222 | { |
| 232 | struct bpf_insn *insn = prog->insnsi; | 223 | struct bpf_insn *insn = prog->insnsi; |
| 233 | u32 i, insn_cnt = prog->len; | 224 | u32 i, insn_cnt = prog->len; |
| 225 | bool pseudo_call; | ||
| 226 | u8 code; | ||
| 227 | int off; | ||
| 234 | 228 | ||
| 235 | for (i = 0; i < insn_cnt; i++, insn++) { | 229 | for (i = 0; i < insn_cnt; i++, insn++) { |
| 236 | if (!bpf_is_jmp_and_has_target(insn)) | 230 | code = insn->code; |
| 231 | if (BPF_CLASS(code) != BPF_JMP) | ||
| 237 | continue; | 232 | continue; |
| 233 | if (BPF_OP(code) == BPF_EXIT) | ||
| 234 | continue; | ||
| 235 | if (BPF_OP(code) == BPF_CALL) { | ||
| 236 | if (insn->src_reg == BPF_PSEUDO_CALL) | ||
| 237 | pseudo_call = true; | ||
| 238 | else | ||
| 239 | continue; | ||
| 240 | } else { | ||
| 241 | pseudo_call = false; | ||
| 242 | } | ||
| 243 | off = pseudo_call ? insn->imm : insn->off; | ||
| 238 | 244 | ||
| 239 | /* Adjust offset of jmps if we cross boundaries. */ | 245 | /* Adjust offset of jmps if we cross boundaries. */ |
| 240 | if (i < pos && i + insn->off + 1 > pos) | 246 | if (i < pos && i + off + 1 > pos) |
| 241 | insn->off += delta; | 247 | off += delta; |
| 242 | else if (i > pos + delta && i + insn->off + 1 <= pos + delta) | 248 | else if (i > pos + delta && i + off + 1 <= pos + delta) |
| 243 | insn->off -= delta; | 249 | off -= delta; |
| 250 | |||
| 251 | if (pseudo_call) | ||
| 252 | insn->imm = off; | ||
| 253 | else | ||
| 254 | insn->off = off; | ||
| 244 | } | 255 | } |
| 245 | } | 256 | } |
| 246 | 257 | ||
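bpf_adj_branches() previously skipped every BPF_CALL, since helper calls carry no target inside the instruction image; with bpf-to-bpf calls (src_reg == BPF_PSEUDO_CALL) the relative target lives in insn->imm rather than insn->off, so the loop above now adjusts whichever field applies. A compilable sketch of that loop, with the instruction layout and constants defined locally (they mirror the UAPI encoding, but this is illustration, not the kernel source):

```c
/* Sketch of the branch-adjustment loop: ordinary jumps keep their target in
 * insn->off, pseudo calls keep it in insn->imm.  Only the offset math is
 * shown; the surrounding instruction-array resize is omitted.
 */
#include <stdbool.h>
#include <stdint.h>

struct insn {
	uint8_t  code;
	uint8_t  dst_reg:4, src_reg:4;
	int16_t  off;
	int32_t  imm;
};

#define CLASS(code)	((code) & 0x07)
#define OP(code)	((code) & 0xf0)
#define CLS_JMP		0x05
#define OP_CALL		0x80
#define OP_EXIT		0x90
#define PSEUDO_CALL	1	/* src_reg value marking a bpf-to-bpf call */

static void adj_branches(struct insn *insn, uint32_t len, uint32_t pos,
			 uint32_t delta)
{
	for (uint32_t i = 0; i < len; i++, insn++) {
		bool pseudo_call;
		int32_t off;

		if (CLASS(insn->code) != CLS_JMP || OP(insn->code) == OP_EXIT)
			continue;
		if (OP(insn->code) == OP_CALL) {
			if (insn->src_reg != PSEUDO_CALL)
				continue;	/* helper call: nothing to fix */
			pseudo_call = true;
		} else {
			pseudo_call = false;
		}
		off = pseudo_call ? insn->imm : insn->off;

		/* jump crosses the patched region: stretch or shrink it */
		if (i < pos && i + off + 1 > pos)
			off += delta;
		else if (i > pos + delta && i + off + 1 <= pos + delta)
			off -= delta;

		if (pseudo_call)
			insn->imm = off;
		else
			insn->off = off;
	}
}

int main(void)
{
	struct insn prog[3] = {
		{ .code = 0x05 /* JMP|JA */, .off = 1 },	/* jump over insn 1 */
		{ 0 },
		{ .code = 0x95 /* JMP|EXIT */ },
	};

	/* pretend one instruction was inserted at index 1 */
	adj_branches(prog, 3, 1, 1);
	return prog[0].off == 2 ? 0 : 1;
}
```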
| @@ -289,6 +300,11 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, | |||
| 289 | } | 300 | } |
| 290 | 301 | ||
| 291 | #ifdef CONFIG_BPF_JIT | 302 | #ifdef CONFIG_BPF_JIT |
| 303 | /* All BPF JIT sysctl knobs here. */ | ||
| 304 | int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON); | ||
| 305 | int bpf_jit_harden __read_mostly; | ||
| 306 | int bpf_jit_kallsyms __read_mostly; | ||
| 307 | |||
| 292 | static __always_inline void | 308 | static __always_inline void |
| 293 | bpf_get_prog_addr_region(const struct bpf_prog *prog, | 309 | bpf_get_prog_addr_region(const struct bpf_prog *prog, |
| 294 | unsigned long *symbol_start, | 310 | unsigned long *symbol_start, |
| @@ -370,8 +386,6 @@ static DEFINE_SPINLOCK(bpf_lock); | |||
| 370 | static LIST_HEAD(bpf_kallsyms); | 386 | static LIST_HEAD(bpf_kallsyms); |
| 371 | static struct latch_tree_root bpf_tree __cacheline_aligned; | 387 | static struct latch_tree_root bpf_tree __cacheline_aligned; |
| 372 | 388 | ||
| 373 | int bpf_jit_kallsyms __read_mostly; | ||
| 374 | |||
| 375 | static void bpf_prog_ksym_node_add(struct bpf_prog_aux *aux) | 389 | static void bpf_prog_ksym_node_add(struct bpf_prog_aux *aux) |
| 376 | { | 390 | { |
| 377 | WARN_ON_ONCE(!list_empty(&aux->ksym_lnode)); | 391 | WARN_ON_ONCE(!list_empty(&aux->ksym_lnode)); |
| @@ -552,8 +566,6 @@ void __weak bpf_jit_free(struct bpf_prog *fp) | |||
| 552 | bpf_prog_unlock_free(fp); | 566 | bpf_prog_unlock_free(fp); |
| 553 | } | 567 | } |
| 554 | 568 | ||
| 555 | int bpf_jit_harden __read_mostly; | ||
| 556 | |||
| 557 | static int bpf_jit_blind_insn(const struct bpf_insn *from, | 569 | static int bpf_jit_blind_insn(const struct bpf_insn *from, |
| 558 | const struct bpf_insn *aux, | 570 | const struct bpf_insn *aux, |
| 559 | struct bpf_insn *to_buff) | 571 | struct bpf_insn *to_buff) |
| @@ -711,7 +723,7 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) | |||
| 711 | struct bpf_insn *insn; | 723 | struct bpf_insn *insn; |
| 712 | int i, rewritten; | 724 | int i, rewritten; |
| 713 | 725 | ||
| 714 | if (!bpf_jit_blinding_enabled()) | 726 | if (!bpf_jit_blinding_enabled(prog) || prog->blinded) |
| 715 | return prog; | 727 | return prog; |
| 716 | 728 | ||
| 717 | clone = bpf_prog_clone_create(prog, GFP_USER); | 729 | clone = bpf_prog_clone_create(prog, GFP_USER); |
| @@ -753,13 +765,16 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) | |||
| 753 | i += insn_delta; | 765 | i += insn_delta; |
| 754 | } | 766 | } |
| 755 | 767 | ||
| 768 | clone->blinded = 1; | ||
| 756 | return clone; | 769 | return clone; |
| 757 | } | 770 | } |
| 758 | #endif /* CONFIG_BPF_JIT */ | 771 | #endif /* CONFIG_BPF_JIT */ |
| 759 | 772 | ||
| 760 | /* Base function for offset calculation. Needs to go into .text section, | 773 | /* Base function for offset calculation. Needs to go into .text section, |
| 761 | * therefore keeping it non-static as well; will also be used by JITs | 774 | * therefore keeping it non-static as well; will also be used by JITs |
| 762 | * anyway later on, so do not let the compiler omit it. | 775 | * anyway later on, so do not let the compiler omit it. This also needs |
| 776 | * to go into kallsyms for correlation from e.g. bpftool, so naming | ||
| 777 | * must not change. | ||
| 763 | */ | 778 | */ |
| 764 | noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | 779 | noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) |
| 765 | { | 780 | { |
| @@ -767,6 +782,137 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | |||
| 767 | } | 782 | } |
| 768 | EXPORT_SYMBOL_GPL(__bpf_call_base); | 783 | EXPORT_SYMBOL_GPL(__bpf_call_base); |
| 769 | 784 | ||
| 785 | /* All UAPI available opcodes. */ | ||
| 786 | #define BPF_INSN_MAP(INSN_2, INSN_3) \ | ||
| 787 | /* 32 bit ALU operations. */ \ | ||
| 788 | /* Register based. */ \ | ||
| 789 | INSN_3(ALU, ADD, X), \ | ||
| 790 | INSN_3(ALU, SUB, X), \ | ||
| 791 | INSN_3(ALU, AND, X), \ | ||
| 792 | INSN_3(ALU, OR, X), \ | ||
| 793 | INSN_3(ALU, LSH, X), \ | ||
| 794 | INSN_3(ALU, RSH, X), \ | ||
| 795 | INSN_3(ALU, XOR, X), \ | ||
| 796 | INSN_3(ALU, MUL, X), \ | ||
| 797 | INSN_3(ALU, MOV, X), \ | ||
| 798 | INSN_3(ALU, DIV, X), \ | ||
| 799 | INSN_3(ALU, MOD, X), \ | ||
| 800 | INSN_2(ALU, NEG), \ | ||
| 801 | INSN_3(ALU, END, TO_BE), \ | ||
| 802 | INSN_3(ALU, END, TO_LE), \ | ||
| 803 | /* Immediate based. */ \ | ||
| 804 | INSN_3(ALU, ADD, K), \ | ||
| 805 | INSN_3(ALU, SUB, K), \ | ||
| 806 | INSN_3(ALU, AND, K), \ | ||
| 807 | INSN_3(ALU, OR, K), \ | ||
| 808 | INSN_3(ALU, LSH, K), \ | ||
| 809 | INSN_3(ALU, RSH, K), \ | ||
| 810 | INSN_3(ALU, XOR, K), \ | ||
| 811 | INSN_3(ALU, MUL, K), \ | ||
| 812 | INSN_3(ALU, MOV, K), \ | ||
| 813 | INSN_3(ALU, DIV, K), \ | ||
| 814 | INSN_3(ALU, MOD, K), \ | ||
| 815 | /* 64 bit ALU operations. */ \ | ||
| 816 | /* Register based. */ \ | ||
| 817 | INSN_3(ALU64, ADD, X), \ | ||
| 818 | INSN_3(ALU64, SUB, X), \ | ||
| 819 | INSN_3(ALU64, AND, X), \ | ||
| 820 | INSN_3(ALU64, OR, X), \ | ||
| 821 | INSN_3(ALU64, LSH, X), \ | ||
| 822 | INSN_3(ALU64, RSH, X), \ | ||
| 823 | INSN_3(ALU64, XOR, X), \ | ||
| 824 | INSN_3(ALU64, MUL, X), \ | ||
| 825 | INSN_3(ALU64, MOV, X), \ | ||
| 826 | INSN_3(ALU64, ARSH, X), \ | ||
| 827 | INSN_3(ALU64, DIV, X), \ | ||
| 828 | INSN_3(ALU64, MOD, X), \ | ||
| 829 | INSN_2(ALU64, NEG), \ | ||
| 830 | /* Immediate based. */ \ | ||
| 831 | INSN_3(ALU64, ADD, K), \ | ||
| 832 | INSN_3(ALU64, SUB, K), \ | ||
| 833 | INSN_3(ALU64, AND, K), \ | ||
| 834 | INSN_3(ALU64, OR, K), \ | ||
| 835 | INSN_3(ALU64, LSH, K), \ | ||
| 836 | INSN_3(ALU64, RSH, K), \ | ||
| 837 | INSN_3(ALU64, XOR, K), \ | ||
| 838 | INSN_3(ALU64, MUL, K), \ | ||
| 839 | INSN_3(ALU64, MOV, K), \ | ||
| 840 | INSN_3(ALU64, ARSH, K), \ | ||
| 841 | INSN_3(ALU64, DIV, K), \ | ||
| 842 | INSN_3(ALU64, MOD, K), \ | ||
| 843 | /* Call instruction. */ \ | ||
| 844 | INSN_2(JMP, CALL), \ | ||
| 845 | /* Exit instruction. */ \ | ||
| 846 | INSN_2(JMP, EXIT), \ | ||
| 847 | /* Jump instructions. */ \ | ||
| 848 | /* Register based. */ \ | ||
| 849 | INSN_3(JMP, JEQ, X), \ | ||
| 850 | INSN_3(JMP, JNE, X), \ | ||
| 851 | INSN_3(JMP, JGT, X), \ | ||
| 852 | INSN_3(JMP, JLT, X), \ | ||
| 853 | INSN_3(JMP, JGE, X), \ | ||
| 854 | INSN_3(JMP, JLE, X), \ | ||
| 855 | INSN_3(JMP, JSGT, X), \ | ||
| 856 | INSN_3(JMP, JSLT, X), \ | ||
| 857 | INSN_3(JMP, JSGE, X), \ | ||
| 858 | INSN_3(JMP, JSLE, X), \ | ||
| 859 | INSN_3(JMP, JSET, X), \ | ||
| 860 | /* Immediate based. */ \ | ||
| 861 | INSN_3(JMP, JEQ, K), \ | ||
| 862 | INSN_3(JMP, JNE, K), \ | ||
| 863 | INSN_3(JMP, JGT, K), \ | ||
| 864 | INSN_3(JMP, JLT, K), \ | ||
| 865 | INSN_3(JMP, JGE, K), \ | ||
| 866 | INSN_3(JMP, JLE, K), \ | ||
| 867 | INSN_3(JMP, JSGT, K), \ | ||
| 868 | INSN_3(JMP, JSLT, K), \ | ||
| 869 | INSN_3(JMP, JSGE, K), \ | ||
| 870 | INSN_3(JMP, JSLE, K), \ | ||
| 871 | INSN_3(JMP, JSET, K), \ | ||
| 872 | INSN_2(JMP, JA), \ | ||
| 873 | /* Store instructions. */ \ | ||
| 874 | /* Register based. */ \ | ||
| 875 | INSN_3(STX, MEM, B), \ | ||
| 876 | INSN_3(STX, MEM, H), \ | ||
| 877 | INSN_3(STX, MEM, W), \ | ||
| 878 | INSN_3(STX, MEM, DW), \ | ||
| 879 | INSN_3(STX, XADD, W), \ | ||
| 880 | INSN_3(STX, XADD, DW), \ | ||
| 881 | /* Immediate based. */ \ | ||
| 882 | INSN_3(ST, MEM, B), \ | ||
| 883 | INSN_3(ST, MEM, H), \ | ||
| 884 | INSN_3(ST, MEM, W), \ | ||
| 885 | INSN_3(ST, MEM, DW), \ | ||
| 886 | /* Load instructions. */ \ | ||
| 887 | /* Register based. */ \ | ||
| 888 | INSN_3(LDX, MEM, B), \ | ||
| 889 | INSN_3(LDX, MEM, H), \ | ||
| 890 | INSN_3(LDX, MEM, W), \ | ||
| 891 | INSN_3(LDX, MEM, DW), \ | ||
| 892 | /* Immediate based. */ \ | ||
| 893 | INSN_3(LD, IMM, DW), \ | ||
| 894 | /* Misc (old cBPF carry-over). */ \ | ||
| 895 | INSN_3(LD, ABS, B), \ | ||
| 896 | INSN_3(LD, ABS, H), \ | ||
| 897 | INSN_3(LD, ABS, W), \ | ||
| 898 | INSN_3(LD, IND, B), \ | ||
| 899 | INSN_3(LD, IND, H), \ | ||
| 900 | INSN_3(LD, IND, W) | ||
| 901 | |||
| 902 | bool bpf_opcode_in_insntable(u8 code) | ||
| 903 | { | ||
| 904 | #define BPF_INSN_2_TBL(x, y) [BPF_##x | BPF_##y] = true | ||
| 905 | #define BPF_INSN_3_TBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = true | ||
| 906 | static const bool public_insntable[256] = { | ||
| 907 | [0 ... 255] = false, | ||
| 908 | /* Now overwrite non-defaults ... */ | ||
| 909 | BPF_INSN_MAP(BPF_INSN_2_TBL, BPF_INSN_3_TBL), | ||
| 910 | }; | ||
| 911 | #undef BPF_INSN_3_TBL | ||
| 912 | #undef BPF_INSN_2_TBL | ||
| 913 | return public_insntable[code]; | ||
| 914 | } | ||
| 915 | |||
| 770 | #ifndef CONFIG_BPF_JIT_ALWAYS_ON | 916 | #ifndef CONFIG_BPF_JIT_ALWAYS_ON |
| 771 | /** | 917 | /** |
| 772 | * __bpf_prog_run - run eBPF program on a given context | 918 | * __bpf_prog_run - run eBPF program on a given context |
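BPF_INSN_MAP above is an X-macro listing of every UAPI opcode; bpf_opcode_in_insntable() expands it once into a 256-entry lookup table, and the next hunk expands the same list into the interpreter's jump table, so the two can no longer drift apart. The stand-alone demo below shows the expansion trick on a tiny made-up opcode list; the names and values are invented for the demo, not the real BPF encoding.

```c
/* Demonstration of the X-macro technique used by BPF_INSN_MAP: one central
 * list of opcodes is expanded into a validity table, and the same list could
 * equally be expanded into a label/jump table.
 */
#include <stdbool.h>
#include <stdio.h>

#define DEMO_INSN_MAP(INSN)	\
	INSN(ADD, 0x04),	\
	INSN(SUB, 0x14),	\
	INSN(JMP, 0x05),	\
	INSN(EXIT, 0x95)

static bool opcode_in_table(unsigned char code)
{
#define INSN_TBL(name, val) [val] = true
	static const bool table[256] = {
		[0 ... 255] = false,	/* GCC/Clang range initializer */
		DEMO_INSN_MAP(INSN_TBL),
	};
#undef INSN_TBL
	return table[code];
}

int main(void)
{
	printf("0x04 known: %d\n", opcode_in_table(0x04));
	printf("0xff known: %d\n", opcode_in_table(0xff));
	return 0;
}
```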
| @@ -775,118 +921,21 @@ EXPORT_SYMBOL_GPL(__bpf_call_base); | |||
| 775 | * | 921 | * |
| 776 | * Decode and execute eBPF instructions. | 922 | * Decode and execute eBPF instructions. |
| 777 | */ | 923 | */ |
| 778 | static unsigned int ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, | 924 | static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) |
| 779 | u64 *stack) | ||
| 780 | { | 925 | { |
| 781 | u64 tmp; | 926 | u64 tmp; |
| 927 | #define BPF_INSN_2_LBL(x, y) [BPF_##x | BPF_##y] = &&x##_##y | ||
| 928 | #define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z | ||
| 782 | static const void *jumptable[256] = { | 929 | static const void *jumptable[256] = { |
| 783 | [0 ... 255] = &&default_label, | 930 | [0 ... 255] = &&default_label, |
| 784 | /* Now overwrite non-defaults ... */ | 931 | /* Now overwrite non-defaults ... */ |
| 785 | /* 32 bit ALU operations */ | 932 | BPF_INSN_MAP(BPF_INSN_2_LBL, BPF_INSN_3_LBL), |
| 786 | [BPF_ALU | BPF_ADD | BPF_X] = &&ALU_ADD_X, | 933 | /* Non-UAPI available opcodes. */ |
| 787 | [BPF_ALU | BPF_ADD | BPF_K] = &&ALU_ADD_K, | 934 | [BPF_JMP | BPF_CALL_ARGS] = &&JMP_CALL_ARGS, |
| 788 | [BPF_ALU | BPF_SUB | BPF_X] = &&ALU_SUB_X, | ||
| 789 | [BPF_ALU | BPF_SUB | BPF_K] = &&ALU_SUB_K, | ||
| 790 | [BPF_ALU | BPF_AND | BPF_X] = &&ALU_AND_X, | ||
| 791 | [BPF_ALU | BPF_AND | BPF_K] = &&ALU_AND_K, | ||
| 792 | [BPF_ALU | BPF_OR | BPF_X] = &&ALU_OR_X, | ||
| 793 | [BPF_ALU | BPF_OR | BPF_K] = &&ALU_OR_K, | ||
| 794 | [BPF_ALU | BPF_LSH | BPF_X] = &&ALU_LSH_X, | ||
| 795 | [BPF_ALU | BPF_LSH | BPF_K] = &&ALU_LSH_K, | ||
| 796 | [BPF_ALU | BPF_RSH | BPF_X] = &&ALU_RSH_X, | ||
| 797 | [BPF_ALU | BPF_RSH | BPF_K] = &&ALU_RSH_K, | ||
| 798 | [BPF_ALU | BPF_XOR | BPF_X] = &&ALU_XOR_X, | ||
| 799 | [BPF_ALU | BPF_XOR | BPF_K] = &&ALU_XOR_K, | ||
| 800 | [BPF_ALU | BPF_MUL | BPF_X] = &&ALU_MUL_X, | ||
| 801 | [BPF_ALU | BPF_MUL | BPF_K] = &&ALU_MUL_K, | ||
| 802 | [BPF_ALU | BPF_MOV | BPF_X] = &&ALU_MOV_X, | ||
| 803 | [BPF_ALU | BPF_MOV | BPF_K] = &&ALU_MOV_K, | ||
| 804 | [BPF_ALU | BPF_DIV | BPF_X] = &&ALU_DIV_X, | ||
| 805 | [BPF_ALU | BPF_DIV | BPF_K] = &&ALU_DIV_K, | ||
| 806 | [BPF_ALU | BPF_MOD | BPF_X] = &&ALU_MOD_X, | ||
| 807 | [BPF_ALU | BPF_MOD | BPF_K] = &&ALU_MOD_K, | ||
| 808 | [BPF_ALU | BPF_NEG] = &&ALU_NEG, | ||
| 809 | [BPF_ALU | BPF_END | BPF_TO_BE] = &&ALU_END_TO_BE, | ||
| 810 | [BPF_ALU | BPF_END | BPF_TO_LE] = &&ALU_END_TO_LE, | ||
| 811 | /* 64 bit ALU operations */ | ||
| 812 | [BPF_ALU64 | BPF_ADD | BPF_X] = &&ALU64_ADD_X, | ||
| 813 | [BPF_ALU64 | BPF_ADD | BPF_K] = &&ALU64_ADD_K, | ||
| 814 | [BPF_ALU64 | BPF_SUB | BPF_X] = &&ALU64_SUB_X, | ||
| 815 | [BPF_ALU64 | BPF_SUB | BPF_K] = &&ALU64_SUB_K, | ||
| 816 | [BPF_ALU64 | BPF_AND | BPF_X] = &&ALU64_AND_X, | ||
| 817 | [BPF_ALU64 | BPF_AND | BPF_K] = &&ALU64_AND_K, | ||
| 818 | [BPF_ALU64 | BPF_OR | BPF_X] = &&ALU64_OR_X, | ||
| 819 | [BPF_ALU64 | BPF_OR | BPF_K] = &&ALU64_OR_K, | ||
| 820 | [BPF_ALU64 | BPF_LSH | BPF_X] = &&ALU64_LSH_X, | ||
| 821 | [BPF_ALU64 | BPF_LSH | BPF_K] = &&ALU64_LSH_K, | ||
| 822 | [BPF_ALU64 | BPF_RSH | BPF_X] = &&ALU64_RSH_X, | ||
| 823 | [BPF_ALU64 | BPF_RSH | BPF_K] = &&ALU64_RSH_K, | ||
| 824 | [BPF_ALU64 | BPF_XOR | BPF_X] = &&ALU64_XOR_X, | ||
| 825 | [BPF_ALU64 | BPF_XOR | BPF_K] = &&ALU64_XOR_K, | ||
| 826 | [BPF_ALU64 | BPF_MUL | BPF_X] = &&ALU64_MUL_X, | ||
| 827 | [BPF_ALU64 | BPF_MUL | BPF_K] = &&ALU64_MUL_K, | ||
| 828 | [BPF_ALU64 | BPF_MOV | BPF_X] = &&ALU64_MOV_X, | ||
| 829 | [BPF_ALU64 | BPF_MOV | BPF_K] = &&ALU64_MOV_K, | ||
| 830 | [BPF_ALU64 | BPF_ARSH | BPF_X] = &&ALU64_ARSH_X, | ||
| 831 | [BPF_ALU64 | BPF_ARSH | BPF_K] = &&ALU64_ARSH_K, | ||
| 832 | [BPF_ALU64 | BPF_DIV | BPF_X] = &&ALU64_DIV_X, | ||
| 833 | [BPF_ALU64 | BPF_DIV | BPF_K] = &&ALU64_DIV_K, | ||
| 834 | [BPF_ALU64 | BPF_MOD | BPF_X] = &&ALU64_MOD_X, | ||
| 835 | [BPF_ALU64 | BPF_MOD | BPF_K] = &&ALU64_MOD_K, | ||
| 836 | [BPF_ALU64 | BPF_NEG] = &&ALU64_NEG, | ||
| 837 | /* Call instruction */ | ||
| 838 | [BPF_JMP | BPF_CALL] = &&JMP_CALL, | ||
| 839 | [BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL, | 935 | [BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL, |
| 840 | /* Jumps */ | ||
| 841 | [BPF_JMP | BPF_JA] = &&JMP_JA, | ||
| 842 | [BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X, | ||
| 843 | [BPF_JMP | BPF_JEQ | BPF_K] = &&JMP_JEQ_K, | ||
| 844 | [BPF_JMP | BPF_JNE | BPF_X] = &&JMP_JNE_X, | ||
| 845 | [BPF_JMP | BPF_JNE | BPF_K] = &&JMP_JNE_K, | ||
| 846 | [BPF_JMP | BPF_JGT | BPF_X] = &&JMP_JGT_X, | ||
| 847 | [BPF_JMP | BPF_JGT | BPF_K] = &&JMP_JGT_K, | ||
| 848 | [BPF_JMP | BPF_JLT | BPF_X] = &&JMP_JLT_X, | ||
| 849 | [BPF_JMP | BPF_JLT | BPF_K] = &&JMP_JLT_K, | ||
| 850 | [BPF_JMP | BPF_JGE | BPF_X] = &&JMP_JGE_X, | ||
| 851 | [BPF_JMP | BPF_JGE | BPF_K] = &&JMP_JGE_K, | ||
| 852 | [BPF_JMP | BPF_JLE | BPF_X] = &&JMP_JLE_X, | ||
| 853 | [BPF_JMP | BPF_JLE | BPF_K] = &&JMP_JLE_K, | ||
| 854 | [BPF_JMP | BPF_JSGT | BPF_X] = &&JMP_JSGT_X, | ||
| 855 | [BPF_JMP | BPF_JSGT | BPF_K] = &&JMP_JSGT_K, | ||
| 856 | [BPF_JMP | BPF_JSLT | BPF_X] = &&JMP_JSLT_X, | ||
| 857 | [BPF_JMP | BPF_JSLT | BPF_K] = &&JMP_JSLT_K, | ||
| 858 | [BPF_JMP | BPF_JSGE | BPF_X] = &&JMP_JSGE_X, | ||
| 859 | [BPF_JMP | BPF_JSGE | BPF_K] = &&JMP_JSGE_K, | ||
| 860 | [BPF_JMP | BPF_JSLE | BPF_X] = &&JMP_JSLE_X, | ||
| 861 | [BPF_JMP | BPF_JSLE | BPF_K] = &&JMP_JSLE_K, | ||
| 862 | [BPF_JMP | BPF_JSET | BPF_X] = &&JMP_JSET_X, | ||
| 863 | [BPF_JMP | BPF_JSET | BPF_K] = &&JMP_JSET_K, | ||
| 864 | /* Program return */ | ||
| 865 | [BPF_JMP | BPF_EXIT] = &&JMP_EXIT, | ||
| 866 | /* Store instructions */ | ||
| 867 | [BPF_STX | BPF_MEM | BPF_B] = &&STX_MEM_B, | ||
| 868 | [BPF_STX | BPF_MEM | BPF_H] = &&STX_MEM_H, | ||
| 869 | [BPF_STX | BPF_MEM | BPF_W] = &&STX_MEM_W, | ||
| 870 | [BPF_STX | BPF_MEM | BPF_DW] = &&STX_MEM_DW, | ||
| 871 | [BPF_STX | BPF_XADD | BPF_W] = &&STX_XADD_W, | ||
| 872 | [BPF_STX | BPF_XADD | BPF_DW] = &&STX_XADD_DW, | ||
| 873 | [BPF_ST | BPF_MEM | BPF_B] = &&ST_MEM_B, | ||
| 874 | [BPF_ST | BPF_MEM | BPF_H] = &&ST_MEM_H, | ||
| 875 | [BPF_ST | BPF_MEM | BPF_W] = &&ST_MEM_W, | ||
| 876 | [BPF_ST | BPF_MEM | BPF_DW] = &&ST_MEM_DW, | ||
| 877 | /* Load instructions */ | ||
| 878 | [BPF_LDX | BPF_MEM | BPF_B] = &&LDX_MEM_B, | ||
| 879 | [BPF_LDX | BPF_MEM | BPF_H] = &&LDX_MEM_H, | ||
| 880 | [BPF_LDX | BPF_MEM | BPF_W] = &&LDX_MEM_W, | ||
| 881 | [BPF_LDX | BPF_MEM | BPF_DW] = &&LDX_MEM_DW, | ||
| 882 | [BPF_LD | BPF_ABS | BPF_W] = &&LD_ABS_W, | ||
| 883 | [BPF_LD | BPF_ABS | BPF_H] = &&LD_ABS_H, | ||
| 884 | [BPF_LD | BPF_ABS | BPF_B] = &&LD_ABS_B, | ||
| 885 | [BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W, | ||
| 886 | [BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H, | ||
| 887 | [BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B, | ||
| 888 | [BPF_LD | BPF_IMM | BPF_DW] = &&LD_IMM_DW, | ||
| 889 | }; | 936 | }; |
| 937 | #undef BPF_INSN_3_LBL | ||
| 938 | #undef BPF_INSN_2_LBL | ||
| 890 | u32 tail_call_cnt = 0; | 939 | u32 tail_call_cnt = 0; |
| 891 | void *ptr; | 940 | void *ptr; |
| 892 | int off; | 941 | int off; |
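The hand-maintained jumptable in ___bpf_prog_run() is now generated from the same BPF_INSN_MAP list, with only the internal, non-UAPI opcodes (BPF_CALL_ARGS, BPF_TAIL_CALL) spelled out by hand. The table relies on GCC/Clang's labels-as-values extension; below is a minimal, runnable computed-goto dispatcher in that style, with toy opcodes rather than the BPF encoding.

```c
/* Minimal computed-goto dispatcher in the style of ___bpf_prog_run(): label
 * addresses (&&label, a GCC/Clang extension) are stored in a table indexed
 * by opcode, and CONT jumps straight to the next handler.
 */
#include <stdio.h>

enum { OP_INC, OP_DEC, OP_HALT, OP_MAX };

static int run(const unsigned char *prog)
{
	static const void *jumptable[OP_MAX] = {
		[OP_INC]  = &&do_inc,
		[OP_DEC]  = &&do_dec,
		[OP_HALT] = &&do_halt,
	};
	int acc = 0;

#define CONT	goto *jumptable[*prog++]
	CONT;
do_inc:	acc++; CONT;
do_dec:	acc--; CONT;
do_halt:
	return acc;
#undef CONT
}

int main(void)
{
	const unsigned char prog[] = { OP_INC, OP_INC, OP_DEC, OP_HALT };

	printf("%d\n", run(prog));	/* prints 1 */
	return 0;
}
```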
| @@ -950,14 +999,10 @@ select_insn: | |||
| 950 | (*(s64 *) &DST) >>= IMM; | 999 | (*(s64 *) &DST) >>= IMM; |
| 951 | CONT; | 1000 | CONT; |
| 952 | ALU64_MOD_X: | 1001 | ALU64_MOD_X: |
| 953 | if (unlikely(SRC == 0)) | ||
| 954 | return 0; | ||
| 955 | div64_u64_rem(DST, SRC, &tmp); | 1002 | div64_u64_rem(DST, SRC, &tmp); |
| 956 | DST = tmp; | 1003 | DST = tmp; |
| 957 | CONT; | 1004 | CONT; |
| 958 | ALU_MOD_X: | 1005 | ALU_MOD_X: |
| 959 | if (unlikely((u32)SRC == 0)) | ||
| 960 | return 0; | ||
| 961 | tmp = (u32) DST; | 1006 | tmp = (u32) DST; |
| 962 | DST = do_div(tmp, (u32) SRC); | 1007 | DST = do_div(tmp, (u32) SRC); |
| 963 | CONT; | 1008 | CONT; |
| @@ -970,13 +1015,9 @@ select_insn: | |||
| 970 | DST = do_div(tmp, (u32) IMM); | 1015 | DST = do_div(tmp, (u32) IMM); |
| 971 | CONT; | 1016 | CONT; |
| 972 | ALU64_DIV_X: | 1017 | ALU64_DIV_X: |
| 973 | if (unlikely(SRC == 0)) | ||
| 974 | return 0; | ||
| 975 | DST = div64_u64(DST, SRC); | 1018 | DST = div64_u64(DST, SRC); |
| 976 | CONT; | 1019 | CONT; |
| 977 | ALU_DIV_X: | 1020 | ALU_DIV_X: |
| 978 | if (unlikely((u32)SRC == 0)) | ||
| 979 | return 0; | ||
| 980 | tmp = (u32) DST; | 1021 | tmp = (u32) DST; |
| 981 | do_div(tmp, (u32) SRC); | 1022 | do_div(tmp, (u32) SRC); |
| 982 | DST = (u32) tmp; | 1023 | DST = (u32) tmp; |
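The two hunks above drop the interpreter's runtime guards against a zero divisor for the register-based DIV/MOD cases. This diff does not show where that case is handled instead; presumably the divisor is rewritten or enforced before these opcodes ever execute (for example at verification time). As a behavioural sketch only, assuming the common convention that a zero divisor yields 0 for division and leaves the destination untouched for modulo (an assumption, not taken from this patch):

```c
/* Checked-division semantics the interpreter fast path no longer provides
 * itself.  The exact runtime convention is assumed here, not shown in this
 * diff.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t checked_div(uint64_t dst, uint64_t src)
{
	return src ? dst / src : 0;	/* div by zero -> 0 (assumed) */
}

static uint64_t checked_mod(uint64_t dst, uint64_t src)
{
	return src ? dst % src : dst;	/* mod by zero -> dst unchanged (assumed) */
}

int main(void)
{
	printf("%llu %llu\n",
	       (unsigned long long)checked_div(42, 0),
	       (unsigned long long)checked_mod(42, 0));
	return 0;
}
```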
| @@ -1026,6 +1067,13 @@ select_insn: | |||
| 1026 | BPF_R4, BPF_R5); | 1067 | BPF_R4, BPF_R5); |
| 1027 | CONT; | 1068 | CONT; |
| 1028 | 1069 | ||
| 1070 | JMP_CALL_ARGS: | ||
| 1071 | BPF_R0 = (__bpf_call_base_args + insn->imm)(BPF_R1, BPF_R2, | ||
| 1072 | BPF_R3, BPF_R4, | ||
| 1073 | BPF_R5, | ||
| 1074 | insn + insn->off + 1); | ||
| 1075 | CONT; | ||
| 1076 | |||
| 1029 | JMP_TAIL_CALL: { | 1077 | JMP_TAIL_CALL: { |
| 1030 | struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2; | 1078 | struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2; |
| 1031 | struct bpf_array *array = container_of(map, struct bpf_array, map); | 1079 | struct bpf_array *array = container_of(map, struct bpf_array, map); |
| @@ -1280,8 +1328,14 @@ load_byte: | |||
| 1280 | goto load_byte; | 1328 | goto load_byte; |
| 1281 | 1329 | ||
| 1282 | default_label: | 1330 | default_label: |
| 1283 | /* If we ever reach this, we have a bug somewhere. */ | 1331 | /* If we ever reach this, we have a bug somewhere. Die hard here |
| 1284 | WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code); | 1332 | * instead of just returning 0; we could be somewhere in a subprog, |
| 1333 | * so execution could continue otherwise which we do /not/ want. | ||
| 1334 | * | ||
| 1335 | * Note, verifier whitelists all opcodes in bpf_opcode_in_insntable(). | ||
| 1336 | */ | ||
| 1337 | pr_warn("BPF interpreter: unknown opcode %02x\n", insn->code); | ||
| 1338 | BUG_ON(1); | ||
| 1285 | return 0; | 1339 | return 0; |
| 1286 | } | 1340 | } |
| 1287 | STACK_FRAME_NON_STANDARD(___bpf_prog_run); /* jump table */ | 1341 | STACK_FRAME_NON_STANDARD(___bpf_prog_run); /* jump table */ |
| @@ -1298,6 +1352,23 @@ static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn | |||
| 1298 | return ___bpf_prog_run(regs, insn, stack); \ | 1352 | return ___bpf_prog_run(regs, insn, stack); \ |
| 1299 | } | 1353 | } |
| 1300 | 1354 | ||
| 1355 | #define PROG_NAME_ARGS(stack_size) __bpf_prog_run_args##stack_size | ||
| 1356 | #define DEFINE_BPF_PROG_RUN_ARGS(stack_size) \ | ||
| 1357 | static u64 PROG_NAME_ARGS(stack_size)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, \ | ||
| 1358 | const struct bpf_insn *insn) \ | ||
| 1359 | { \ | ||
| 1360 | u64 stack[stack_size / sizeof(u64)]; \ | ||
| 1361 | u64 regs[MAX_BPF_REG]; \ | ||
| 1362 | \ | ||
| 1363 | FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \ | ||
| 1364 | BPF_R1 = r1; \ | ||
| 1365 | BPF_R2 = r2; \ | ||
| 1366 | BPF_R3 = r3; \ | ||
| 1367 | BPF_R4 = r4; \ | ||
| 1368 | BPF_R5 = r5; \ | ||
| 1369 | return ___bpf_prog_run(regs, insn, stack); \ | ||
| 1370 | } | ||
| 1371 | |||
| 1301 | #define EVAL1(FN, X) FN(X) | 1372 | #define EVAL1(FN, X) FN(X) |
| 1302 | #define EVAL2(FN, X, Y...) FN(X) EVAL1(FN, Y) | 1373 | #define EVAL2(FN, X, Y...) FN(X) EVAL1(FN, Y) |
| 1303 | #define EVAL3(FN, X, Y...) FN(X) EVAL2(FN, Y) | 1374 | #define EVAL3(FN, X, Y...) FN(X) EVAL2(FN, Y) |
| @@ -1309,6 +1380,10 @@ EVAL6(DEFINE_BPF_PROG_RUN, 32, 64, 96, 128, 160, 192); | |||
| 1309 | EVAL6(DEFINE_BPF_PROG_RUN, 224, 256, 288, 320, 352, 384); | 1380 | EVAL6(DEFINE_BPF_PROG_RUN, 224, 256, 288, 320, 352, 384); |
| 1310 | EVAL4(DEFINE_BPF_PROG_RUN, 416, 448, 480, 512); | 1381 | EVAL4(DEFINE_BPF_PROG_RUN, 416, 448, 480, 512); |
| 1311 | 1382 | ||
| 1383 | EVAL6(DEFINE_BPF_PROG_RUN_ARGS, 32, 64, 96, 128, 160, 192); | ||
| 1384 | EVAL6(DEFINE_BPF_PROG_RUN_ARGS, 224, 256, 288, 320, 352, 384); | ||
| 1385 | EVAL4(DEFINE_BPF_PROG_RUN_ARGS, 416, 448, 480, 512); | ||
| 1386 | |||
| 1312 | #define PROG_NAME_LIST(stack_size) PROG_NAME(stack_size), | 1387 | #define PROG_NAME_LIST(stack_size) PROG_NAME(stack_size), |
| 1313 | 1388 | ||
| 1314 | static unsigned int (*interpreters[])(const void *ctx, | 1389 | static unsigned int (*interpreters[])(const void *ctx, |
| @@ -1317,11 +1392,33 @@ EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192) | |||
| 1317 | EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384) | 1392 | EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384) |
| 1318 | EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) | 1393 | EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) |
| 1319 | }; | 1394 | }; |
| 1395 | #undef PROG_NAME_LIST | ||
| 1396 | #define PROG_NAME_LIST(stack_size) PROG_NAME_ARGS(stack_size), | ||
| 1397 | static u64 (*interpreters_args[])(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, | ||
| 1398 | const struct bpf_insn *insn) = { | ||
| 1399 | EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192) | ||
| 1400 | EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384) | ||
| 1401 | EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) | ||
| 1402 | }; | ||
| 1403 | #undef PROG_NAME_LIST | ||
| 1404 | |||
| 1405 | void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth) | ||
| 1406 | { | ||
| 1407 | stack_depth = max_t(u32, stack_depth, 1); | ||
| 1408 | insn->off = (s16) insn->imm; | ||
| 1409 | insn->imm = interpreters_args[(round_up(stack_depth, 32) / 32) - 1] - | ||
| 1410 | __bpf_call_base_args; | ||
| 1411 | insn->code = BPF_JMP | BPF_CALL_ARGS; | ||
| 1412 | } | ||
| 1320 | 1413 | ||
| 1321 | #else | 1414 | #else |
| 1322 | static unsigned int __bpf_prog_ret0(const void *ctx, | 1415 | static unsigned int __bpf_prog_ret0_warn(const void *ctx, |
| 1323 | const struct bpf_insn *insn) | 1416 | const struct bpf_insn *insn) |
| 1324 | { | 1417 | { |
| 1418 | /* If this handler ever gets executed, then BPF_JIT_ALWAYS_ON | ||
| 1419 | * is not working properly, so warn about it! | ||
| 1420 | */ | ||
| 1421 | WARN_ON_ONCE(1); | ||
| 1325 | return 0; | 1422 | return 0; |
| 1326 | } | 1423 | } |
| 1327 | #endif | 1424 | #endif |
| @@ -1329,6 +1426,9 @@ static unsigned int __bpf_prog_ret0(const void *ctx, | |||
| 1329 | bool bpf_prog_array_compatible(struct bpf_array *array, | 1426 | bool bpf_prog_array_compatible(struct bpf_array *array, |
| 1330 | const struct bpf_prog *fp) | 1427 | const struct bpf_prog *fp) |
| 1331 | { | 1428 | { |
| 1429 | if (fp->kprobe_override) | ||
| 1430 | return false; | ||
| 1431 | |||
| 1332 | if (!array->owner_prog_type) { | 1432 | if (!array->owner_prog_type) { |
| 1333 | /* There's no owner yet where we could check for | 1433 | /* There's no owner yet where we could check for |
| 1334 | * compatibility. | 1434 | * compatibility. |
| @@ -1378,7 +1478,7 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) | |||
| 1378 | 1478 | ||
| 1379 | fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1]; | 1479 | fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1]; |
| 1380 | #else | 1480 | #else |
| 1381 | fp->bpf_func = __bpf_prog_ret0; | 1481 | fp->bpf_func = __bpf_prog_ret0_warn; |
| 1382 | #endif | 1482 | #endif |
| 1383 | 1483 | ||
| 1384 | /* eBPF JITs can rewrite the program in case constant | 1484 | /* eBPF JITs can rewrite the program in case constant |
| @@ -1481,6 +1581,8 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, | |||
| 1481 | rcu_read_lock(); | 1581 | rcu_read_lock(); |
| 1482 | prog = rcu_dereference(progs)->progs; | 1582 | prog = rcu_dereference(progs)->progs; |
| 1483 | for (; *prog; prog++) { | 1583 | for (; *prog; prog++) { |
| 1584 | if (*prog == &dummy_bpf_prog.prog) | ||
| 1585 | continue; | ||
| 1484 | id = (*prog)->aux->id; | 1586 | id = (*prog)->aux->id; |
| 1485 | if (copy_to_user(prog_ids + i, &id, sizeof(id))) { | 1587 | if (copy_to_user(prog_ids + i, &id, sizeof(id))) { |
| 1486 | rcu_read_unlock(); | 1588 | rcu_read_unlock(); |
| @@ -1564,14 +1666,41 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, | |||
| 1564 | return 0; | 1666 | return 0; |
| 1565 | } | 1667 | } |
| 1566 | 1668 | ||
| 1669 | int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array, | ||
| 1670 | __u32 __user *prog_ids, u32 request_cnt, | ||
| 1671 | __u32 __user *prog_cnt) | ||
| 1672 | { | ||
| 1673 | u32 cnt = 0; | ||
| 1674 | |||
| 1675 | if (array) | ||
| 1676 | cnt = bpf_prog_array_length(array); | ||
| 1677 | |||
| 1678 | if (copy_to_user(prog_cnt, &cnt, sizeof(cnt))) | ||
| 1679 | return -EFAULT; | ||
| 1680 | |||
| 1681 | /* return early if user requested only program count or nothing to copy */ | ||
| 1682 | if (!request_cnt || !cnt) | ||
| 1683 | return 0; | ||
| 1684 | |||
| 1685 | return bpf_prog_array_copy_to_user(array, prog_ids, request_cnt); | ||
| 1686 | } | ||
| 1687 | |||
| 1567 | static void bpf_prog_free_deferred(struct work_struct *work) | 1688 | static void bpf_prog_free_deferred(struct work_struct *work) |
| 1568 | { | 1689 | { |
| 1569 | struct bpf_prog_aux *aux; | 1690 | struct bpf_prog_aux *aux; |
| 1691 | int i; | ||
| 1570 | 1692 | ||
| 1571 | aux = container_of(work, struct bpf_prog_aux, work); | 1693 | aux = container_of(work, struct bpf_prog_aux, work); |
| 1572 | if (bpf_prog_is_dev_bound(aux)) | 1694 | if (bpf_prog_is_dev_bound(aux)) |
| 1573 | bpf_prog_offload_destroy(aux->prog); | 1695 | bpf_prog_offload_destroy(aux->prog); |
| 1574 | bpf_jit_free(aux->prog); | 1696 | for (i = 0; i < aux->func_cnt; i++) |
| 1697 | bpf_jit_free(aux->func[i]); | ||
| 1698 | if (aux->func_cnt) { | ||
| 1699 | kfree(aux->func); | ||
| 1700 | bpf_prog_unlock_free(aux->prog); | ||
| 1701 | } else { | ||
| 1702 | bpf_jit_free(aux->prog); | ||
| 1703 | } | ||
| 1575 | } | 1704 | } |
| 1576 | 1705 | ||
| 1577 | /* Free internal BPF program */ | 1706 | /* Free internal BPF program */ |
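End of the core.c changes: with function calls a program may own an array of sub-programs (aux->func[]), each carrying its own JIT image, so bpf_prog_free_deferred() now frees every sub-program first and then releases the outer program without JIT-freeing it a second time. A simplified model of that teardown ordering, with illustrative types rather than the kernel's:

```c
/* Free children before the parent, and do not double-free the parent's JIT
 * state once the subprogs own the images.  Types are stand-ins.
 */
#include <stdlib.h>

struct prog { void *jit_image; };

struct prog_aux {
	struct prog  *prog;		/* the outer program */
	struct prog **func;		/* sub-programs, if any */
	unsigned int  func_cnt;
};

static void jit_free(struct prog *p)
{
	free(p->jit_image);		/* stands in for bpf_jit_free() */
	free(p);
}

static void prog_free_deferred(struct prog_aux *aux)
{
	for (unsigned int i = 0; i < aux->func_cnt; i++)
		jit_free(aux->func[i]);

	if (aux->func_cnt) {
		free(aux->func);
		free(aux->prog);	/* images already went with the subprogs */
	} else {
		jit_free(aux->prog);
	}
}

int main(void)
{
	struct prog_aux aux = { .prog = calloc(1, sizeof(struct prog)) };

	prog_free_deferred(&aux);	/* no subprogs: single free path */
	return 0;
}
```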
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index ce5b669003b2..fbfdada6caee 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
| @@ -94,13 +94,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) | |||
| 94 | if (!cmap) | 94 | if (!cmap) |
| 95 | return ERR_PTR(-ENOMEM); | 95 | return ERR_PTR(-ENOMEM); |
| 96 | 96 | ||
| 97 | /* mandatory map attributes */ | 97 | bpf_map_init_from_attr(&cmap->map, attr); |
| 98 | cmap->map.map_type = attr->map_type; | ||
| 99 | cmap->map.key_size = attr->key_size; | ||
| 100 | cmap->map.value_size = attr->value_size; | ||
| 101 | cmap->map.max_entries = attr->max_entries; | ||
| 102 | cmap->map.map_flags = attr->map_flags; | ||
| 103 | cmap->map.numa_node = bpf_map_attr_numa_node(attr); | ||
| 104 | 98 | ||
| 105 | /* Pre-limit array size based on NR_CPUS, not final CPU check */ | 99 | /* Pre-limit array size based on NR_CPUS, not final CPU check */ |
| 106 | if (cmap->map.max_entries > NR_CPUS) { | 100 | if (cmap->map.max_entries > NR_CPUS) { |
| @@ -143,7 +137,7 @@ free_cmap: | |||
| 143 | return ERR_PTR(err); | 137 | return ERR_PTR(err); |
| 144 | } | 138 | } |
| 145 | 139 | ||
| 146 | void __cpu_map_queue_destructor(void *ptr) | 140 | static void __cpu_map_queue_destructor(void *ptr) |
| 147 | { | 141 | { |
| 148 | /* The tear-down procedure should have made sure that queue is | 142 | /* The tear-down procedure should have made sure that queue is |
| 149 | * empty. See __cpu_map_entry_replace() and work-queue | 143 | * empty. See __cpu_map_entry_replace() and work-queue |
| @@ -222,8 +216,8 @@ static struct xdp_pkt *convert_to_xdp_pkt(struct xdp_buff *xdp) | |||
| 222 | return xdp_pkt; | 216 | return xdp_pkt; |
| 223 | } | 217 | } |
| 224 | 218 | ||
| 225 | struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, | 219 | static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, |
| 226 | struct xdp_pkt *xdp_pkt) | 220 | struct xdp_pkt *xdp_pkt) |
| 227 | { | 221 | { |
| 228 | unsigned int frame_size; | 222 | unsigned int frame_size; |
| 229 | void *pkt_data_start; | 223 | void *pkt_data_start; |
| @@ -337,7 +331,8 @@ static int cpu_map_kthread_run(void *data) | |||
| 337 | return 0; | 331 | return 0; |
| 338 | } | 332 | } |
| 339 | 333 | ||
| 340 | struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, int map_id) | 334 | static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, |
| 335 | int map_id) | ||
| 341 | { | 336 | { |
| 342 | gfp_t gfp = GFP_ATOMIC|__GFP_NOWARN; | 337 | gfp_t gfp = GFP_ATOMIC|__GFP_NOWARN; |
| 343 | struct bpf_cpu_map_entry *rcpu; | 338 | struct bpf_cpu_map_entry *rcpu; |
| @@ -395,7 +390,7 @@ free_rcu: | |||
| 395 | return NULL; | 390 | return NULL; |
| 396 | } | 391 | } |
| 397 | 392 | ||
| 398 | void __cpu_map_entry_free(struct rcu_head *rcu) | 393 | static void __cpu_map_entry_free(struct rcu_head *rcu) |
| 399 | { | 394 | { |
| 400 | struct bpf_cpu_map_entry *rcpu; | 395 | struct bpf_cpu_map_entry *rcpu; |
| 401 | int cpu; | 396 | int cpu; |
| @@ -438,8 +433,8 @@ void __cpu_map_entry_free(struct rcu_head *rcu) | |||
| 438 | * cpu_map_kthread_stop, which waits for an RCU graze period before | 433 | * cpu_map_kthread_stop, which waits for an RCU graze period before |
| 439 | * stopping kthread, emptying the queue. | 434 | * stopping kthread, emptying the queue. |
| 440 | */ | 435 | */ |
| 441 | void __cpu_map_entry_replace(struct bpf_cpu_map *cmap, | 436 | static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap, |
| 442 | u32 key_cpu, struct bpf_cpu_map_entry *rcpu) | 437 | u32 key_cpu, struct bpf_cpu_map_entry *rcpu) |
| 443 | { | 438 | { |
| 444 | struct bpf_cpu_map_entry *old_rcpu; | 439 | struct bpf_cpu_map_entry *old_rcpu; |
| 445 | 440 | ||
| @@ -451,7 +446,7 @@ void __cpu_map_entry_replace(struct bpf_cpu_map *cmap, | |||
| 451 | } | 446 | } |
| 452 | } | 447 | } |
| 453 | 448 | ||
| 454 | int cpu_map_delete_elem(struct bpf_map *map, void *key) | 449 | static int cpu_map_delete_elem(struct bpf_map *map, void *key) |
| 455 | { | 450 | { |
| 456 | struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); | 451 | struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); |
| 457 | u32 key_cpu = *(u32 *)key; | 452 | u32 key_cpu = *(u32 *)key; |
| @@ -464,8 +459,8 @@ int cpu_map_delete_elem(struct bpf_map *map, void *key) | |||
| 464 | return 0; | 459 | return 0; |
| 465 | } | 460 | } |
| 466 | 461 | ||
| 467 | int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, | 462 | static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, |
| 468 | u64 map_flags) | 463 | u64 map_flags) |
| 469 | { | 464 | { |
| 470 | struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); | 465 | struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); |
| 471 | struct bpf_cpu_map_entry *rcpu; | 466 | struct bpf_cpu_map_entry *rcpu; |
| @@ -502,7 +497,7 @@ int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, | |||
| 502 | return 0; | 497 | return 0; |
| 503 | } | 498 | } |
| 504 | 499 | ||
| 505 | void cpu_map_free(struct bpf_map *map) | 500 | static void cpu_map_free(struct bpf_map *map) |
| 506 | { | 501 | { |
| 507 | struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); | 502 | struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); |
| 508 | int cpu; | 503 | int cpu; |
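Besides making the file-local cpumap helpers static, this diff (like the devmap, arraymap and hashtab ones in the same series) replaces the hand-copied "mandatory map attributes" block with bpf_map_init_from_attr(). The removed assignments pin down what that helper must cover; the sketch below is consistent with them, but the real helper lives in kernel/bpf/syscall.c and is not part of this diff, so treat this as an approximation in kernel context rather than standalone code.

```c
/* What bpf_map_init_from_attr() has to do, judging by the six assignments it
 * replaces here and in devmap/arraymap/hashtab.  Kernel context assumed.
 */
#include <linux/bpf.h>

static void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr)
{
	map->map_type	 = attr->map_type;
	map->key_size	 = attr->key_size;
	map->value_size	 = attr->value_size;
	map->max_entries = attr->max_entries;
	map->map_flags	 = attr->map_flags;
	map->numa_node	 = bpf_map_attr_numa_node(attr);
}
```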
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index ebdef54bf7df..565f9ece9115 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
| @@ -93,13 +93,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) | |||
| 93 | if (!dtab) | 93 | if (!dtab) |
| 94 | return ERR_PTR(-ENOMEM); | 94 | return ERR_PTR(-ENOMEM); |
| 95 | 95 | ||
| 96 | /* mandatory map attributes */ | 96 | bpf_map_init_from_attr(&dtab->map, attr); |
| 97 | dtab->map.map_type = attr->map_type; | ||
| 98 | dtab->map.key_size = attr->key_size; | ||
| 99 | dtab->map.value_size = attr->value_size; | ||
| 100 | dtab->map.max_entries = attr->max_entries; | ||
| 101 | dtab->map.map_flags = attr->map_flags; | ||
| 102 | dtab->map.numa_node = bpf_map_attr_numa_node(attr); | ||
| 103 | 97 | ||
| 104 | /* make sure page count doesn't overflow */ | 98 | /* make sure page count doesn't overflow */ |
| 105 | cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); | 99 | cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); |
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
index e682850c9715..8740406df2cd 100644
--- a/kernel/bpf/disasm.c
+++ b/kernel/bpf/disasm.c
| @@ -21,10 +21,39 @@ static const char * const func_id_str[] = { | |||
| 21 | }; | 21 | }; |
| 22 | #undef __BPF_FUNC_STR_FN | 22 | #undef __BPF_FUNC_STR_FN |
| 23 | 23 | ||
| 24 | const char *func_id_name(int id) | 24 | static const char *__func_get_name(const struct bpf_insn_cbs *cbs, |
| 25 | const struct bpf_insn *insn, | ||
| 26 | char *buff, size_t len) | ||
| 25 | { | 27 | { |
| 26 | BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID); | 28 | BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID); |
| 27 | 29 | ||
| 30 | if (insn->src_reg != BPF_PSEUDO_CALL && | ||
| 31 | insn->imm >= 0 && insn->imm < __BPF_FUNC_MAX_ID && | ||
| 32 | func_id_str[insn->imm]) | ||
| 33 | return func_id_str[insn->imm]; | ||
| 34 | |||
| 35 | if (cbs && cbs->cb_call) | ||
| 36 | return cbs->cb_call(cbs->private_data, insn); | ||
| 37 | |||
| 38 | if (insn->src_reg == BPF_PSEUDO_CALL) | ||
| 39 | snprintf(buff, len, "%+d", insn->imm); | ||
| 40 | |||
| 41 | return buff; | ||
| 42 | } | ||
| 43 | |||
| 44 | static const char *__func_imm_name(const struct bpf_insn_cbs *cbs, | ||
| 45 | const struct bpf_insn *insn, | ||
| 46 | u64 full_imm, char *buff, size_t len) | ||
| 47 | { | ||
| 48 | if (cbs && cbs->cb_imm) | ||
| 49 | return cbs->cb_imm(cbs->private_data, insn, full_imm); | ||
| 50 | |||
| 51 | snprintf(buff, len, "0x%llx", (unsigned long long)full_imm); | ||
| 52 | return buff; | ||
| 53 | } | ||
| 54 | |||
| 55 | const char *func_id_name(int id) | ||
| 56 | { | ||
| 28 | if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id]) | 57 | if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id]) |
| 29 | return func_id_str[id]; | 58 | return func_id_str[id]; |
| 30 | else | 59 | else |
| @@ -83,7 +112,7 @@ static const char *const bpf_jmp_string[16] = { | |||
| 83 | [BPF_EXIT >> 4] = "exit", | 112 | [BPF_EXIT >> 4] = "exit", |
| 84 | }; | 113 | }; |
| 85 | 114 | ||
| 86 | static void print_bpf_end_insn(bpf_insn_print_cb verbose, | 115 | static void print_bpf_end_insn(bpf_insn_print_t verbose, |
| 87 | struct bpf_verifier_env *env, | 116 | struct bpf_verifier_env *env, |
| 88 | const struct bpf_insn *insn) | 117 | const struct bpf_insn *insn) |
| 89 | { | 118 | { |
| @@ -92,9 +121,12 @@ static void print_bpf_end_insn(bpf_insn_print_cb verbose, | |||
| 92 | insn->imm, insn->dst_reg); | 121 | insn->imm, insn->dst_reg); |
| 93 | } | 122 | } |
| 94 | 123 | ||
| 95 | void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env, | 124 | void print_bpf_insn(const struct bpf_insn_cbs *cbs, |
| 96 | const struct bpf_insn *insn, bool allow_ptr_leaks) | 125 | struct bpf_verifier_env *env, |
| 126 | const struct bpf_insn *insn, | ||
| 127 | bool allow_ptr_leaks) | ||
| 97 | { | 128 | { |
| 129 | const bpf_insn_print_t verbose = cbs->cb_print; | ||
| 98 | u8 class = BPF_CLASS(insn->code); | 130 | u8 class = BPF_CLASS(insn->code); |
| 99 | 131 | ||
| 100 | if (class == BPF_ALU || class == BPF_ALU64) { | 132 | if (class == BPF_ALU || class == BPF_ALU64) { |
| @@ -175,12 +207,15 @@ void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env, | |||
| 175 | */ | 207 | */ |
| 176 | u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; | 208 | u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; |
| 177 | bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD; | 209 | bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD; |
| 210 | char tmp[64]; | ||
| 178 | 211 | ||
| 179 | if (map_ptr && !allow_ptr_leaks) | 212 | if (map_ptr && !allow_ptr_leaks) |
| 180 | imm = 0; | 213 | imm = 0; |
| 181 | 214 | ||
| 182 | verbose(env, "(%02x) r%d = 0x%llx\n", insn->code, | 215 | verbose(env, "(%02x) r%d = %s\n", |
| 183 | insn->dst_reg, (unsigned long long)imm); | 216 | insn->code, insn->dst_reg, |
| 217 | __func_imm_name(cbs, insn, imm, | ||
| 218 | tmp, sizeof(tmp))); | ||
| 184 | } else { | 219 | } else { |
| 185 | verbose(env, "BUG_ld_%02x\n", insn->code); | 220 | verbose(env, "BUG_ld_%02x\n", insn->code); |
| 186 | return; | 221 | return; |
| @@ -189,8 +224,20 @@ void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env, | |||
| 189 | u8 opcode = BPF_OP(insn->code); | 224 | u8 opcode = BPF_OP(insn->code); |
| 190 | 225 | ||
| 191 | if (opcode == BPF_CALL) { | 226 | if (opcode == BPF_CALL) { |
| 192 | verbose(env, "(%02x) call %s#%d\n", insn->code, | 227 | char tmp[64]; |
| 193 | func_id_name(insn->imm), insn->imm); | 228 | |
| 229 | if (insn->src_reg == BPF_PSEUDO_CALL) { | ||
| 230 | verbose(env, "(%02x) call pc%s\n", | ||
| 231 | insn->code, | ||
| 232 | __func_get_name(cbs, insn, | ||
| 233 | tmp, sizeof(tmp))); | ||
| 234 | } else { | ||
| 235 | strcpy(tmp, "unknown"); | ||
| 236 | verbose(env, "(%02x) call %s#%d\n", insn->code, | ||
| 237 | __func_get_name(cbs, insn, | ||
| 238 | tmp, sizeof(tmp)), | ||
| 239 | insn->imm); | ||
| 240 | } | ||
| 194 | } else if (insn->code == (BPF_JMP | BPF_JA)) { | 241 | } else if (insn->code == (BPF_JMP | BPF_JA)) { |
| 195 | verbose(env, "(%02x) goto pc%+d\n", | 242 | verbose(env, "(%02x) goto pc%+d\n", |
| 196 | insn->code, insn->off); | 243 | insn->code, insn->off); |
diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h
index 8de977e420b6..266fe8ee542b 100644
--- a/kernel/bpf/disasm.h
+++ b/kernel/bpf/disasm.h
| @@ -17,16 +17,35 @@ | |||
| 17 | #include <linux/bpf.h> | 17 | #include <linux/bpf.h> |
| 18 | #include <linux/kernel.h> | 18 | #include <linux/kernel.h> |
| 19 | #include <linux/stringify.h> | 19 | #include <linux/stringify.h> |
| 20 | #ifndef __KERNEL__ | ||
| 21 | #include <stdio.h> | ||
| 22 | #include <string.h> | ||
| 23 | #endif | ||
| 24 | |||
| 25 | struct bpf_verifier_env; | ||
| 20 | 26 | ||
| 21 | extern const char *const bpf_alu_string[16]; | 27 | extern const char *const bpf_alu_string[16]; |
| 22 | extern const char *const bpf_class_string[8]; | 28 | extern const char *const bpf_class_string[8]; |
| 23 | 29 | ||
| 24 | const char *func_id_name(int id); | 30 | const char *func_id_name(int id); |
| 25 | 31 | ||
| 26 | struct bpf_verifier_env; | 32 | typedef __printf(2, 3) void (*bpf_insn_print_t)(struct bpf_verifier_env *env, |
| 27 | typedef void (*bpf_insn_print_cb)(struct bpf_verifier_env *env, | 33 | const char *, ...); |
| 28 | const char *, ...); | 34 | typedef const char *(*bpf_insn_revmap_call_t)(void *private_data, |
| 29 | void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env, | 35 | const struct bpf_insn *insn); |
| 30 | const struct bpf_insn *insn, bool allow_ptr_leaks); | 36 | typedef const char *(*bpf_insn_print_imm_t)(void *private_data, |
| 37 | const struct bpf_insn *insn, | ||
| 38 | __u64 full_imm); | ||
| 39 | |||
| 40 | struct bpf_insn_cbs { | ||
| 41 | bpf_insn_print_t cb_print; | ||
| 42 | bpf_insn_revmap_call_t cb_call; | ||
| 43 | bpf_insn_print_imm_t cb_imm; | ||
| 44 | void *private_data; | ||
| 45 | }; | ||
| 31 | 46 | ||
| 47 | void print_bpf_insn(const struct bpf_insn_cbs *cbs, | ||
| 48 | struct bpf_verifier_env *env, | ||
| 49 | const struct bpf_insn *insn, | ||
| 50 | bool allow_ptr_leaks); | ||
| 32 | #endif | 51 | #endif |
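disasm.h now builds outside the kernel and print_bpf_insn() takes a struct bpf_insn_cbs, so a caller supplies its own printer plus optional resolvers for call targets (cb_call) and 64-bit immediates (cb_imm). Below is a hedged sketch of a user-space caller wiring that up; the callback and struct names come from the header above, while the printer, the include setup and the surrounding glue are hypothetical.

```c
/* Hypothetical user-space caller of the callback interface declared above,
 * in the spirit of a tool that compiles disasm.c outside __KERNEL__.
 * Needs include paths that resolve disasm.h and struct bpf_insn.
 */
#include <stdarg.h>
#include <stdbool.h>
#include <stdio.h>
#include "disasm.h"

static void my_print(struct bpf_verifier_env *env, const char *fmt, ...)
{
	va_list args;

	(void)env;			/* no verifier context in user space */
	va_start(args, fmt);
	vprintf(fmt, args);
	va_end(args);
}

void dump_insn(const struct bpf_insn *insn)
{
	const struct bpf_insn_cbs cbs = {
		.cb_print	= my_print,
		.cb_call	= NULL,	/* keep the built-in call naming      */
		.cb_imm		= NULL,	/* keep the default 0x%llx immediates */
		.private_data	= NULL,
	};

	print_bpf_insn(&cbs, NULL, insn, true /* allow_ptr_leaks */);
}
```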
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 3905d4bc5b80..b76828f23b49 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
| @@ -227,7 +227,7 @@ static int alloc_extra_elems(struct bpf_htab *htab) | |||
| 227 | } | 227 | } |
| 228 | 228 | ||
| 229 | /* Called from syscall */ | 229 | /* Called from syscall */ |
| 230 | static struct bpf_map *htab_map_alloc(union bpf_attr *attr) | 230 | static int htab_map_alloc_check(union bpf_attr *attr) |
| 231 | { | 231 | { |
| 232 | bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH || | 232 | bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH || |
| 233 | attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); | 233 | attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); |
| @@ -241,9 +241,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) | |||
| 241 | bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); | 241 | bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); |
| 242 | bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); | 242 | bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); |
| 243 | int numa_node = bpf_map_attr_numa_node(attr); | 243 | int numa_node = bpf_map_attr_numa_node(attr); |
| 244 | struct bpf_htab *htab; | ||
| 245 | int err, i; | ||
| 246 | u64 cost; | ||
| 247 | 244 | ||
| 248 | BUILD_BUG_ON(offsetof(struct htab_elem, htab) != | 245 | BUILD_BUG_ON(offsetof(struct htab_elem, htab) != |
| 249 | offsetof(struct htab_elem, hash_node.pprev)); | 246 | offsetof(struct htab_elem, hash_node.pprev)); |
| @@ -254,40 +251,68 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) | |||
| 254 | /* LRU implementation is much complicated than other | 251 | /* LRU implementation is much complicated than other |
| 255 | * maps. Hence, limit to CAP_SYS_ADMIN for now. | 252 | * maps. Hence, limit to CAP_SYS_ADMIN for now. |
| 256 | */ | 253 | */ |
| 257 | return ERR_PTR(-EPERM); | 254 | return -EPERM; |
| 258 | 255 | ||
| 259 | if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK) | 256 | if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK) |
| 260 | /* reserved bits should not be used */ | 257 | /* reserved bits should not be used */ |
| 261 | return ERR_PTR(-EINVAL); | 258 | return -EINVAL; |
| 262 | 259 | ||
| 263 | if (!lru && percpu_lru) | 260 | if (!lru && percpu_lru) |
| 264 | return ERR_PTR(-EINVAL); | 261 | return -EINVAL; |
| 265 | 262 | ||
| 266 | if (lru && !prealloc) | 263 | if (lru && !prealloc) |
| 267 | return ERR_PTR(-ENOTSUPP); | 264 | return -ENOTSUPP; |
| 268 | 265 | ||
| 269 | if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru)) | 266 | if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru)) |
| 270 | return ERR_PTR(-EINVAL); | 267 | return -EINVAL; |
| 268 | |||
| 269 | /* check sanity of attributes. | ||
| 270 | * value_size == 0 may be allowed in the future to use map as a set | ||
| 271 | */ | ||
| 272 | if (attr->max_entries == 0 || attr->key_size == 0 || | ||
| 273 | attr->value_size == 0) | ||
| 274 | return -EINVAL; | ||
| 275 | |||
| 276 | if (attr->key_size > MAX_BPF_STACK) | ||
| 277 | /* eBPF programs initialize keys on stack, so they cannot be | ||
| 278 | * larger than max stack size | ||
| 279 | */ | ||
| 280 | return -E2BIG; | ||
| 281 | |||
| 282 | if (attr->value_size >= KMALLOC_MAX_SIZE - | ||
| 283 | MAX_BPF_STACK - sizeof(struct htab_elem)) | ||
| 284 | /* if value_size is bigger, the user space won't be able to | ||
| 285 | * access the elements via bpf syscall. This check also makes | ||
| 286 | * sure that the elem_size doesn't overflow and it's | ||
| 287 | * kmalloc-able later in htab_map_update_elem() | ||
| 288 | */ | ||
| 289 | return -E2BIG; | ||
| 290 | |||
| 291 | return 0; | ||
| 292 | } | ||
| 293 | |||
| 294 | static struct bpf_map *htab_map_alloc(union bpf_attr *attr) | ||
| 295 | { | ||
| 296 | bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH || | ||
| 297 | attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); | ||
| 298 | bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH || | ||
| 299 | attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); | ||
| 300 | /* percpu_lru means each cpu has its own LRU list. | ||
| 301 | * it is different from BPF_MAP_TYPE_PERCPU_HASH where | ||
| 302 | * the map's value itself is percpu. percpu_lru has | ||
| 303 | * nothing to do with the map's value. | ||
| 304 | */ | ||
| 305 | bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); | ||
| 306 | bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); | ||
| 307 | struct bpf_htab *htab; | ||
| 308 | int err, i; | ||
| 309 | u64 cost; | ||
| 271 | 310 | ||
| 272 | htab = kzalloc(sizeof(*htab), GFP_USER); | 311 | htab = kzalloc(sizeof(*htab), GFP_USER); |
| 273 | if (!htab) | 312 | if (!htab) |
| 274 | return ERR_PTR(-ENOMEM); | 313 | return ERR_PTR(-ENOMEM); |
| 275 | 314 | ||
| 276 | /* mandatory map attributes */ | 315 | bpf_map_init_from_attr(&htab->map, attr); |
| 277 | htab->map.map_type = attr->map_type; | ||
| 278 | htab->map.key_size = attr->key_size; | ||
| 279 | htab->map.value_size = attr->value_size; | ||
| 280 | htab->map.max_entries = attr->max_entries; | ||
| 281 | htab->map.map_flags = attr->map_flags; | ||
| 282 | htab->map.numa_node = numa_node; | ||
| 283 | |||
| 284 | /* check sanity of attributes. | ||
| 285 | * value_size == 0 may be allowed in the future to use map as a set | ||
| 286 | */ | ||
| 287 | err = -EINVAL; | ||
| 288 | if (htab->map.max_entries == 0 || htab->map.key_size == 0 || | ||
| 289 | htab->map.value_size == 0) | ||
| 290 | goto free_htab; | ||
| 291 | 316 | ||
| 292 | if (percpu_lru) { | 317 | if (percpu_lru) { |
| 293 | /* ensure each CPU's lru list has >=1 elements. | 318 | /* ensure each CPU's lru list has >=1 elements. |
| @@ -304,22 +329,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) | |||
| 304 | /* hash table size must be power of 2 */ | 329 | /* hash table size must be power of 2 */ |
| 305 | htab->n_buckets = roundup_pow_of_two(htab->map.max_entries); | 330 | htab->n_buckets = roundup_pow_of_two(htab->map.max_entries); |
| 306 | 331 | ||
| 307 | err = -E2BIG; | ||
| 308 | if (htab->map.key_size > MAX_BPF_STACK) | ||
| 309 | /* eBPF programs initialize keys on stack, so they cannot be | ||
| 310 | * larger than max stack size | ||
| 311 | */ | ||
| 312 | goto free_htab; | ||
| 313 | |||
| 314 | if (htab->map.value_size >= KMALLOC_MAX_SIZE - | ||
| 315 | MAX_BPF_STACK - sizeof(struct htab_elem)) | ||
| 316 | /* if value_size is bigger, the user space won't be able to | ||
| 317 | * access the elements via bpf syscall. This check also makes | ||
| 318 | * sure that the elem_size doesn't overflow and it's | ||
| 319 | * kmalloc-able later in htab_map_update_elem() | ||
| 320 | */ | ||
| 321 | goto free_htab; | ||
| 322 | |||
| 323 | htab->elem_size = sizeof(struct htab_elem) + | 332 | htab->elem_size = sizeof(struct htab_elem) + |
| 324 | round_up(htab->map.key_size, 8); | 333 | round_up(htab->map.key_size, 8); |
| 325 | if (percpu) | 334 | if (percpu) |
| @@ -327,6 +336,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) | |||
| 327 | else | 336 | else |
| 328 | htab->elem_size += round_up(htab->map.value_size, 8); | 337 | htab->elem_size += round_up(htab->map.value_size, 8); |
| 329 | 338 | ||
| 339 | err = -E2BIG; | ||
| 330 | /* prevent zero size kmalloc and check for u32 overflow */ | 340 | /* prevent zero size kmalloc and check for u32 overflow */ |
| 331 | if (htab->n_buckets == 0 || | 341 | if (htab->n_buckets == 0 || |
| 332 | htab->n_buckets > U32_MAX / sizeof(struct bucket)) | 342 | htab->n_buckets > U32_MAX / sizeof(struct bucket)) |
| @@ -1143,6 +1153,7 @@ static void htab_map_free(struct bpf_map *map) | |||
| 1143 | } | 1153 | } |
| 1144 | 1154 | ||
| 1145 | const struct bpf_map_ops htab_map_ops = { | 1155 | const struct bpf_map_ops htab_map_ops = { |
| 1156 | .map_alloc_check = htab_map_alloc_check, | ||
| 1146 | .map_alloc = htab_map_alloc, | 1157 | .map_alloc = htab_map_alloc, |
| 1147 | .map_free = htab_map_free, | 1158 | .map_free = htab_map_free, |
| 1148 | .map_get_next_key = htab_map_get_next_key, | 1159 | .map_get_next_key = htab_map_get_next_key, |
| @@ -1153,6 +1164,7 @@ const struct bpf_map_ops htab_map_ops = { | |||
| 1153 | }; | 1164 | }; |
| 1154 | 1165 | ||
| 1155 | const struct bpf_map_ops htab_lru_map_ops = { | 1166 | const struct bpf_map_ops htab_lru_map_ops = { |
| 1167 | .map_alloc_check = htab_map_alloc_check, | ||
| 1156 | .map_alloc = htab_map_alloc, | 1168 | .map_alloc = htab_map_alloc, |
| 1157 | .map_free = htab_map_free, | 1169 | .map_free = htab_map_free, |
| 1158 | .map_get_next_key = htab_map_get_next_key, | 1170 | .map_get_next_key = htab_map_get_next_key, |
| @@ -1236,6 +1248,7 @@ int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, | |||
| 1236 | } | 1248 | } |
| 1237 | 1249 | ||
| 1238 | const struct bpf_map_ops htab_percpu_map_ops = { | 1250 | const struct bpf_map_ops htab_percpu_map_ops = { |
| 1251 | .map_alloc_check = htab_map_alloc_check, | ||
| 1239 | .map_alloc = htab_map_alloc, | 1252 | .map_alloc = htab_map_alloc, |
| 1240 | .map_free = htab_map_free, | 1253 | .map_free = htab_map_free, |
| 1241 | .map_get_next_key = htab_map_get_next_key, | 1254 | .map_get_next_key = htab_map_get_next_key, |
| @@ -1245,6 +1258,7 @@ const struct bpf_map_ops htab_percpu_map_ops = { | |||
| 1245 | }; | 1258 | }; |
| 1246 | 1259 | ||
| 1247 | const struct bpf_map_ops htab_lru_percpu_map_ops = { | 1260 | const struct bpf_map_ops htab_lru_percpu_map_ops = { |
| 1261 | .map_alloc_check = htab_map_alloc_check, | ||
| 1248 | .map_alloc = htab_map_alloc, | 1262 | .map_alloc = htab_map_alloc, |
| 1249 | .map_free = htab_map_free, | 1263 | .map_free = htab_map_free, |
| 1250 | .map_get_next_key = htab_map_get_next_key, | 1264 | .map_get_next_key = htab_map_get_next_key, |
| @@ -1253,11 +1267,11 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = { | |||
| 1253 | .map_delete_elem = htab_lru_map_delete_elem, | 1267 | .map_delete_elem = htab_lru_map_delete_elem, |
| 1254 | }; | 1268 | }; |
| 1255 | 1269 | ||
| 1256 | static struct bpf_map *fd_htab_map_alloc(union bpf_attr *attr) | 1270 | static int fd_htab_map_alloc_check(union bpf_attr *attr) |
| 1257 | { | 1271 | { |
| 1258 | if (attr->value_size != sizeof(u32)) | 1272 | if (attr->value_size != sizeof(u32)) |
| 1259 | return ERR_PTR(-EINVAL); | 1273 | return -EINVAL; |
| 1260 | return htab_map_alloc(attr); | 1274 | return htab_map_alloc_check(attr); |
| 1261 | } | 1275 | } |
| 1262 | 1276 | ||
| 1263 | static void fd_htab_map_free(struct bpf_map *map) | 1277 | static void fd_htab_map_free(struct bpf_map *map) |
| @@ -1328,7 +1342,7 @@ static struct bpf_map *htab_of_map_alloc(union bpf_attr *attr) | |||
| 1328 | if (IS_ERR(inner_map_meta)) | 1342 | if (IS_ERR(inner_map_meta)) |
| 1329 | return inner_map_meta; | 1343 | return inner_map_meta; |
| 1330 | 1344 | ||
| 1331 | map = fd_htab_map_alloc(attr); | 1345 | map = htab_map_alloc(attr); |
| 1332 | if (IS_ERR(map)) { | 1346 | if (IS_ERR(map)) { |
| 1333 | bpf_map_meta_free(inner_map_meta); | 1347 | bpf_map_meta_free(inner_map_meta); |
| 1334 | return map; | 1348 | return map; |
| @@ -1372,6 +1386,7 @@ static void htab_of_map_free(struct bpf_map *map) | |||
| 1372 | } | 1386 | } |
| 1373 | 1387 | ||
| 1374 | const struct bpf_map_ops htab_of_maps_map_ops = { | 1388 | const struct bpf_map_ops htab_of_maps_map_ops = { |
| 1389 | .map_alloc_check = fd_htab_map_alloc_check, | ||
| 1375 | .map_alloc = htab_of_map_alloc, | 1390 | .map_alloc = htab_of_map_alloc, |
| 1376 | .map_free = htab_of_map_free, | 1391 | .map_free = htab_of_map_free, |
| 1377 | .map_get_next_key = htab_map_get_next_key, | 1392 | .map_get_next_key = htab_map_get_next_key, |
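
The hashtab hunks split the old `htab_map_alloc()` into a side-effect-free `htab_map_alloc_check()` (wired up as `.map_alloc_check` in every htab ops table) and the actual allocator, with `bpf_map_init_from_attr()` replacing the open-coded attribute copy. A minimal standalone toy of that check-then-allocate split is sketched below; it is not kernel code, and the `512` bound merely stands in for the `MAX_BPF_STACK` key-size limit.

```c
/* Toy illustration of the check/alloc split: validation returns an errno
 * without touching memory; allocation runs only once the check passed.
 */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct toy_attr { unsigned int max_entries, key_size, value_size; };
struct toy_map  { struct toy_attr attr; };

static int toy_map_alloc_check(const struct toy_attr *attr)
{
	if (!attr->max_entries || !attr->key_size || !attr->value_size)
		return -EINVAL;
	if (attr->key_size > 512)	/* stand-in for the MAX_BPF_STACK bound */
		return -E2BIG;
	return 0;
}

static struct toy_map *toy_map_alloc(const struct toy_attr *attr)
{
	struct toy_map *map = calloc(1, sizeof(*map));

	if (!map)
		return NULL;
	map->attr = *attr;		/* mirrors bpf_map_init_from_attr() */
	return map;
}

int main(void)
{
	struct toy_attr attr = { .max_entries = 16, .key_size = 4, .value_size = 8 };
	int err = toy_map_alloc_check(&attr);

	if (err) {
		fprintf(stderr, "attributes rejected: %d\n", err);
		return 1;
	}
	free(toy_map_alloc(&attr));
	return 0;
}
```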
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 5bb5e49ef4c3..81e2f6995adb 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c | |||
| @@ -150,39 +150,29 @@ static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) | |||
| 150 | return 0; | 150 | return 0; |
| 151 | } | 151 | } |
| 152 | 152 | ||
| 153 | static int bpf_mkobj_ops(struct inode *dir, struct dentry *dentry, | 153 | static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw, |
| 154 | umode_t mode, const struct inode_operations *iops) | 154 | const struct inode_operations *iops) |
| 155 | { | 155 | { |
| 156 | struct inode *inode; | 156 | struct inode *dir = dentry->d_parent->d_inode; |
| 157 | 157 | struct inode *inode = bpf_get_inode(dir->i_sb, dir, mode); | |
| 158 | inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFREG); | ||
| 159 | if (IS_ERR(inode)) | 158 | if (IS_ERR(inode)) |
| 160 | return PTR_ERR(inode); | 159 | return PTR_ERR(inode); |
| 161 | 160 | ||
| 162 | inode->i_op = iops; | 161 | inode->i_op = iops; |
| 163 | inode->i_private = dentry->d_fsdata; | 162 | inode->i_private = raw; |
| 164 | 163 | ||
| 165 | bpf_dentry_finalize(dentry, inode, dir); | 164 | bpf_dentry_finalize(dentry, inode, dir); |
| 166 | return 0; | 165 | return 0; |
| 167 | } | 166 | } |
| 168 | 167 | ||
| 169 | static int bpf_mkobj(struct inode *dir, struct dentry *dentry, umode_t mode, | 168 | static int bpf_mkprog(struct dentry *dentry, umode_t mode, void *arg) |
| 170 | dev_t devt) | ||
| 171 | { | 169 | { |
| 172 | enum bpf_type type = MINOR(devt); | 170 | return bpf_mkobj_ops(dentry, mode, arg, &bpf_prog_iops); |
| 173 | 171 | } | |
| 174 | if (MAJOR(devt) != UNNAMED_MAJOR || !S_ISREG(mode) || | ||
| 175 | dentry->d_fsdata == NULL) | ||
| 176 | return -EPERM; | ||
| 177 | 172 | ||
| 178 | switch (type) { | 173 | static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg) |
| 179 | case BPF_TYPE_PROG: | 174 | { |
| 180 | return bpf_mkobj_ops(dir, dentry, mode, &bpf_prog_iops); | 175 | return bpf_mkobj_ops(dentry, mode, arg, &bpf_map_iops); |
| 181 | case BPF_TYPE_MAP: | ||
| 182 | return bpf_mkobj_ops(dir, dentry, mode, &bpf_map_iops); | ||
| 183 | default: | ||
| 184 | return -EPERM; | ||
| 185 | } | ||
| 186 | } | 176 | } |
| 187 | 177 | ||
| 188 | static struct dentry * | 178 | static struct dentry * |
| @@ -218,7 +208,6 @@ static int bpf_symlink(struct inode *dir, struct dentry *dentry, | |||
| 218 | 208 | ||
| 219 | static const struct inode_operations bpf_dir_iops = { | 209 | static const struct inode_operations bpf_dir_iops = { |
| 220 | .lookup = bpf_lookup, | 210 | .lookup = bpf_lookup, |
| 221 | .mknod = bpf_mkobj, | ||
| 222 | .mkdir = bpf_mkdir, | 211 | .mkdir = bpf_mkdir, |
| 223 | .symlink = bpf_symlink, | 212 | .symlink = bpf_symlink, |
| 224 | .rmdir = simple_rmdir, | 213 | .rmdir = simple_rmdir, |
| @@ -234,7 +223,6 @@ static int bpf_obj_do_pin(const struct filename *pathname, void *raw, | |||
| 234 | struct inode *dir; | 223 | struct inode *dir; |
| 235 | struct path path; | 224 | struct path path; |
| 236 | umode_t mode; | 225 | umode_t mode; |
| 237 | dev_t devt; | ||
| 238 | int ret; | 226 | int ret; |
| 239 | 227 | ||
| 240 | dentry = kern_path_create(AT_FDCWD, pathname->name, &path, 0); | 228 | dentry = kern_path_create(AT_FDCWD, pathname->name, &path, 0); |
| @@ -242,9 +230,8 @@ static int bpf_obj_do_pin(const struct filename *pathname, void *raw, | |||
| 242 | return PTR_ERR(dentry); | 230 | return PTR_ERR(dentry); |
| 243 | 231 | ||
| 244 | mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask()); | 232 | mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask()); |
| 245 | devt = MKDEV(UNNAMED_MAJOR, type); | ||
| 246 | 233 | ||
| 247 | ret = security_path_mknod(&path, dentry, mode, devt); | 234 | ret = security_path_mknod(&path, dentry, mode, 0); |
| 248 | if (ret) | 235 | if (ret) |
| 249 | goto out; | 236 | goto out; |
| 250 | 237 | ||
| @@ -254,9 +241,16 @@ static int bpf_obj_do_pin(const struct filename *pathname, void *raw, | |||
| 254 | goto out; | 241 | goto out; |
| 255 | } | 242 | } |
| 256 | 243 | ||
| 257 | dentry->d_fsdata = raw; | 244 | switch (type) { |
| 258 | ret = vfs_mknod(dir, dentry, mode, devt); | 245 | case BPF_TYPE_PROG: |
| 259 | dentry->d_fsdata = NULL; | 246 | ret = vfs_mkobj(dentry, mode, bpf_mkprog, raw); |
| 247 | break; | ||
| 248 | case BPF_TYPE_MAP: | ||
| 249 | ret = vfs_mkobj(dentry, mode, bpf_mkmap, raw); | ||
| 250 | break; | ||
| 251 | default: | ||
| 252 | ret = -EPERM; | ||
| 253 | } | ||
| 260 | out: | 254 | out: |
| 261 | done_path_create(&path, dentry); | 255 | done_path_create(&path, dentry); |
| 262 | return ret; | 256 | return ret; |
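
The inode.c change stops abusing `vfs_mknod()` with a fake device number and a `d_fsdata` hand-off; the raw prog/map pointer is now passed directly to `vfs_mkobj()` callbacks. From userspace the pinning interface itself is unchanged, which the rough sketch below shows for context: a `BPF_OBJ_PIN` call against a path under /sys/fs/bpf. Error handling is omitted and the path/fd in the usage comment are hypothetical.

```c
/* Sketch of the userspace side this code serves: pinning an object with
 * BPF_OBJ_PIN. Only the bpffs inode creation changed, not this interface.
 */
#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int bpf_obj_pin(int fd, const char *pathname)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.pathname = (__u64)(unsigned long)pathname;
	attr.bpf_fd = fd;

	return syscall(__NR_bpf, BPF_OBJ_PIN, &attr, sizeof(attr));
}

/* usage (hypothetical fd): bpf_obj_pin(prog_fd, "/sys/fs/bpf/my_prog"); */
```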
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 885e45479680..7b469d10d0e9 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c | |||
| @@ -522,12 +522,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) | |||
| 522 | return ERR_PTR(-ENOMEM); | 522 | return ERR_PTR(-ENOMEM); |
| 523 | 523 | ||
| 524 | /* copy mandatory map attributes */ | 524 | /* copy mandatory map attributes */ |
| 525 | trie->map.map_type = attr->map_type; | 525 | bpf_map_init_from_attr(&trie->map, attr); |
| 526 | trie->map.key_size = attr->key_size; | ||
| 527 | trie->map.value_size = attr->value_size; | ||
| 528 | trie->map.max_entries = attr->max_entries; | ||
| 529 | trie->map.map_flags = attr->map_flags; | ||
| 530 | trie->map.numa_node = bpf_map_attr_numa_node(attr); | ||
| 531 | trie->data_size = attr->key_size - | 526 | trie->data_size = attr->key_size - |
| 532 | offsetof(struct bpf_lpm_trie_key, data); | 527 | offsetof(struct bpf_lpm_trie_key, data); |
| 533 | trie->max_prefixlen = trie->data_size * 8; | 528 | trie->max_prefixlen = trie->data_size * 8; |
| @@ -596,9 +591,96 @@ unlock: | |||
| 596 | raw_spin_unlock(&trie->lock); | 591 | raw_spin_unlock(&trie->lock); |
| 597 | } | 592 | } |
| 598 | 593 | ||
| 599 | static int trie_get_next_key(struct bpf_map *map, void *key, void *next_key) | 594 | static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) |
| 600 | { | 595 | { |
| 601 | return -ENOTSUPP; | 596 | struct lpm_trie_node *node, *next_node = NULL, *parent, *search_root; |
| 597 | struct lpm_trie *trie = container_of(map, struct lpm_trie, map); | ||
| 598 | struct bpf_lpm_trie_key *key = _key, *next_key = _next_key; | ||
| 599 | struct lpm_trie_node **node_stack = NULL; | ||
| 600 | int err = 0, stack_ptr = -1; | ||
| 601 | unsigned int next_bit; | ||
| 602 | size_t matchlen; | ||
| 603 | |||
| 604 | /* The get_next_key follows postorder. For the 4 node example in | ||
| 605 | * the top of this file, the trie_get_next_key() returns the following | ||
| 606 | * one after another: | ||
| 607 | * 192.168.0.0/24 | ||
| 608 | * 192.168.1.0/24 | ||
| 609 | * 192.168.128.0/24 | ||
| 610 | * 192.168.0.0/16 | ||
| 611 | * | ||
| 612 | * The idea is to return more specific keys before less specific ones. | ||
| 613 | */ | ||
| 614 | |||
| 615 | /* Empty trie */ | ||
| 616 | search_root = rcu_dereference(trie->root); | ||
| 617 | if (!search_root) | ||
| 618 | return -ENOENT; | ||
| 619 | |||
| 620 | /* For invalid key, find the leftmost node in the trie */ | ||
| 621 | if (!key || key->prefixlen > trie->max_prefixlen) | ||
| 622 | goto find_leftmost; | ||
| 623 | |||
| 624 | node_stack = kmalloc(trie->max_prefixlen * sizeof(struct lpm_trie_node *), | ||
| 625 | GFP_ATOMIC | __GFP_NOWARN); | ||
| 626 | if (!node_stack) | ||
| 627 | return -ENOMEM; | ||
| 628 | |||
| 629 | /* Try to find the exact node for the given key */ | ||
| 630 | for (node = search_root; node;) { | ||
| 631 | node_stack[++stack_ptr] = node; | ||
| 632 | matchlen = longest_prefix_match(trie, node, key); | ||
| 633 | if (node->prefixlen != matchlen || | ||
| 634 | node->prefixlen == key->prefixlen) | ||
| 635 | break; | ||
| 636 | |||
| 637 | next_bit = extract_bit(key->data, node->prefixlen); | ||
| 638 | node = rcu_dereference(node->child[next_bit]); | ||
| 639 | } | ||
| 640 | if (!node || node->prefixlen != key->prefixlen || | ||
| 641 | (node->flags & LPM_TREE_NODE_FLAG_IM)) | ||
| 642 | goto find_leftmost; | ||
| 643 | |||
| 644 | /* The node with the exactly-matching key has been found, | ||
| 645 | * find the first node in postorder after the matched node. | ||
| 646 | */ | ||
| 647 | node = node_stack[stack_ptr]; | ||
| 648 | while (stack_ptr > 0) { | ||
| 649 | parent = node_stack[stack_ptr - 1]; | ||
| 650 | if (rcu_dereference(parent->child[0]) == node) { | ||
| 651 | search_root = rcu_dereference(parent->child[1]); | ||
| 652 | if (search_root) | ||
| 653 | goto find_leftmost; | ||
| 654 | } | ||
| 655 | if (!(parent->flags & LPM_TREE_NODE_FLAG_IM)) { | ||
| 656 | next_node = parent; | ||
| 657 | goto do_copy; | ||
| 658 | } | ||
| 659 | |||
| 660 | node = parent; | ||
| 661 | stack_ptr--; | ||
| 662 | } | ||
| 663 | |||
| 664 | /* did not find anything */ | ||
| 665 | err = -ENOENT; | ||
| 666 | goto free_stack; | ||
| 667 | |||
| 668 | find_leftmost: | ||
| 669 | /* Find the leftmost non-intermediate node, all intermediate nodes | ||
| 670 | * have exact two children, so this function will never return NULL. | ||
| 671 | */ | ||
| 672 | for (node = search_root; node;) { | ||
| 673 | if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) | ||
| 674 | next_node = node; | ||
| 675 | node = rcu_dereference(node->child[0]); | ||
| 676 | } | ||
| 677 | do_copy: | ||
| 678 | next_key->prefixlen = next_node->prefixlen; | ||
| 679 | memcpy((void *)next_key + offsetof(struct bpf_lpm_trie_key, data), | ||
| 680 | next_node->data, trie->data_size); | ||
| 681 | free_stack: | ||
| 682 | kfree(node_stack); | ||
| 683 | return err; | ||
| 602 | } | 684 | } |
| 603 | 685 | ||
| 604 | const struct bpf_map_ops trie_map_ops = { | 686 | const struct bpf_map_ops trie_map_ops = { |
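
With `trie_get_next_key()` implemented, LPM trie maps can finally be enumerated from userspace; as the comment in the hunk explains, the walk is postorder, so more specific prefixes come back before the prefixes that cover them. Below is a sketch of that enumeration using the raw bpf(2) syscall, assuming a kernel with this change, sufficient privilege (LPM tries require CAP_SYS_ADMIN), and an IPv4-sized key; the 192.168.0.0/24 entry is purely illustrative.

```c
/* Sketch: create an LPM trie, insert one prefix, then walk it with
 * BPF_MAP_GET_NEXT_KEY (a NULL key asks for the first/leftmost entry).
 */
#include <linux/bpf.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

struct lpm_key {		/* layout of struct bpf_lpm_trie_key + 4 data bytes */
	__u32 prefixlen;
	__u8  data[4];
};

static int map_get_next_key(int fd, const void *key, void *next_key)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.map_fd = fd;
	attr.key = (__u64)(unsigned long)key;
	attr.next_key = (__u64)(unsigned long)next_key;
	return syscall(__NR_bpf, BPF_MAP_GET_NEXT_KEY, &attr, sizeof(attr));
}

static void dump_trie(int map_fd)
{
	struct lpm_key key, next;
	void *prev = NULL;

	while (!map_get_next_key(map_fd, prev, &next)) {
		printf("%u.%u.%u.%u/%u\n", next.data[0], next.data[1],
		       next.data[2], next.data[3], next.prefixlen);
		key = next;
		prev = &key;
	}
}

int main(void)
{
	struct lpm_key k = { .prefixlen = 24, .data = { 192, 168, 0, 0 } };
	__u32 val = 1;
	union bpf_attr attr;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.map_type = BPF_MAP_TYPE_LPM_TRIE;
	attr.key_size = sizeof(k);
	attr.value_size = sizeof(val);
	attr.max_entries = 16;
	attr.map_flags = BPF_F_NO_PREALLOC;	/* required for LPM tries */
	fd = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
	if (fd < 0)
		return 1;

	memset(&attr, 0, sizeof(attr));
	attr.map_fd = fd;
	attr.key = (__u64)(unsigned long)&k;
	attr.value = (__u64)(unsigned long)&val;
	syscall(__NR_bpf, BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));

	dump_trie(fd);
	return 0;
}
```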
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 8455b89d1bbf..c9401075b58c 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c | |||
| @@ -16,18 +16,35 @@ | |||
| 16 | #include <linux/bpf.h> | 16 | #include <linux/bpf.h> |
| 17 | #include <linux/bpf_verifier.h> | 17 | #include <linux/bpf_verifier.h> |
| 18 | #include <linux/bug.h> | 18 | #include <linux/bug.h> |
| 19 | #include <linux/kdev_t.h> | ||
| 19 | #include <linux/list.h> | 20 | #include <linux/list.h> |
| 20 | #include <linux/netdevice.h> | 21 | #include <linux/netdevice.h> |
| 21 | #include <linux/printk.h> | 22 | #include <linux/printk.h> |
| 23 | #include <linux/proc_ns.h> | ||
| 22 | #include <linux/rtnetlink.h> | 24 | #include <linux/rtnetlink.h> |
| 25 | #include <linux/rwsem.h> | ||
| 23 | 26 | ||
| 24 | /* protected by RTNL */ | 27 | /* Protects bpf_prog_offload_devs, bpf_map_offload_devs and offload members |
| 28 | * of all progs. | ||
| 29 | * RTNL lock cannot be taken when holding this lock. | ||
| 30 | */ | ||
| 31 | static DECLARE_RWSEM(bpf_devs_lock); | ||
| 25 | static LIST_HEAD(bpf_prog_offload_devs); | 32 | static LIST_HEAD(bpf_prog_offload_devs); |
| 33 | static LIST_HEAD(bpf_map_offload_devs); | ||
| 34 | |||
| 35 | static int bpf_dev_offload_check(struct net_device *netdev) | ||
| 36 | { | ||
| 37 | if (!netdev) | ||
| 38 | return -EINVAL; | ||
| 39 | if (!netdev->netdev_ops->ndo_bpf) | ||
| 40 | return -EOPNOTSUPP; | ||
| 41 | return 0; | ||
| 42 | } | ||
| 26 | 43 | ||
| 27 | int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) | 44 | int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) |
| 28 | { | 45 | { |
| 29 | struct net *net = current->nsproxy->net_ns; | 46 | struct bpf_prog_offload *offload; |
| 30 | struct bpf_dev_offload *offload; | 47 | int err; |
| 31 | 48 | ||
| 32 | if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS && | 49 | if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS && |
| 33 | attr->prog_type != BPF_PROG_TYPE_XDP) | 50 | attr->prog_type != BPF_PROG_TYPE_XDP) |
| @@ -41,34 +58,44 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) | |||
| 41 | return -ENOMEM; | 58 | return -ENOMEM; |
| 42 | 59 | ||
| 43 | offload->prog = prog; | 60 | offload->prog = prog; |
| 44 | init_waitqueue_head(&offload->verifier_done); | ||
| 45 | 61 | ||
| 46 | rtnl_lock(); | 62 | offload->netdev = dev_get_by_index(current->nsproxy->net_ns, |
| 47 | offload->netdev = __dev_get_by_index(net, attr->prog_ifindex); | 63 | attr->prog_ifindex); |
| 48 | if (!offload->netdev) { | 64 | err = bpf_dev_offload_check(offload->netdev); |
| 49 | rtnl_unlock(); | 65 | if (err) |
| 50 | kfree(offload); | 66 | goto err_maybe_put; |
| 51 | return -EINVAL; | ||
| 52 | } | ||
| 53 | 67 | ||
| 68 | down_write(&bpf_devs_lock); | ||
| 69 | if (offload->netdev->reg_state != NETREG_REGISTERED) { | ||
| 70 | err = -EINVAL; | ||
| 71 | goto err_unlock; | ||
| 72 | } | ||
| 54 | prog->aux->offload = offload; | 73 | prog->aux->offload = offload; |
| 55 | list_add_tail(&offload->offloads, &bpf_prog_offload_devs); | 74 | list_add_tail(&offload->offloads, &bpf_prog_offload_devs); |
| 56 | rtnl_unlock(); | 75 | dev_put(offload->netdev); |
| 76 | up_write(&bpf_devs_lock); | ||
| 57 | 77 | ||
| 58 | return 0; | 78 | return 0; |
| 79 | err_unlock: | ||
| 80 | up_write(&bpf_devs_lock); | ||
| 81 | err_maybe_put: | ||
| 82 | if (offload->netdev) | ||
| 83 | dev_put(offload->netdev); | ||
| 84 | kfree(offload); | ||
| 85 | return err; | ||
| 59 | } | 86 | } |
| 60 | 87 | ||
| 61 | static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd, | 88 | static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd, |
| 62 | struct netdev_bpf *data) | 89 | struct netdev_bpf *data) |
| 63 | { | 90 | { |
| 64 | struct net_device *netdev = prog->aux->offload->netdev; | 91 | struct bpf_prog_offload *offload = prog->aux->offload; |
| 92 | struct net_device *netdev; | ||
| 65 | 93 | ||
| 66 | ASSERT_RTNL(); | 94 | ASSERT_RTNL(); |
| 67 | 95 | ||
| 68 | if (!netdev) | 96 | if (!offload) |
| 69 | return -ENODEV; | 97 | return -ENODEV; |
| 70 | if (!netdev->netdev_ops->ndo_bpf) | 98 | netdev = offload->netdev; |
| 71 | return -EOPNOTSUPP; | ||
| 72 | 99 | ||
| 73 | data->command = cmd; | 100 | data->command = cmd; |
| 74 | 101 | ||
| @@ -87,62 +114,63 @@ int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env) | |||
| 87 | if (err) | 114 | if (err) |
| 88 | goto exit_unlock; | 115 | goto exit_unlock; |
| 89 | 116 | ||
| 90 | env->dev_ops = data.verifier.ops; | 117 | env->prog->aux->offload->dev_ops = data.verifier.ops; |
| 91 | |||
| 92 | env->prog->aux->offload->dev_state = true; | 118 | env->prog->aux->offload->dev_state = true; |
| 93 | env->prog->aux->offload->verifier_running = true; | ||
| 94 | exit_unlock: | 119 | exit_unlock: |
| 95 | rtnl_unlock(); | 120 | rtnl_unlock(); |
| 96 | return err; | 121 | return err; |
| 97 | } | 122 | } |
| 98 | 123 | ||
| 124 | int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env, | ||
| 125 | int insn_idx, int prev_insn_idx) | ||
| 126 | { | ||
| 127 | struct bpf_prog_offload *offload; | ||
| 128 | int ret = -ENODEV; | ||
| 129 | |||
| 130 | down_read(&bpf_devs_lock); | ||
| 131 | offload = env->prog->aux->offload; | ||
| 132 | if (offload) | ||
| 133 | ret = offload->dev_ops->insn_hook(env, insn_idx, prev_insn_idx); | ||
| 134 | up_read(&bpf_devs_lock); | ||
| 135 | |||
| 136 | return ret; | ||
| 137 | } | ||
| 138 | |||
| 99 | static void __bpf_prog_offload_destroy(struct bpf_prog *prog) | 139 | static void __bpf_prog_offload_destroy(struct bpf_prog *prog) |
| 100 | { | 140 | { |
| 101 | struct bpf_dev_offload *offload = prog->aux->offload; | 141 | struct bpf_prog_offload *offload = prog->aux->offload; |
| 102 | struct netdev_bpf data = {}; | 142 | struct netdev_bpf data = {}; |
| 103 | 143 | ||
| 104 | /* Caution - if netdev is destroyed before the program, this function | ||
| 105 | * will be called twice. | ||
| 106 | */ | ||
| 107 | |||
| 108 | data.offload.prog = prog; | 144 | data.offload.prog = prog; |
| 109 | 145 | ||
| 110 | if (offload->verifier_running) | ||
| 111 | wait_event(offload->verifier_done, !offload->verifier_running); | ||
| 112 | |||
| 113 | if (offload->dev_state) | 146 | if (offload->dev_state) |
| 114 | WARN_ON(__bpf_offload_ndo(prog, BPF_OFFLOAD_DESTROY, &data)); | 147 | WARN_ON(__bpf_offload_ndo(prog, BPF_OFFLOAD_DESTROY, &data)); |
| 115 | 148 | ||
| 116 | offload->dev_state = false; | 149 | /* Make sure BPF_PROG_GET_NEXT_ID can't find this dead program */ |
| 150 | bpf_prog_free_id(prog, true); | ||
| 151 | |||
| 117 | list_del_init(&offload->offloads); | 152 | list_del_init(&offload->offloads); |
| 118 | offload->netdev = NULL; | 153 | kfree(offload); |
| 154 | prog->aux->offload = NULL; | ||
| 119 | } | 155 | } |
| 120 | 156 | ||
| 121 | void bpf_prog_offload_destroy(struct bpf_prog *prog) | 157 | void bpf_prog_offload_destroy(struct bpf_prog *prog) |
| 122 | { | 158 | { |
| 123 | struct bpf_dev_offload *offload = prog->aux->offload; | ||
| 124 | |||
| 125 | offload->verifier_running = false; | ||
| 126 | wake_up(&offload->verifier_done); | ||
| 127 | |||
| 128 | rtnl_lock(); | 159 | rtnl_lock(); |
| 129 | __bpf_prog_offload_destroy(prog); | 160 | down_write(&bpf_devs_lock); |
| 161 | if (prog->aux->offload) | ||
| 162 | __bpf_prog_offload_destroy(prog); | ||
| 163 | up_write(&bpf_devs_lock); | ||
| 130 | rtnl_unlock(); | 164 | rtnl_unlock(); |
| 131 | |||
| 132 | kfree(offload); | ||
| 133 | } | 165 | } |
| 134 | 166 | ||
| 135 | static int bpf_prog_offload_translate(struct bpf_prog *prog) | 167 | static int bpf_prog_offload_translate(struct bpf_prog *prog) |
| 136 | { | 168 | { |
| 137 | struct bpf_dev_offload *offload = prog->aux->offload; | ||
| 138 | struct netdev_bpf data = {}; | 169 | struct netdev_bpf data = {}; |
| 139 | int ret; | 170 | int ret; |
| 140 | 171 | ||
| 141 | data.offload.prog = prog; | 172 | data.offload.prog = prog; |
| 142 | 173 | ||
| 143 | offload->verifier_running = false; | ||
| 144 | wake_up(&offload->verifier_done); | ||
| 145 | |||
| 146 | rtnl_lock(); | 174 | rtnl_lock(); |
| 147 | ret = __bpf_offload_ndo(prog, BPF_OFFLOAD_TRANSLATE, &data); | 175 | ret = __bpf_offload_ndo(prog, BPF_OFFLOAD_TRANSLATE, &data); |
| 148 | rtnl_unlock(); | 176 | rtnl_unlock(); |
| @@ -164,14 +192,323 @@ int bpf_prog_offload_compile(struct bpf_prog *prog) | |||
| 164 | return bpf_prog_offload_translate(prog); | 192 | return bpf_prog_offload_translate(prog); |
| 165 | } | 193 | } |
| 166 | 194 | ||
| 195 | struct ns_get_path_bpf_prog_args { | ||
| 196 | struct bpf_prog *prog; | ||
| 197 | struct bpf_prog_info *info; | ||
| 198 | }; | ||
| 199 | |||
| 200 | static struct ns_common *bpf_prog_offload_info_fill_ns(void *private_data) | ||
| 201 | { | ||
| 202 | struct ns_get_path_bpf_prog_args *args = private_data; | ||
| 203 | struct bpf_prog_aux *aux = args->prog->aux; | ||
| 204 | struct ns_common *ns; | ||
| 205 | struct net *net; | ||
| 206 | |||
| 207 | rtnl_lock(); | ||
| 208 | down_read(&bpf_devs_lock); | ||
| 209 | |||
| 210 | if (aux->offload) { | ||
| 211 | args->info->ifindex = aux->offload->netdev->ifindex; | ||
| 212 | net = dev_net(aux->offload->netdev); | ||
| 213 | get_net(net); | ||
| 214 | ns = &net->ns; | ||
| 215 | } else { | ||
| 216 | args->info->ifindex = 0; | ||
| 217 | ns = NULL; | ||
| 218 | } | ||
| 219 | |||
| 220 | up_read(&bpf_devs_lock); | ||
| 221 | rtnl_unlock(); | ||
| 222 | |||
| 223 | return ns; | ||
| 224 | } | ||
| 225 | |||
| 226 | int bpf_prog_offload_info_fill(struct bpf_prog_info *info, | ||
| 227 | struct bpf_prog *prog) | ||
| 228 | { | ||
| 229 | struct ns_get_path_bpf_prog_args args = { | ||
| 230 | .prog = prog, | ||
| 231 | .info = info, | ||
| 232 | }; | ||
| 233 | struct bpf_prog_aux *aux = prog->aux; | ||
| 234 | struct inode *ns_inode; | ||
| 235 | struct path ns_path; | ||
| 236 | char __user *uinsns; | ||
| 237 | void *res; | ||
| 238 | u32 ulen; | ||
| 239 | |||
| 240 | res = ns_get_path_cb(&ns_path, bpf_prog_offload_info_fill_ns, &args); | ||
| 241 | if (IS_ERR(res)) { | ||
| 242 | if (!info->ifindex) | ||
| 243 | return -ENODEV; | ||
| 244 | return PTR_ERR(res); | ||
| 245 | } | ||
| 246 | |||
| 247 | down_read(&bpf_devs_lock); | ||
| 248 | |||
| 249 | if (!aux->offload) { | ||
| 250 | up_read(&bpf_devs_lock); | ||
| 251 | return -ENODEV; | ||
| 252 | } | ||
| 253 | |||
| 254 | ulen = info->jited_prog_len; | ||
| 255 | info->jited_prog_len = aux->offload->jited_len; | ||
| 256 | if (info->jited_prog_len & ulen) { | ||
| 257 | uinsns = u64_to_user_ptr(info->jited_prog_insns); | ||
| 258 | ulen = min_t(u32, info->jited_prog_len, ulen); | ||
| 259 | if (copy_to_user(uinsns, aux->offload->jited_image, ulen)) { | ||
| 260 | up_read(&bpf_devs_lock); | ||
| 261 | return -EFAULT; | ||
| 262 | } | ||
| 263 | } | ||
| 264 | |||
| 265 | up_read(&bpf_devs_lock); | ||
| 266 | |||
| 267 | ns_inode = ns_path.dentry->d_inode; | ||
| 268 | info->netns_dev = new_encode_dev(ns_inode->i_sb->s_dev); | ||
| 269 | info->netns_ino = ns_inode->i_ino; | ||
| 270 | path_put(&ns_path); | ||
| 271 | |||
| 272 | return 0; | ||
| 273 | } | ||
| 274 | |||
| 167 | const struct bpf_prog_ops bpf_offload_prog_ops = { | 275 | const struct bpf_prog_ops bpf_offload_prog_ops = { |
| 168 | }; | 276 | }; |
| 169 | 277 | ||
| 278 | static int bpf_map_offload_ndo(struct bpf_offloaded_map *offmap, | ||
| 279 | enum bpf_netdev_command cmd) | ||
| 280 | { | ||
| 281 | struct netdev_bpf data = {}; | ||
| 282 | struct net_device *netdev; | ||
| 283 | |||
| 284 | ASSERT_RTNL(); | ||
| 285 | |||
| 286 | data.command = cmd; | ||
| 287 | data.offmap = offmap; | ||
| 288 | /* Caller must make sure netdev is valid */ | ||
| 289 | netdev = offmap->netdev; | ||
| 290 | |||
| 291 | return netdev->netdev_ops->ndo_bpf(netdev, &data); | ||
| 292 | } | ||
| 293 | |||
| 294 | struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) | ||
| 295 | { | ||
| 296 | struct net *net = current->nsproxy->net_ns; | ||
| 297 | struct bpf_offloaded_map *offmap; | ||
| 298 | int err; | ||
| 299 | |||
| 300 | if (!capable(CAP_SYS_ADMIN)) | ||
| 301 | return ERR_PTR(-EPERM); | ||
| 302 | if (attr->map_type != BPF_MAP_TYPE_ARRAY && | ||
| 303 | attr->map_type != BPF_MAP_TYPE_HASH) | ||
| 304 | return ERR_PTR(-EINVAL); | ||
| 305 | |||
| 306 | offmap = kzalloc(sizeof(*offmap), GFP_USER); | ||
| 307 | if (!offmap) | ||
| 308 | return ERR_PTR(-ENOMEM); | ||
| 309 | |||
| 310 | bpf_map_init_from_attr(&offmap->map, attr); | ||
| 311 | |||
| 312 | rtnl_lock(); | ||
| 313 | down_write(&bpf_devs_lock); | ||
| 314 | offmap->netdev = __dev_get_by_index(net, attr->map_ifindex); | ||
| 315 | err = bpf_dev_offload_check(offmap->netdev); | ||
| 316 | if (err) | ||
| 317 | goto err_unlock; | ||
| 318 | |||
| 319 | err = bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_ALLOC); | ||
| 320 | if (err) | ||
| 321 | goto err_unlock; | ||
| 322 | |||
| 323 | list_add_tail(&offmap->offloads, &bpf_map_offload_devs); | ||
| 324 | up_write(&bpf_devs_lock); | ||
| 325 | rtnl_unlock(); | ||
| 326 | |||
| 327 | return &offmap->map; | ||
| 328 | |||
| 329 | err_unlock: | ||
| 330 | up_write(&bpf_devs_lock); | ||
| 331 | rtnl_unlock(); | ||
| 332 | kfree(offmap); | ||
| 333 | return ERR_PTR(err); | ||
| 334 | } | ||
| 335 | |||
| 336 | static void __bpf_map_offload_destroy(struct bpf_offloaded_map *offmap) | ||
| 337 | { | ||
| 338 | WARN_ON(bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_FREE)); | ||
| 339 | /* Make sure BPF_MAP_GET_NEXT_ID can't find this dead map */ | ||
| 340 | bpf_map_free_id(&offmap->map, true); | ||
| 341 | list_del_init(&offmap->offloads); | ||
| 342 | offmap->netdev = NULL; | ||
| 343 | } | ||
| 344 | |||
| 345 | void bpf_map_offload_map_free(struct bpf_map *map) | ||
| 346 | { | ||
| 347 | struct bpf_offloaded_map *offmap = map_to_offmap(map); | ||
| 348 | |||
| 349 | rtnl_lock(); | ||
| 350 | down_write(&bpf_devs_lock); | ||
| 351 | if (offmap->netdev) | ||
| 352 | __bpf_map_offload_destroy(offmap); | ||
| 353 | up_write(&bpf_devs_lock); | ||
| 354 | rtnl_unlock(); | ||
| 355 | |||
| 356 | kfree(offmap); | ||
| 357 | } | ||
| 358 | |||
| 359 | int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value) | ||
| 360 | { | ||
| 361 | struct bpf_offloaded_map *offmap = map_to_offmap(map); | ||
| 362 | int ret = -ENODEV; | ||
| 363 | |||
| 364 | down_read(&bpf_devs_lock); | ||
| 365 | if (offmap->netdev) | ||
| 366 | ret = offmap->dev_ops->map_lookup_elem(offmap, key, value); | ||
| 367 | up_read(&bpf_devs_lock); | ||
| 368 | |||
| 369 | return ret; | ||
| 370 | } | ||
| 371 | |||
| 372 | int bpf_map_offload_update_elem(struct bpf_map *map, | ||
| 373 | void *key, void *value, u64 flags) | ||
| 374 | { | ||
| 375 | struct bpf_offloaded_map *offmap = map_to_offmap(map); | ||
| 376 | int ret = -ENODEV; | ||
| 377 | |||
| 378 | if (unlikely(flags > BPF_EXIST)) | ||
| 379 | return -EINVAL; | ||
| 380 | |||
| 381 | down_read(&bpf_devs_lock); | ||
| 382 | if (offmap->netdev) | ||
| 383 | ret = offmap->dev_ops->map_update_elem(offmap, key, value, | ||
| 384 | flags); | ||
| 385 | up_read(&bpf_devs_lock); | ||
| 386 | |||
| 387 | return ret; | ||
| 388 | } | ||
| 389 | |||
| 390 | int bpf_map_offload_delete_elem(struct bpf_map *map, void *key) | ||
| 391 | { | ||
| 392 | struct bpf_offloaded_map *offmap = map_to_offmap(map); | ||
| 393 | int ret = -ENODEV; | ||
| 394 | |||
| 395 | down_read(&bpf_devs_lock); | ||
| 396 | if (offmap->netdev) | ||
| 397 | ret = offmap->dev_ops->map_delete_elem(offmap, key); | ||
| 398 | up_read(&bpf_devs_lock); | ||
| 399 | |||
| 400 | return ret; | ||
| 401 | } | ||
| 402 | |||
| 403 | int bpf_map_offload_get_next_key(struct bpf_map *map, void *key, void *next_key) | ||
| 404 | { | ||
| 405 | struct bpf_offloaded_map *offmap = map_to_offmap(map); | ||
| 406 | int ret = -ENODEV; | ||
| 407 | |||
| 408 | down_read(&bpf_devs_lock); | ||
| 409 | if (offmap->netdev) | ||
| 410 | ret = offmap->dev_ops->map_get_next_key(offmap, key, next_key); | ||
| 411 | up_read(&bpf_devs_lock); | ||
| 412 | |||
| 413 | return ret; | ||
| 414 | } | ||
| 415 | |||
| 416 | struct ns_get_path_bpf_map_args { | ||
| 417 | struct bpf_offloaded_map *offmap; | ||
| 418 | struct bpf_map_info *info; | ||
| 419 | }; | ||
| 420 | |||
| 421 | static struct ns_common *bpf_map_offload_info_fill_ns(void *private_data) | ||
| 422 | { | ||
| 423 | struct ns_get_path_bpf_map_args *args = private_data; | ||
| 424 | struct ns_common *ns; | ||
| 425 | struct net *net; | ||
| 426 | |||
| 427 | rtnl_lock(); | ||
| 428 | down_read(&bpf_devs_lock); | ||
| 429 | |||
| 430 | if (args->offmap->netdev) { | ||
| 431 | args->info->ifindex = args->offmap->netdev->ifindex; | ||
| 432 | net = dev_net(args->offmap->netdev); | ||
| 433 | get_net(net); | ||
| 434 | ns = &net->ns; | ||
| 435 | } else { | ||
| 436 | args->info->ifindex = 0; | ||
| 437 | ns = NULL; | ||
| 438 | } | ||
| 439 | |||
| 440 | up_read(&bpf_devs_lock); | ||
| 441 | rtnl_unlock(); | ||
| 442 | |||
| 443 | return ns; | ||
| 444 | } | ||
| 445 | |||
| 446 | int bpf_map_offload_info_fill(struct bpf_map_info *info, struct bpf_map *map) | ||
| 447 | { | ||
| 448 | struct ns_get_path_bpf_map_args args = { | ||
| 449 | .offmap = map_to_offmap(map), | ||
| 450 | .info = info, | ||
| 451 | }; | ||
| 452 | struct inode *ns_inode; | ||
| 453 | struct path ns_path; | ||
| 454 | void *res; | ||
| 455 | |||
| 456 | res = ns_get_path_cb(&ns_path, bpf_map_offload_info_fill_ns, &args); | ||
| 457 | if (IS_ERR(res)) { | ||
| 458 | if (!info->ifindex) | ||
| 459 | return -ENODEV; | ||
| 460 | return PTR_ERR(res); | ||
| 461 | } | ||
| 462 | |||
| 463 | ns_inode = ns_path.dentry->d_inode; | ||
| 464 | info->netns_dev = new_encode_dev(ns_inode->i_sb->s_dev); | ||
| 465 | info->netns_ino = ns_inode->i_ino; | ||
| 466 | path_put(&ns_path); | ||
| 467 | |||
| 468 | return 0; | ||
| 469 | } | ||
| 470 | |||
| 471 | bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map) | ||
| 472 | { | ||
| 473 | struct bpf_offloaded_map *offmap; | ||
| 474 | struct bpf_prog_offload *offload; | ||
| 475 | bool ret; | ||
| 476 | |||
| 477 | if (!bpf_prog_is_dev_bound(prog->aux) || !bpf_map_is_dev_bound(map)) | ||
| 478 | return false; | ||
| 479 | |||
| 480 | down_read(&bpf_devs_lock); | ||
| 481 | offload = prog->aux->offload; | ||
| 482 | offmap = map_to_offmap(map); | ||
| 483 | |||
| 484 | ret = offload && offload->netdev == offmap->netdev; | ||
| 485 | up_read(&bpf_devs_lock); | ||
| 486 | |||
| 487 | return ret; | ||
| 488 | } | ||
| 489 | |||
| 490 | static void bpf_offload_orphan_all_progs(struct net_device *netdev) | ||
| 491 | { | ||
| 492 | struct bpf_prog_offload *offload, *tmp; | ||
| 493 | |||
| 494 | list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs, offloads) | ||
| 495 | if (offload->netdev == netdev) | ||
| 496 | __bpf_prog_offload_destroy(offload->prog); | ||
| 497 | } | ||
| 498 | |||
| 499 | static void bpf_offload_orphan_all_maps(struct net_device *netdev) | ||
| 500 | { | ||
| 501 | struct bpf_offloaded_map *offmap, *tmp; | ||
| 502 | |||
| 503 | list_for_each_entry_safe(offmap, tmp, &bpf_map_offload_devs, offloads) | ||
| 504 | if (offmap->netdev == netdev) | ||
| 505 | __bpf_map_offload_destroy(offmap); | ||
| 506 | } | ||
| 507 | |||
| 170 | static int bpf_offload_notification(struct notifier_block *notifier, | 508 | static int bpf_offload_notification(struct notifier_block *notifier, |
| 171 | ulong event, void *ptr) | 509 | ulong event, void *ptr) |
| 172 | { | 510 | { |
| 173 | struct net_device *netdev = netdev_notifier_info_to_dev(ptr); | 511 | struct net_device *netdev = netdev_notifier_info_to_dev(ptr); |
| 174 | struct bpf_dev_offload *offload, *tmp; | ||
| 175 | 512 | ||
| 176 | ASSERT_RTNL(); | 513 | ASSERT_RTNL(); |
| 177 | 514 | ||
| @@ -181,11 +518,10 @@ static int bpf_offload_notification(struct notifier_block *notifier, | |||
| 181 | if (netdev->reg_state != NETREG_UNREGISTERING) | 518 | if (netdev->reg_state != NETREG_UNREGISTERING) |
| 182 | break; | 519 | break; |
| 183 | 520 | ||
| 184 | list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs, | 521 | down_write(&bpf_devs_lock); |
| 185 | offloads) { | 522 | bpf_offload_orphan_all_progs(netdev); |
| 186 | if (offload->netdev == netdev) | 523 | bpf_offload_orphan_all_maps(netdev); |
| 187 | __bpf_prog_offload_destroy(offload->prog); | 524 | up_write(&bpf_devs_lock); |
| 188 | } | ||
| 189 | break; | 525 | break; |
| 190 | default: | 526 | default: |
| 191 | break; | 527 | break; |
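
The offload.c rework keeps bound programs and maps on lists protected by its own `bpf_devs_lock` rwsem instead of relying on RTNL everywhere, orphans both when the backing netdev unregisters, and fills in which device and network namespace an offloaded program belongs to. One userspace-visible consequence is that this information becomes queryable; the sketch below asks for it via `BPF_OBJ_GET_INFO_BY_FD`, assuming a uapi header new enough to carry the `ifindex`/`netns_dev`/`netns_ino` fields that this infrastructure populates and a valid program fd.

```c
/* Sketch: report whether a program is offloaded and to which device/netns. */
#include <linux/bpf.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int prog_where(int prog_fd)
{
	struct bpf_prog_info info;
	union bpf_attr attr;

	memset(&info, 0, sizeof(info));
	memset(&attr, 0, sizeof(attr));
	attr.info.bpf_fd = prog_fd;
	attr.info.info_len = sizeof(info);
	attr.info.info = (__u64)(unsigned long)&info;

	if (syscall(__NR_bpf, BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr)))
		return -1;

	if (info.ifindex)
		printf("offloaded to ifindex %u (netns dev %llu, ino %llu)\n",
		       info.ifindex,
		       (unsigned long long)info.netns_dev,
		       (unsigned long long)info.netns_ino);
	else
		printf("runs on the host\n");
	return 0;
}
```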
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 1712d319c2d8..0314d1783d77 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c | |||
| @@ -96,14 +96,6 @@ static inline struct smap_psock *smap_psock_sk(const struct sock *sk) | |||
| 96 | return rcu_dereference_sk_user_data(sk); | 96 | return rcu_dereference_sk_user_data(sk); |
| 97 | } | 97 | } |
| 98 | 98 | ||
| 99 | /* compute the linear packet data range [data, data_end) for skb when | ||
| 100 | * sk_skb type programs are in use. | ||
| 101 | */ | ||
| 102 | static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb) | ||
| 103 | { | ||
| 104 | TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb); | ||
| 105 | } | ||
| 106 | |||
| 107 | enum __sk_action { | 99 | enum __sk_action { |
| 108 | __SK_DROP = 0, | 100 | __SK_DROP = 0, |
| 109 | __SK_PASS, | 101 | __SK_PASS, |
| @@ -521,13 +513,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) | |||
| 521 | if (!stab) | 513 | if (!stab) |
| 522 | return ERR_PTR(-ENOMEM); | 514 | return ERR_PTR(-ENOMEM); |
| 523 | 515 | ||
| 524 | /* mandatory map attributes */ | 516 | bpf_map_init_from_attr(&stab->map, attr); |
| 525 | stab->map.map_type = attr->map_type; | ||
| 526 | stab->map.key_size = attr->key_size; | ||
| 527 | stab->map.value_size = attr->value_size; | ||
| 528 | stab->map.max_entries = attr->max_entries; | ||
| 529 | stab->map.map_flags = attr->map_flags; | ||
| 530 | stab->map.numa_node = bpf_map_attr_numa_node(attr); | ||
| 531 | 517 | ||
| 532 | /* make sure page count doesn't overflow */ | 518 | /* make sure page count doesn't overflow */ |
| 533 | cost = (u64) stab->map.max_entries * sizeof(struct sock *); | 519 | cost = (u64) stab->map.max_entries * sizeof(struct sock *); |
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index a15bc636cc98..b0ecf43f5894 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c | |||
| @@ -88,14 +88,10 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) | |||
| 88 | if (cost >= U32_MAX - PAGE_SIZE) | 88 | if (cost >= U32_MAX - PAGE_SIZE) |
| 89 | goto free_smap; | 89 | goto free_smap; |
| 90 | 90 | ||
| 91 | smap->map.map_type = attr->map_type; | 91 | bpf_map_init_from_attr(&smap->map, attr); |
| 92 | smap->map.key_size = attr->key_size; | ||
| 93 | smap->map.value_size = value_size; | 92 | smap->map.value_size = value_size; |
| 94 | smap->map.max_entries = attr->max_entries; | ||
| 95 | smap->map.map_flags = attr->map_flags; | ||
| 96 | smap->n_buckets = n_buckets; | 93 | smap->n_buckets = n_buckets; |
| 97 | smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; | 94 | smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; |
| 98 | smap->map.numa_node = bpf_map_attr_numa_node(attr); | ||
| 99 | 95 | ||
| 100 | err = bpf_map_precharge_memlock(smap->map.pages); | 96 | err = bpf_map_precharge_memlock(smap->map.pages); |
| 101 | if (err) | 97 | if (err) |
| @@ -226,9 +222,33 @@ int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) | |||
| 226 | return 0; | 222 | return 0; |
| 227 | } | 223 | } |
| 228 | 224 | ||
| 229 | static int stack_map_get_next_key(struct bpf_map *map, void *key, void *next_key) | 225 | static int stack_map_get_next_key(struct bpf_map *map, void *key, |
| 226 | void *next_key) | ||
| 230 | { | 227 | { |
| 231 | return -EINVAL; | 228 | struct bpf_stack_map *smap = container_of(map, |
| 229 | struct bpf_stack_map, map); | ||
| 230 | u32 id; | ||
| 231 | |||
| 232 | WARN_ON_ONCE(!rcu_read_lock_held()); | ||
| 233 | |||
| 234 | if (!key) { | ||
| 235 | id = 0; | ||
| 236 | } else { | ||
| 237 | id = *(u32 *)key; | ||
| 238 | if (id >= smap->n_buckets || !smap->buckets[id]) | ||
| 239 | id = 0; | ||
| 240 | else | ||
| 241 | id++; | ||
| 242 | } | ||
| 243 | |||
| 244 | while (id < smap->n_buckets && !smap->buckets[id]) | ||
| 245 | id++; | ||
| 246 | |||
| 247 | if (id >= smap->n_buckets) | ||
| 248 | return -ENOENT; | ||
| 249 | |||
| 250 | *(u32 *)next_key = id; | ||
| 251 | return 0; | ||
| 232 | } | 252 | } |
| 233 | 253 | ||
| 234 | static int stack_map_update_elem(struct bpf_map *map, void *key, void *value, | 254 | static int stack_map_update_elem(struct bpf_map *map, void *key, void *value, |
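
`stack_map_get_next_key()` turns a previously unsupported operation into a simple walk over the populated buckets: advance past the given id, skip empty slots, and restart from bucket 0 when the key is NULL or stale. The standalone mock below (not kernel code) reproduces just that bucket walk so the loop can be run and inspected in isolation.

```c
/* Standalone mock of the bucket walk in stack_map_get_next_key(). */
#include <errno.h>
#include <stdio.h>

#define N_BUCKETS 8

static const void *buckets[N_BUCKETS] = {
	[1] = "a", [4] = "b", [5] = "c",	/* only these ids are populated */
};

static int get_next_key(const unsigned int *key, unsigned int *next_key)
{
	unsigned int id;

	if (!key || *key >= N_BUCKETS || !buckets[*key])
		id = 0;			/* missing/stale key: restart the walk */
	else
		id = *key + 1;

	while (id < N_BUCKETS && !buckets[id])
		id++;

	if (id >= N_BUCKETS)
		return -ENOENT;

	*next_key = id;
	return 0;
}

int main(void)
{
	unsigned int key, next;
	int err = get_next_key(NULL, &next);

	while (!err) {			/* prints ids 1, 4, 5 */
		printf("stack id %u\n", next);
		key = next;
		err = get_next_key(&key, &next);
	}
	return 0;
}
```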
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5cb783fc8224..e24aa3241387 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c | |||
| @@ -94,18 +94,34 @@ static int check_uarg_tail_zero(void __user *uaddr, | |||
| 94 | return 0; | 94 | return 0; |
| 95 | } | 95 | } |
| 96 | 96 | ||
| 97 | const struct bpf_map_ops bpf_map_offload_ops = { | ||
| 98 | .map_alloc = bpf_map_offload_map_alloc, | ||
| 99 | .map_free = bpf_map_offload_map_free, | ||
| 100 | }; | ||
| 101 | |||
| 97 | static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) | 102 | static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) |
| 98 | { | 103 | { |
| 104 | const struct bpf_map_ops *ops; | ||
| 99 | struct bpf_map *map; | 105 | struct bpf_map *map; |
| 106 | int err; | ||
| 100 | 107 | ||
| 101 | if (attr->map_type >= ARRAY_SIZE(bpf_map_types) || | 108 | if (attr->map_type >= ARRAY_SIZE(bpf_map_types)) |
| 102 | !bpf_map_types[attr->map_type]) | 109 | return ERR_PTR(-EINVAL); |
| 110 | ops = bpf_map_types[attr->map_type]; | ||
| 111 | if (!ops) | ||
| 103 | return ERR_PTR(-EINVAL); | 112 | return ERR_PTR(-EINVAL); |
| 104 | 113 | ||
| 105 | map = bpf_map_types[attr->map_type]->map_alloc(attr); | 114 | if (ops->map_alloc_check) { |
| 115 | err = ops->map_alloc_check(attr); | ||
| 116 | if (err) | ||
| 117 | return ERR_PTR(err); | ||
| 118 | } | ||
| 119 | if (attr->map_ifindex) | ||
| 120 | ops = &bpf_map_offload_ops; | ||
| 121 | map = ops->map_alloc(attr); | ||
| 106 | if (IS_ERR(map)) | 122 | if (IS_ERR(map)) |
| 107 | return map; | 123 | return map; |
| 108 | map->ops = bpf_map_types[attr->map_type]; | 124 | map->ops = ops; |
| 109 | map->map_type = attr->map_type; | 125 | map->map_type = attr->map_type; |
| 110 | return map; | 126 | return map; |
| 111 | } | 127 | } |
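
`find_and_alloc_map()` now runs the type's optional `->map_alloc_check` hook first and, when userspace sets `map_ifindex`, quietly substitutes `bpf_map_offload_ops` so the map is created on the device instead of the host. A sketch of a creation request that would take that path is below; the interface name is hypothetical, the header is assumed to carry the `map_ifindex` field added alongside this code, and error handling is minimal.

```c
/* Sketch: request a device-bound hash map by passing the target netdev's
 * ifindex at BPF_MAP_CREATE time (falls back to 0, i.e. a host map, if the
 * interface name does not resolve).
 */
#include <linux/bpf.h>
#include <net/if.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int create_offloaded_hash(const char *ifname)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.map_type    = BPF_MAP_TYPE_HASH;
	attr.key_size    = 4;
	attr.value_size  = 4;
	attr.max_entries = 64;
	attr.map_ifindex = if_nametoindex(ifname);	/* triggers bpf_map_offload_ops */

	return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
}

int main(int argc, char **argv)
{
	int fd = create_offloaded_hash(argc > 1 ? argv[1] : "eth0");

	return fd < 0 ? 1 : 0;
}
```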
| @@ -134,6 +150,16 @@ void bpf_map_area_free(void *area) | |||
| 134 | kvfree(area); | 150 | kvfree(area); |
| 135 | } | 151 | } |
| 136 | 152 | ||
| 153 | void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr) | ||
| 154 | { | ||
| 155 | map->map_type = attr->map_type; | ||
| 156 | map->key_size = attr->key_size; | ||
| 157 | map->value_size = attr->value_size; | ||
| 158 | map->max_entries = attr->max_entries; | ||
| 159 | map->map_flags = attr->map_flags; | ||
| 160 | map->numa_node = bpf_map_attr_numa_node(attr); | ||
| 161 | } | ||
| 162 | |||
| 137 | int bpf_map_precharge_memlock(u32 pages) | 163 | int bpf_map_precharge_memlock(u32 pages) |
| 138 | { | 164 | { |
| 139 | struct user_struct *user = get_current_user(); | 165 | struct user_struct *user = get_current_user(); |
| @@ -189,16 +215,25 @@ static int bpf_map_alloc_id(struct bpf_map *map) | |||
| 189 | return id > 0 ? 0 : id; | 215 | return id > 0 ? 0 : id; |
| 190 | } | 216 | } |
| 191 | 217 | ||
| 192 | static void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock) | 218 | void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock) |
| 193 | { | 219 | { |
| 194 | unsigned long flags; | 220 | unsigned long flags; |
| 195 | 221 | ||
| 222 | /* Offloaded maps are removed from the IDR store when their device | ||
| 223 | * disappears - even if someone holds an fd to them they are unusable, | ||
| 224 | * the memory is gone, all ops will fail; they are simply waiting for | ||
| 225 | * refcnt to drop to be freed. | ||
| 226 | */ | ||
| 227 | if (!map->id) | ||
| 228 | return; | ||
| 229 | |||
| 196 | if (do_idr_lock) | 230 | if (do_idr_lock) |
| 197 | spin_lock_irqsave(&map_idr_lock, flags); | 231 | spin_lock_irqsave(&map_idr_lock, flags); |
| 198 | else | 232 | else |
| 199 | __acquire(&map_idr_lock); | 233 | __acquire(&map_idr_lock); |
| 200 | 234 | ||
| 201 | idr_remove(&map_idr, map->id); | 235 | idr_remove(&map_idr, map->id); |
| 236 | map->id = 0; | ||
| 202 | 237 | ||
| 203 | if (do_idr_lock) | 238 | if (do_idr_lock) |
| 204 | spin_unlock_irqrestore(&map_idr_lock, flags); | 239 | spin_unlock_irqrestore(&map_idr_lock, flags); |
| @@ -378,7 +413,7 @@ static int bpf_obj_name_cpy(char *dst, const char *src) | |||
| 378 | return 0; | 413 | return 0; |
| 379 | } | 414 | } |
| 380 | 415 | ||
| 381 | #define BPF_MAP_CREATE_LAST_FIELD map_name | 416 | #define BPF_MAP_CREATE_LAST_FIELD map_ifindex |
| 382 | /* called via syscall */ | 417 | /* called via syscall */ |
| 383 | static int map_create(union bpf_attr *attr) | 418 | static int map_create(union bpf_attr *attr) |
| 384 | { | 419 | { |
| @@ -566,8 +601,10 @@ static int map_lookup_elem(union bpf_attr *attr) | |||
| 566 | if (!value) | 601 | if (!value) |
| 567 | goto free_key; | 602 | goto free_key; |
| 568 | 603 | ||
| 569 | if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || | 604 | if (bpf_map_is_dev_bound(map)) { |
| 570 | map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { | 605 | err = bpf_map_offload_lookup_elem(map, key, value); |
| 606 | } else if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || | ||
| 607 | map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { | ||
| 571 | err = bpf_percpu_hash_copy(map, key, value); | 608 | err = bpf_percpu_hash_copy(map, key, value); |
| 572 | } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { | 609 | } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { |
| 573 | err = bpf_percpu_array_copy(map, key, value); | 610 | err = bpf_percpu_array_copy(map, key, value); |
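
The lookup path gains a `bpf_map_is_dev_bound()` branch that forwards the request to the device via `bpf_map_offload_lookup_elem()`; callers keep using the exact same syscall surface either way. A small fragment to that effect is below, written against libbpf's thin wrappers; the `<bpf/bpf.h>` header and library name assume a packaged libbpf and may differ on older trees.

```c
/* Fragment: element lookup is unchanged for the caller, whether the map
 * lives on the host or on an offload-capable device.
 */
#include <bpf/bpf.h>
#include <stdio.h>

static void show_counter(int map_fd, __u32 key)
{
	__u64 value = 0;

	if (bpf_map_lookup_elem(map_fd, &key, &value))	/* wraps BPF_MAP_LOOKUP_ELEM */
		fprintf(stderr, "lookup of key %u failed\n", key);
	else
		printf("key %u -> %llu\n", key, (unsigned long long)value);
}
```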
| @@ -654,7 +691,10 @@ static int map_update_elem(union bpf_attr *attr) | |||
| 654 | goto free_value; | 691 | goto free_value; |
| 655 | 692 | ||
| 656 | /* Need to create a kthread, thus must support schedule */ | 693 | /* Need to create a kthread, thus must support schedule */ |
| 657 | if (map->map_type == BPF_MAP_TYPE_CPUMAP) { | 694 | if (bpf_map_is_dev_bound(map)) { |
| 695 | err = bpf_map_offload_update_elem(map, key, value, attr->flags); | ||
| 696 | goto out; | ||
| 697 | } else if (map->map_type == BPF_MAP_TYPE_CPUMAP) { | ||
| 658 | err = map->ops->map_update_elem(map, key, value, attr->flags); | 698 | err = map->ops->map_update_elem(map, key, value, attr->flags); |
| 659 | goto out; | 699 | goto out; |
| 660 | } | 700 | } |
| @@ -669,10 +709,7 @@ static int map_update_elem(union bpf_attr *attr) | |||
| 669 | err = bpf_percpu_hash_update(map, key, value, attr->flags); | 709 | err = bpf_percpu_hash_update(map, key, value, attr->flags); |
| 670 | } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { | 710 | } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { |
| 671 | err = bpf_percpu_array_update(map, key, value, attr->flags); | 711 | err = bpf_percpu_array_update(map, key, value, attr->flags); |
| 672 | } else if (map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || | 712 | } else if (IS_FD_ARRAY(map)) { |
| 673 | map->map_type == BPF_MAP_TYPE_PROG_ARRAY || | ||
| 674 | map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || | ||
| 675 | map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) { | ||
| 676 | rcu_read_lock(); | 713 | rcu_read_lock(); |
| 677 | err = bpf_fd_array_map_update_elem(map, f.file, key, value, | 714 | err = bpf_fd_array_map_update_elem(map, f.file, key, value, |
| 678 | attr->flags); | 715 | attr->flags); |
| @@ -731,6 +768,11 @@ static int map_delete_elem(union bpf_attr *attr) | |||
| 731 | goto err_put; | 768 | goto err_put; |
| 732 | } | 769 | } |
| 733 | 770 | ||
| 771 | if (bpf_map_is_dev_bound(map)) { | ||
| 772 | err = bpf_map_offload_delete_elem(map, key); | ||
| 773 | goto out; | ||
| 774 | } | ||
| 775 | |||
| 734 | preempt_disable(); | 776 | preempt_disable(); |
| 735 | __this_cpu_inc(bpf_prog_active); | 777 | __this_cpu_inc(bpf_prog_active); |
| 736 | rcu_read_lock(); | 778 | rcu_read_lock(); |
| @@ -738,7 +780,7 @@ static int map_delete_elem(union bpf_attr *attr) | |||
| 738 | rcu_read_unlock(); | 780 | rcu_read_unlock(); |
| 739 | __this_cpu_dec(bpf_prog_active); | 781 | __this_cpu_dec(bpf_prog_active); |
| 740 | preempt_enable(); | 782 | preempt_enable(); |
| 741 | 783 | out: | |
| 742 | if (!err) | 784 | if (!err) |
| 743 | trace_bpf_map_delete_elem(map, ufd, key); | 785 | trace_bpf_map_delete_elem(map, ufd, key); |
| 744 | kfree(key); | 786 | kfree(key); |
| @@ -788,9 +830,15 @@ static int map_get_next_key(union bpf_attr *attr) | |||
| 788 | if (!next_key) | 830 | if (!next_key) |
| 789 | goto free_key; | 831 | goto free_key; |
| 790 | 832 | ||
| 833 | if (bpf_map_is_dev_bound(map)) { | ||
| 834 | err = bpf_map_offload_get_next_key(map, key, next_key); | ||
| 835 | goto out; | ||
| 836 | } | ||
| 837 | |||
| 791 | rcu_read_lock(); | 838 | rcu_read_lock(); |
| 792 | err = map->ops->map_get_next_key(map, key, next_key); | 839 | err = map->ops->map_get_next_key(map, key, next_key); |
| 793 | rcu_read_unlock(); | 840 | rcu_read_unlock(); |
| 841 | out: | ||
| 794 | if (err) | 842 | if (err) |
| 795 | goto free_next_key; | 843 | goto free_next_key; |
| 796 | 844 | ||
| @@ -905,9 +953,13 @@ static int bpf_prog_alloc_id(struct bpf_prog *prog) | |||
| 905 | return id > 0 ? 0 : id; | 953 | return id > 0 ? 0 : id; |
| 906 | } | 954 | } |
| 907 | 955 | ||
| 908 | static void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock) | 956 | void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock) |
| 909 | { | 957 | { |
| 910 | /* cBPF to eBPF migrations are currently not in the idr store. */ | 958 | /* cBPF to eBPF migrations are currently not in the idr store. |
| 959 | * Offloaded programs are removed from the store when their device | ||
| 960 | * disappears - even if someone grabs an fd to them they are unusable, | ||
| 961 | * simply waiting for refcnt to drop to be freed. | ||
| 962 | */ | ||
| 911 | if (!prog->aux->id) | 963 | if (!prog->aux->id) |
| 912 | return; | 964 | return; |
| 913 | 965 | ||
| @@ -917,6 +969,7 @@ static void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock) | |||
| 917 | __acquire(&prog_idr_lock); | 969 | __acquire(&prog_idr_lock); |
| 918 | 970 | ||
| 919 | idr_remove(&prog_idr, prog->aux->id); | 971 | idr_remove(&prog_idr, prog->aux->id); |
| 972 | prog->aux->id = 0; | ||
| 920 | 973 | ||
| 921 | if (do_idr_lock) | 974 | if (do_idr_lock) |
| 922 | spin_unlock_bh(&prog_idr_lock); | 975 | spin_unlock_bh(&prog_idr_lock); |
| @@ -937,10 +990,16 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu) | |||
| 937 | static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) | 990 | static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) |
| 938 | { | 991 | { |
| 939 | if (atomic_dec_and_test(&prog->aux->refcnt)) { | 992 | if (atomic_dec_and_test(&prog->aux->refcnt)) { |
| 993 | int i; | ||
| 994 | |||
| 940 | trace_bpf_prog_put_rcu(prog); | 995 | trace_bpf_prog_put_rcu(prog); |
| 941 | /* bpf_prog_free_id() must be called first */ | 996 | /* bpf_prog_free_id() must be called first */ |
| 942 | bpf_prog_free_id(prog, do_idr_lock); | 997 | bpf_prog_free_id(prog, do_idr_lock); |
| 998 | |||
| 999 | for (i = 0; i < prog->aux->func_cnt; i++) | ||
| 1000 | bpf_prog_kallsyms_del(prog->aux->func[i]); | ||
| 943 | bpf_prog_kallsyms_del(prog); | 1001 | bpf_prog_kallsyms_del(prog); |
| 1002 | |||
| 944 | call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); | 1003 | call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); |
| 945 | } | 1004 | } |
| 946 | } | 1005 | } |
| @@ -1151,6 +1210,8 @@ static int bpf_prog_load(union bpf_attr *attr) | |||
| 1151 | if (!prog) | 1210 | if (!prog) |
| 1152 | return -ENOMEM; | 1211 | return -ENOMEM; |
| 1153 | 1212 | ||
| 1213 | prog->aux->offload_requested = !!attr->prog_ifindex; | ||
| 1214 | |||
| 1154 | err = security_bpf_prog_alloc(prog->aux); | 1215 | err = security_bpf_prog_alloc(prog->aux); |
| 1155 | if (err) | 1216 | if (err) |
| 1156 | goto free_prog_nouncharge; | 1217 | goto free_prog_nouncharge; |
| @@ -1172,7 +1233,7 @@ static int bpf_prog_load(union bpf_attr *attr) | |||
| 1172 | atomic_set(&prog->aux->refcnt, 1); | 1233 | atomic_set(&prog->aux->refcnt, 1); |
| 1173 | prog->gpl_compatible = is_gpl ? 1 : 0; | 1234 | prog->gpl_compatible = is_gpl ? 1 : 0; |
| 1174 | 1235 | ||
| 1175 | if (attr->prog_ifindex) { | 1236 | if (bpf_prog_is_dev_bound(prog->aux)) { |
| 1176 | err = bpf_prog_offload_init(prog, attr); | 1237 | err = bpf_prog_offload_init(prog, attr); |
| 1177 | if (err) | 1238 | if (err) |
| 1178 | goto free_prog; | 1239 | goto free_prog; |
| @@ -1194,7 +1255,8 @@ static int bpf_prog_load(union bpf_attr *attr) | |||
| 1194 | goto free_used_maps; | 1255 | goto free_used_maps; |
| 1195 | 1256 | ||
| 1196 | /* eBPF program is ready to be JITed */ | 1257 | /* eBPF program is ready to be JITed */ |
| 1197 | prog = bpf_prog_select_runtime(prog, &err); | 1258 | if (!prog->bpf_func) |
| 1259 | prog = bpf_prog_select_runtime(prog, &err); | ||
| 1198 | if (err < 0) | 1260 | if (err < 0) |
| 1199 | goto free_used_maps; | 1261 | goto free_used_maps; |
| 1200 | 1262 | ||
| @@ -1439,6 +1501,8 @@ static int bpf_prog_test_run(const union bpf_attr *attr, | |||
| 1439 | struct bpf_prog *prog; | 1501 | struct bpf_prog *prog; |
| 1440 | int ret = -ENOTSUPP; | 1502 | int ret = -ENOTSUPP; |
| 1441 | 1503 | ||
| 1504 | if (!capable(CAP_SYS_ADMIN)) | ||
| 1505 | return -EPERM; | ||
| 1442 | if (CHECK_ATTR(BPF_PROG_TEST_RUN)) | 1506 | if (CHECK_ATTR(BPF_PROG_TEST_RUN)) |
| 1443 | return -EINVAL; | 1507 | return -EINVAL; |
| 1444 | 1508 | ||
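The hunk above puts BPF_PROG_TEST_RUN behind CAP_SYS_ADMIN, so an unprivileged caller now gets -EPERM before any attribute validation even starts. A minimal userspace sketch of driving the command against an already-loaded program (the helper name `test_run_once` is invented; the `attr.test` fields are the ones in the uapi `linux/bpf.h` of this kernel):

```c
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

/* Run one test invocation of an already-loaded program.
 * prog_fd, data and data_size are supplied by the caller. */
static int test_run_once(int prog_fd, void *data, __u32 data_size)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.test.prog_fd = prog_fd;
	attr.test.data_in = (__u64)(unsigned long)data;
	attr.test.data_size_in = data_size;
	attr.test.repeat = 1;

	if (syscall(__NR_bpf, BPF_PROG_TEST_RUN, &attr, sizeof(attr)) < 0) {
		if (errno == EPERM)
			fprintf(stderr, "BPF_PROG_TEST_RUN needs CAP_SYS_ADMIN\n");
		return -errno;
	}
	/* on success the kernel writes the results back into the same attr */
	printf("retval=%u duration=%uns\n", attr.test.retval, attr.test.duration);
	return 0;
}
```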
| @@ -1551,6 +1615,67 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr) | |||
| 1551 | return fd; | 1615 | return fd; |
| 1552 | } | 1616 | } |
| 1553 | 1617 | ||
| 1618 | static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog, | ||
| 1619 | unsigned long addr) | ||
| 1620 | { | ||
| 1621 | int i; | ||
| 1622 | |||
| 1623 | for (i = 0; i < prog->aux->used_map_cnt; i++) | ||
| 1624 | if (prog->aux->used_maps[i] == (void *)addr) | ||
| 1625 | return prog->aux->used_maps[i]; | ||
| 1626 | return NULL; | ||
| 1627 | } | ||
| 1628 | |||
| 1629 | static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog) | ||
| 1630 | { | ||
| 1631 | const struct bpf_map *map; | ||
| 1632 | struct bpf_insn *insns; | ||
| 1633 | u64 imm; | ||
| 1634 | int i; | ||
| 1635 | |||
| 1636 | insns = kmemdup(prog->insnsi, bpf_prog_insn_size(prog), | ||
| 1637 | GFP_USER); | ||
| 1638 | if (!insns) | ||
| 1639 | return insns; | ||
| 1640 | |||
| 1641 | for (i = 0; i < prog->len; i++) { | ||
| 1642 | if (insns[i].code == (BPF_JMP | BPF_TAIL_CALL)) { | ||
| 1643 | insns[i].code = BPF_JMP | BPF_CALL; | ||
| 1644 | insns[i].imm = BPF_FUNC_tail_call; | ||
| 1645 | /* fall-through */ | ||
| 1646 | } | ||
| 1647 | if (insns[i].code == (BPF_JMP | BPF_CALL) || | ||
| 1648 | insns[i].code == (BPF_JMP | BPF_CALL_ARGS)) { | ||
| 1649 | if (insns[i].code == (BPF_JMP | BPF_CALL_ARGS)) | ||
| 1650 | insns[i].code = BPF_JMP | BPF_CALL; | ||
| 1651 | if (!bpf_dump_raw_ok()) | ||
| 1652 | insns[i].imm = 0; | ||
| 1653 | continue; | ||
| 1654 | } | ||
| 1655 | |||
| 1656 | if (insns[i].code != (BPF_LD | BPF_IMM | BPF_DW)) | ||
| 1657 | continue; | ||
| 1658 | |||
| 1659 | imm = ((u64)insns[i + 1].imm << 32) | (u32)insns[i].imm; | ||
| 1660 | map = bpf_map_from_imm(prog, imm); | ||
| 1661 | if (map) { | ||
| 1662 | insns[i].src_reg = BPF_PSEUDO_MAP_FD; | ||
| 1663 | insns[i].imm = map->id; | ||
| 1664 | insns[i + 1].imm = 0; | ||
| 1665 | continue; | ||
| 1666 | } | ||
| 1667 | |||
| 1668 | if (!bpf_dump_raw_ok() && | ||
| 1669 | imm == (unsigned long)prog->aux) { | ||
| 1670 | insns[i].imm = 0; | ||
| 1671 | insns[i + 1].imm = 0; | ||
| 1672 | continue; | ||
| 1673 | } | ||
| 1674 | } | ||
| 1675 | |||
| 1676 | return insns; | ||
| 1677 | } | ||
| 1678 | |||
| 1554 | static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, | 1679 | static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, |
| 1555 | const union bpf_attr *attr, | 1680 | const union bpf_attr *attr, |
| 1556 | union bpf_attr __user *uattr) | 1681 | union bpf_attr __user *uattr) |
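The new bpf_insn_prepare_dump() above sanitizes the translated image before it is handed to userspace: BPF_CALL_ARGS is folded back into a plain BPF_CALL, call immediates are zeroed unless bpf_dump_raw_ok(), and the two-instruction `BPF_LD | BPF_IMM | BPF_DW` load of a map pointer is rewritten into a BPF_PSEUDO_MAP_FD load carrying the map ID. A hedged sketch of how a consumer of such a dump could decode one instruction (the function name `decode_one` is invented; `struct bpf_insn` and `BPF_PSEUDO_MAP_FD` come from the uapi header):

```c
#include <stdio.h>
#include <linux/bpf.h>

/* Decode one instruction of a sanitized xlated dump.
 * Returns the number of instructions consumed (1 or 2). */
static int decode_one(const struct bpf_insn *insn)
{
	if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) {
		__u64 imm = ((__u64)insn[1].imm << 32) | (__u32)insn[0].imm;

		if (insn[0].src_reg == BPF_PSEUDO_MAP_FD)
			/* sanitized form: the low 32 bits carry the map ID */
			printf("r%d = map[id:%u]\n", insn[0].dst_reg,
			       (__u32)insn[0].imm);
		else
			printf("r%d = 0x%llx ll\n", insn[0].dst_reg,
			       (unsigned long long)imm);
		return 2;
	}
	if (insn[0].code == (BPF_JMP | BPF_CALL)) {
		/* imm may have been zeroed when !bpf_dump_raw_ok() */
		printf("call #%d\n", insn[0].imm);
		return 1;
	}
	/* everything else is left untouched by the sanitizer */
	return 1;
}
```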
| @@ -1598,24 +1723,51 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, | |||
| 1598 | goto done; | 1723 | goto done; |
| 1599 | } | 1724 | } |
| 1600 | 1725 | ||
| 1601 | ulen = info.jited_prog_len; | ||
| 1602 | info.jited_prog_len = prog->jited_len; | ||
| 1603 | if (info.jited_prog_len && ulen) { | ||
| 1604 | uinsns = u64_to_user_ptr(info.jited_prog_insns); | ||
| 1605 | ulen = min_t(u32, info.jited_prog_len, ulen); | ||
| 1606 | if (copy_to_user(uinsns, prog->bpf_func, ulen)) | ||
| 1607 | return -EFAULT; | ||
| 1608 | } | ||
| 1609 | |||
| 1610 | ulen = info.xlated_prog_len; | 1726 | ulen = info.xlated_prog_len; |
| 1611 | info.xlated_prog_len = bpf_prog_insn_size(prog); | 1727 | info.xlated_prog_len = bpf_prog_insn_size(prog); |
| 1612 | if (info.xlated_prog_len && ulen) { | 1728 | if (info.xlated_prog_len && ulen) { |
| 1729 | struct bpf_insn *insns_sanitized; | ||
| 1730 | bool fault; | ||
| 1731 | |||
| 1732 | if (prog->blinded && !bpf_dump_raw_ok()) { | ||
| 1733 | info.xlated_prog_insns = 0; | ||
| 1734 | goto done; | ||
| 1735 | } | ||
| 1736 | insns_sanitized = bpf_insn_prepare_dump(prog); | ||
| 1737 | if (!insns_sanitized) | ||
| 1738 | return -ENOMEM; | ||
| 1613 | uinsns = u64_to_user_ptr(info.xlated_prog_insns); | 1739 | uinsns = u64_to_user_ptr(info.xlated_prog_insns); |
| 1614 | ulen = min_t(u32, info.xlated_prog_len, ulen); | 1740 | ulen = min_t(u32, info.xlated_prog_len, ulen); |
| 1615 | if (copy_to_user(uinsns, prog->insnsi, ulen)) | 1741 | fault = copy_to_user(uinsns, insns_sanitized, ulen); |
| 1742 | kfree(insns_sanitized); | ||
| 1743 | if (fault) | ||
| 1616 | return -EFAULT; | 1744 | return -EFAULT; |
| 1617 | } | 1745 | } |
| 1618 | 1746 | ||
| 1747 | if (bpf_prog_is_dev_bound(prog->aux)) { | ||
| 1748 | err = bpf_prog_offload_info_fill(&info, prog); | ||
| 1749 | if (err) | ||
| 1750 | return err; | ||
| 1751 | goto done; | ||
| 1752 | } | ||
| 1753 | |||
| 1754 | /* NOTE: the following code is supposed to be skipped for offload. | ||
| 1755 | * bpf_prog_offload_info_fill() is the place to fill similar fields | ||
| 1756 | * for offload. | ||
| 1757 | */ | ||
| 1758 | ulen = info.jited_prog_len; | ||
| 1759 | info.jited_prog_len = prog->jited_len; | ||
| 1760 | if (info.jited_prog_len && ulen) { | ||
| 1761 | if (bpf_dump_raw_ok()) { | ||
| 1762 | uinsns = u64_to_user_ptr(info.jited_prog_insns); | ||
| 1763 | ulen = min_t(u32, info.jited_prog_len, ulen); | ||
| 1764 | if (copy_to_user(uinsns, prog->bpf_func, ulen)) | ||
| 1765 | return -EFAULT; | ||
| 1766 | } else { | ||
| 1767 | info.jited_prog_insns = 0; | ||
| 1768 | } | ||
| 1769 | } | ||
| 1770 | |||
| 1619 | done: | 1771 | done: |
| 1620 | if (copy_to_user(uinfo, &info, info_len) || | 1772 | if (copy_to_user(uinfo, &info, info_len) || |
| 1621 | put_user(info_len, &uattr->info.info_len)) | 1773 | put_user(info_len, &uattr->info.info_len)) |
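With the reordering above, the xlated image returned by bpf_prog_get_info_by_fd() is always the sanitized copy, the JITed image is exposed only when bpf_dump_raw_ok(), and dev-bound programs are filled in by bpf_prog_offload_info_fill() instead. A sketch of the usual two-pass query from userspace, assuming libbpf's bpf_obj_get_info_by_fd() wrapper (the header path varies by libbpf version and the helper name `fetch_xlated` is invented):

```c
#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <linux/bpf.h>
#include <bpf/bpf.h>		/* libbpf: bpf_obj_get_info_by_fd() */

/* Fetch the sanitized xlated instructions of prog_fd.
 * On success the caller owns *insns and *n_bytes holds its size. */
static int fetch_xlated(int prog_fd, struct bpf_insn **insns, __u32 *n_bytes)
{
	struct bpf_prog_info info = {};
	__u32 info_len = sizeof(info);
	int err;

	/* first pass: learn how large the xlated image is */
	err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len);
	if (err)
		return err;

	*n_bytes = info.xlated_prog_len;
	*insns = calloc(1, *n_bytes);
	if (!*insns)
		return -ENOMEM;

	/* second pass: point the kernel at our buffer */
	memset(&info, 0, sizeof(info));
	info.xlated_prog_len = *n_bytes;
	info.xlated_prog_insns = (__u64)(unsigned long)*insns;
	info_len = sizeof(info);
	err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len);
	if (err)
		free(*insns);
	/* for a blinded program without bpf_dump_raw_ok() the kernel
	 * reports xlated_prog_insns == 0 instead of copying the image */
	return err;
}
```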
| @@ -1646,6 +1798,12 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map, | |||
| 1646 | info.map_flags = map->map_flags; | 1798 | info.map_flags = map->map_flags; |
| 1647 | memcpy(info.name, map->name, sizeof(map->name)); | 1799 | memcpy(info.name, map->name, sizeof(map->name)); |
| 1648 | 1800 | ||
| 1801 | if (bpf_map_is_dev_bound(map)) { | ||
| 1802 | err = bpf_map_offload_info_fill(&info, map); | ||
| 1803 | if (err) | ||
| 1804 | return err; | ||
| 1805 | } | ||
| 1806 | |||
| 1649 | if (copy_to_user(uinfo, &info, info_len) || | 1807 | if (copy_to_user(uinfo, &info, info_len) || |
| 1650 | put_user(info_len, &uattr->info.info_len)) | 1808 | put_user(info_len, &uattr->info.info_len)) |
| 1651 | return -EFAULT; | 1809 | return -EFAULT; |
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 13551e623501..5fb69a85d967 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c | |||
| @@ -20,6 +20,8 @@ | |||
| 20 | #include <linux/file.h> | 20 | #include <linux/file.h> |
| 21 | #include <linux/vmalloc.h> | 21 | #include <linux/vmalloc.h> |
| 22 | #include <linux/stringify.h> | 22 | #include <linux/stringify.h> |
| 23 | #include <linux/bsearch.h> | ||
| 24 | #include <linux/sort.h> | ||
| 23 | 25 | ||
| 24 | #include "disasm.h" | 26 | #include "disasm.h" |
| 25 | 27 | ||
| @@ -167,11 +169,11 @@ struct bpf_call_arg_meta { | |||
| 167 | static DEFINE_MUTEX(bpf_verifier_lock); | 169 | static DEFINE_MUTEX(bpf_verifier_lock); |
| 168 | 170 | ||
| 169 | /* log_level controls verbosity level of eBPF verifier. | 171 | /* log_level controls verbosity level of eBPF verifier. |
| 170 | * verbose() is used to dump the verification trace to the log, so the user | 172 | * bpf_verifier_log_write() is used to dump the verification trace to the log, |
| 171 | * can figure out what's wrong with the program | 173 | * so the user can figure out what's wrong with the program |
| 172 | */ | 174 | */ |
| 173 | static __printf(2, 3) void verbose(struct bpf_verifier_env *env, | 175 | __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, |
| 174 | const char *fmt, ...) | 176 | const char *fmt, ...) |
| 175 | { | 177 | { |
| 176 | struct bpf_verifer_log *log = &env->log; | 178 | struct bpf_verifer_log *log = &env->log; |
| 177 | unsigned int n; | 179 | unsigned int n; |
| @@ -195,6 +197,14 @@ static __printf(2, 3) void verbose(struct bpf_verifier_env *env, | |||
| 195 | else | 197 | else |
| 196 | log->ubuf = NULL; | 198 | log->ubuf = NULL; |
| 197 | } | 199 | } |
| 200 | EXPORT_SYMBOL_GPL(bpf_verifier_log_write); | ||
| 201 | /* Historically bpf_verifier_log_write was called verbose, but the name was too | ||
| 202 | * generic for symbol export. The function was renamed, but not the calls in | ||
| 203 | * the verifier to avoid complicating backports. Hence the alias below. | ||
| 204 | */ | ||
| 205 | static __printf(2, 3) void verbose(struct bpf_verifier_env *env, | ||
| 206 | const char *fmt, ...) | ||
| 207 | __attribute__((alias("bpf_verifier_log_write"))); | ||
| 198 | 208 | ||
| 199 | static bool type_is_pkt_pointer(enum bpf_reg_type type) | 209 | static bool type_is_pkt_pointer(enum bpf_reg_type type) |
| 200 | { | 210 | { |
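The hunk above exports the verifier's log writer under a less generic symbol while keeping every caller in this file on the old verbose() name through a GCC alias, so pending backports that touch verbose() call sites still apply. A minimal standalone illustration of the same aliasing trick (all names here are invented):

```c
#include <stdarg.h>
#include <stdio.h>

/* The symbol meant for external users (think EXPORT_SYMBOL_GPL above). */
void mylib_log_write(const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	vfprintf(stderr, fmt, args);
	va_end(args);
}

/* Internal call sites keep the old, shorter name; the alias makes it
 * the very same function, so none of them need to change. */
static void log_write(const char *fmt, ...)
	__attribute__((alias("mylib_log_write")));

int main(void)
{
	log_write("value=%d\n", 42);	/* resolves to mylib_log_write() */
	return 0;
}
```

Because the alias is a second name for the same definition rather than a wrapper, there is no extra call overhead and the two names can never drift apart.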
| @@ -216,23 +226,48 @@ static const char * const reg_type_str[] = { | |||
| 216 | [PTR_TO_PACKET_END] = "pkt_end", | 226 | [PTR_TO_PACKET_END] = "pkt_end", |
| 217 | }; | 227 | }; |
| 218 | 228 | ||
| 229 | static void print_liveness(struct bpf_verifier_env *env, | ||
| 230 | enum bpf_reg_liveness live) | ||
| 231 | { | ||
| 232 | if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN)) | ||
| 233 | verbose(env, "_"); | ||
| 234 | if (live & REG_LIVE_READ) | ||
| 235 | verbose(env, "r"); | ||
| 236 | if (live & REG_LIVE_WRITTEN) | ||
| 237 | verbose(env, "w"); | ||
| 238 | } | ||
| 239 | |||
| 240 | static struct bpf_func_state *func(struct bpf_verifier_env *env, | ||
| 241 | const struct bpf_reg_state *reg) | ||
| 242 | { | ||
| 243 | struct bpf_verifier_state *cur = env->cur_state; | ||
| 244 | |||
| 245 | return cur->frame[reg->frameno]; | ||
| 246 | } | ||
| 247 | |||
| 219 | static void print_verifier_state(struct bpf_verifier_env *env, | 248 | static void print_verifier_state(struct bpf_verifier_env *env, |
| 220 | struct bpf_verifier_state *state) | 249 | const struct bpf_func_state *state) |
| 221 | { | 250 | { |
| 222 | struct bpf_reg_state *reg; | 251 | const struct bpf_reg_state *reg; |
| 223 | enum bpf_reg_type t; | 252 | enum bpf_reg_type t; |
| 224 | int i; | 253 | int i; |
| 225 | 254 | ||
| 255 | if (state->frameno) | ||
| 256 | verbose(env, " frame%d:", state->frameno); | ||
| 226 | for (i = 0; i < MAX_BPF_REG; i++) { | 257 | for (i = 0; i < MAX_BPF_REG; i++) { |
| 227 | reg = &state->regs[i]; | 258 | reg = &state->regs[i]; |
| 228 | t = reg->type; | 259 | t = reg->type; |
| 229 | if (t == NOT_INIT) | 260 | if (t == NOT_INIT) |
| 230 | continue; | 261 | continue; |
| 231 | verbose(env, " R%d=%s", i, reg_type_str[t]); | 262 | verbose(env, " R%d", i); |
| 263 | print_liveness(env, reg->live); | ||
| 264 | verbose(env, "=%s", reg_type_str[t]); | ||
| 232 | if ((t == SCALAR_VALUE || t == PTR_TO_STACK) && | 265 | if ((t == SCALAR_VALUE || t == PTR_TO_STACK) && |
| 233 | tnum_is_const(reg->var_off)) { | 266 | tnum_is_const(reg->var_off)) { |
| 234 | /* reg->off should be 0 for SCALAR_VALUE */ | 267 | /* reg->off should be 0 for SCALAR_VALUE */ |
| 235 | verbose(env, "%lld", reg->var_off.value + reg->off); | 268 | verbose(env, "%lld", reg->var_off.value + reg->off); |
| 269 | if (t == PTR_TO_STACK) | ||
| 270 | verbose(env, ",call_%d", func(env, reg)->callsite); | ||
| 236 | } else { | 271 | } else { |
| 237 | verbose(env, "(id=%d", reg->id); | 272 | verbose(env, "(id=%d", reg->id); |
| 238 | if (t != SCALAR_VALUE) | 273 | if (t != SCALAR_VALUE) |
| @@ -277,16 +312,21 @@ static void print_verifier_state(struct bpf_verifier_env *env, | |||
| 277 | } | 312 | } |
| 278 | } | 313 | } |
| 279 | for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { | 314 | for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { |
| 280 | if (state->stack[i].slot_type[0] == STACK_SPILL) | 315 | if (state->stack[i].slot_type[0] == STACK_SPILL) { |
| 281 | verbose(env, " fp%d=%s", | 316 | verbose(env, " fp%d", |
| 282 | -MAX_BPF_STACK + i * BPF_REG_SIZE, | 317 | (-i - 1) * BPF_REG_SIZE); |
| 318 | print_liveness(env, state->stack[i].spilled_ptr.live); | ||
| 319 | verbose(env, "=%s", | ||
| 283 | reg_type_str[state->stack[i].spilled_ptr.type]); | 320 | reg_type_str[state->stack[i].spilled_ptr.type]); |
| 321 | } | ||
| 322 | if (state->stack[i].slot_type[0] == STACK_ZERO) | ||
| 323 | verbose(env, " fp%d=0", (-i - 1) * BPF_REG_SIZE); | ||
| 284 | } | 324 | } |
| 285 | verbose(env, "\n"); | 325 | verbose(env, "\n"); |
| 286 | } | 326 | } |
| 287 | 327 | ||
| 288 | static int copy_stack_state(struct bpf_verifier_state *dst, | 328 | static int copy_stack_state(struct bpf_func_state *dst, |
| 289 | const struct bpf_verifier_state *src) | 329 | const struct bpf_func_state *src) |
| 290 | { | 330 | { |
| 291 | if (!src->stack) | 331 | if (!src->stack) |
| 292 | return 0; | 332 | return 0; |
| @@ -302,13 +342,13 @@ static int copy_stack_state(struct bpf_verifier_state *dst, | |||
| 302 | 342 | ||
| 303 | /* do_check() starts with zero-sized stack in struct bpf_verifier_state to | 343 | /* do_check() starts with zero-sized stack in struct bpf_verifier_state to |
| 304 | * make it consume minimal amount of memory. check_stack_write() access from | 344 | * make it consume minimal amount of memory. check_stack_write() access from |
| 305 | * the program calls into realloc_verifier_state() to grow the stack size. | 345 | * the program calls into realloc_func_state() to grow the stack size. |
| 306 | * Note there is a non-zero 'parent' pointer inside bpf_verifier_state | 346 | * Note there is a non-zero 'parent' pointer inside bpf_verifier_state |
| 307 | * which this function copies over. It points to previous bpf_verifier_state | 347 | * which this function copies over. It points to previous bpf_verifier_state |
| 308 | * which is never reallocated | 348 | * which is never reallocated |
| 309 | */ | 349 | */ |
| 310 | static int realloc_verifier_state(struct bpf_verifier_state *state, int size, | 350 | static int realloc_func_state(struct bpf_func_state *state, int size, |
| 311 | bool copy_old) | 351 | bool copy_old) |
| 312 | { | 352 | { |
| 313 | u32 old_size = state->allocated_stack; | 353 | u32 old_size = state->allocated_stack; |
| 314 | struct bpf_stack_state *new_stack; | 354 | struct bpf_stack_state *new_stack; |
| @@ -341,10 +381,23 @@ static int realloc_verifier_state(struct bpf_verifier_state *state, int size, | |||
| 341 | return 0; | 381 | return 0; |
| 342 | } | 382 | } |
| 343 | 383 | ||
| 384 | static void free_func_state(struct bpf_func_state *state) | ||
| 385 | { | ||
| 386 | if (!state) | ||
| 387 | return; | ||
| 388 | kfree(state->stack); | ||
| 389 | kfree(state); | ||
| 390 | } | ||
| 391 | |||
| 344 | static void free_verifier_state(struct bpf_verifier_state *state, | 392 | static void free_verifier_state(struct bpf_verifier_state *state, |
| 345 | bool free_self) | 393 | bool free_self) |
| 346 | { | 394 | { |
| 347 | kfree(state->stack); | 395 | int i; |
| 396 | |||
| 397 | for (i = 0; i <= state->curframe; i++) { | ||
| 398 | free_func_state(state->frame[i]); | ||
| 399 | state->frame[i] = NULL; | ||
| 400 | } | ||
| 348 | if (free_self) | 401 | if (free_self) |
| 349 | kfree(state); | 402 | kfree(state); |
| 350 | } | 403 | } |
| @@ -352,18 +405,46 @@ static void free_verifier_state(struct bpf_verifier_state *state, | |||
| 352 | /* copy verifier state from src to dst growing dst stack space | 405 | /* copy verifier state from src to dst growing dst stack space |
| 353 | * when necessary to accommodate larger src stack | 406 | * when necessary to accommodate larger src stack |
| 354 | */ | 407 | */ |
| 355 | static int copy_verifier_state(struct bpf_verifier_state *dst, | 408 | static int copy_func_state(struct bpf_func_state *dst, |
| 356 | const struct bpf_verifier_state *src) | 409 | const struct bpf_func_state *src) |
| 357 | { | 410 | { |
| 358 | int err; | 411 | int err; |
| 359 | 412 | ||
| 360 | err = realloc_verifier_state(dst, src->allocated_stack, false); | 413 | err = realloc_func_state(dst, src->allocated_stack, false); |
| 361 | if (err) | 414 | if (err) |
| 362 | return err; | 415 | return err; |
| 363 | memcpy(dst, src, offsetof(struct bpf_verifier_state, allocated_stack)); | 416 | memcpy(dst, src, offsetof(struct bpf_func_state, allocated_stack)); |
| 364 | return copy_stack_state(dst, src); | 417 | return copy_stack_state(dst, src); |
| 365 | } | 418 | } |
| 366 | 419 | ||
| 420 | static int copy_verifier_state(struct bpf_verifier_state *dst_state, | ||
| 421 | const struct bpf_verifier_state *src) | ||
| 422 | { | ||
| 423 | struct bpf_func_state *dst; | ||
| 424 | int i, err; | ||
| 425 | |||
| 426 | /* if dst has more stack frames than src, free them */ | ||
| 427 | for (i = src->curframe + 1; i <= dst_state->curframe; i++) { | ||
| 428 | free_func_state(dst_state->frame[i]); | ||
| 429 | dst_state->frame[i] = NULL; | ||
| 430 | } | ||
| 431 | dst_state->curframe = src->curframe; | ||
| 432 | dst_state->parent = src->parent; | ||
| 433 | for (i = 0; i <= src->curframe; i++) { | ||
| 434 | dst = dst_state->frame[i]; | ||
| 435 | if (!dst) { | ||
| 436 | dst = kzalloc(sizeof(*dst), GFP_KERNEL); | ||
| 437 | if (!dst) | ||
| 438 | return -ENOMEM; | ||
| 439 | dst_state->frame[i] = dst; | ||
| 440 | } | ||
| 441 | err = copy_func_state(dst, src->frame[i]); | ||
| 442 | if (err) | ||
| 443 | return err; | ||
| 444 | } | ||
| 445 | return 0; | ||
| 446 | } | ||
| 447 | |||
| 367 | static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, | 448 | static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, |
| 368 | int *insn_idx) | 449 | int *insn_idx) |
| 369 | { | 450 | { |
| @@ -416,6 +497,8 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, | |||
| 416 | } | 497 | } |
| 417 | return &elem->st; | 498 | return &elem->st; |
| 418 | err: | 499 | err: |
| 500 | free_verifier_state(env->cur_state, true); | ||
| 501 | env->cur_state = NULL; | ||
| 419 | /* pop all elements and return */ | 502 | /* pop all elements and return */ |
| 420 | while (!pop_stack(env, NULL, NULL)); | 503 | while (!pop_stack(env, NULL, NULL)); |
| 421 | return NULL; | 504 | return NULL; |
| @@ -425,6 +508,10 @@ err: | |||
| 425 | static const int caller_saved[CALLER_SAVED_REGS] = { | 508 | static const int caller_saved[CALLER_SAVED_REGS] = { |
| 426 | BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 | 509 | BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 |
| 427 | }; | 510 | }; |
| 511 | #define CALLEE_SAVED_REGS 5 | ||
| 512 | static const int callee_saved[CALLEE_SAVED_REGS] = { | ||
| 513 | BPF_REG_6, BPF_REG_7, BPF_REG_8, BPF_REG_9 | ||
| 514 | }; | ||
| 428 | 515 | ||
| 429 | static void __mark_reg_not_init(struct bpf_reg_state *reg); | 516 | static void __mark_reg_not_init(struct bpf_reg_state *reg); |
| 430 | 517 | ||
| @@ -449,6 +536,13 @@ static void __mark_reg_known_zero(struct bpf_reg_state *reg) | |||
| 449 | __mark_reg_known(reg, 0); | 536 | __mark_reg_known(reg, 0); |
| 450 | } | 537 | } |
| 451 | 538 | ||
| 539 | static void __mark_reg_const_zero(struct bpf_reg_state *reg) | ||
| 540 | { | ||
| 541 | __mark_reg_known(reg, 0); | ||
| 542 | reg->off = 0; | ||
| 543 | reg->type = SCALAR_VALUE; | ||
| 544 | } | ||
| 545 | |||
| 452 | static void mark_reg_known_zero(struct bpf_verifier_env *env, | 546 | static void mark_reg_known_zero(struct bpf_verifier_env *env, |
| 453 | struct bpf_reg_state *regs, u32 regno) | 547 | struct bpf_reg_state *regs, u32 regno) |
| 454 | { | 548 | { |
| @@ -560,6 +654,7 @@ static void __mark_reg_unknown(struct bpf_reg_state *reg) | |||
| 560 | reg->id = 0; | 654 | reg->id = 0; |
| 561 | reg->off = 0; | 655 | reg->off = 0; |
| 562 | reg->var_off = tnum_unknown; | 656 | reg->var_off = tnum_unknown; |
| 657 | reg->frameno = 0; | ||
| 563 | __mark_reg_unbounded(reg); | 658 | __mark_reg_unbounded(reg); |
| 564 | } | 659 | } |
| 565 | 660 | ||
| @@ -568,8 +663,8 @@ static void mark_reg_unknown(struct bpf_verifier_env *env, | |||
| 568 | { | 663 | { |
| 569 | if (WARN_ON(regno >= MAX_BPF_REG)) { | 664 | if (WARN_ON(regno >= MAX_BPF_REG)) { |
| 570 | verbose(env, "mark_reg_unknown(regs, %u)\n", regno); | 665 | verbose(env, "mark_reg_unknown(regs, %u)\n", regno); |
| 571 | /* Something bad happened, let's kill all regs */ | 666 | /* Something bad happened, let's kill all regs except FP */ |
| 572 | for (regno = 0; regno < MAX_BPF_REG; regno++) | 667 | for (regno = 0; regno < BPF_REG_FP; regno++) |
| 573 | __mark_reg_not_init(regs + regno); | 668 | __mark_reg_not_init(regs + regno); |
| 574 | return; | 669 | return; |
| 575 | } | 670 | } |
| @@ -587,8 +682,8 @@ static void mark_reg_not_init(struct bpf_verifier_env *env, | |||
| 587 | { | 682 | { |
| 588 | if (WARN_ON(regno >= MAX_BPF_REG)) { | 683 | if (WARN_ON(regno >= MAX_BPF_REG)) { |
| 589 | verbose(env, "mark_reg_not_init(regs, %u)\n", regno); | 684 | verbose(env, "mark_reg_not_init(regs, %u)\n", regno); |
| 590 | /* Something bad happened, let's kill all regs */ | 685 | /* Something bad happened, let's kill all regs except FP */ |
| 591 | for (regno = 0; regno < MAX_BPF_REG; regno++) | 686 | for (regno = 0; regno < BPF_REG_FP; regno++) |
| 592 | __mark_reg_not_init(regs + regno); | 687 | __mark_reg_not_init(regs + regno); |
| 593 | return; | 688 | return; |
| 594 | } | 689 | } |
| @@ -596,8 +691,9 @@ static void mark_reg_not_init(struct bpf_verifier_env *env, | |||
| 596 | } | 691 | } |
| 597 | 692 | ||
| 598 | static void init_reg_state(struct bpf_verifier_env *env, | 693 | static void init_reg_state(struct bpf_verifier_env *env, |
| 599 | struct bpf_reg_state *regs) | 694 | struct bpf_func_state *state) |
| 600 | { | 695 | { |
| 696 | struct bpf_reg_state *regs = state->regs; | ||
| 601 | int i; | 697 | int i; |
| 602 | 698 | ||
| 603 | for (i = 0; i < MAX_BPF_REG; i++) { | 699 | for (i = 0; i < MAX_BPF_REG; i++) { |
| @@ -608,41 +704,218 @@ static void init_reg_state(struct bpf_verifier_env *env, | |||
| 608 | /* frame pointer */ | 704 | /* frame pointer */ |
| 609 | regs[BPF_REG_FP].type = PTR_TO_STACK; | 705 | regs[BPF_REG_FP].type = PTR_TO_STACK; |
| 610 | mark_reg_known_zero(env, regs, BPF_REG_FP); | 706 | mark_reg_known_zero(env, regs, BPF_REG_FP); |
| 707 | regs[BPF_REG_FP].frameno = state->frameno; | ||
| 611 | 708 | ||
| 612 | /* 1st arg to a function */ | 709 | /* 1st arg to a function */ |
| 613 | regs[BPF_REG_1].type = PTR_TO_CTX; | 710 | regs[BPF_REG_1].type = PTR_TO_CTX; |
| 614 | mark_reg_known_zero(env, regs, BPF_REG_1); | 711 | mark_reg_known_zero(env, regs, BPF_REG_1); |
| 615 | } | 712 | } |
| 616 | 713 | ||
| 714 | #define BPF_MAIN_FUNC (-1) | ||
| 715 | static void init_func_state(struct bpf_verifier_env *env, | ||
| 716 | struct bpf_func_state *state, | ||
| 717 | int callsite, int frameno, int subprogno) | ||
| 718 | { | ||
| 719 | state->callsite = callsite; | ||
| 720 | state->frameno = frameno; | ||
| 721 | state->subprogno = subprogno; | ||
| 722 | init_reg_state(env, state); | ||
| 723 | } | ||
| 724 | |||
| 617 | enum reg_arg_type { | 725 | enum reg_arg_type { |
| 618 | SRC_OP, /* register is used as source operand */ | 726 | SRC_OP, /* register is used as source operand */ |
| 619 | DST_OP, /* register is used as destination operand */ | 727 | DST_OP, /* register is used as destination operand */ |
| 620 | DST_OP_NO_MARK /* same as above, check only, don't mark */ | 728 | DST_OP_NO_MARK /* same as above, check only, don't mark */ |
| 621 | }; | 729 | }; |
| 622 | 730 | ||
| 623 | static void mark_reg_read(const struct bpf_verifier_state *state, u32 regno) | 731 | static int cmp_subprogs(const void *a, const void *b) |
| 732 | { | ||
| 733 | return *(int *)a - *(int *)b; | ||
| 734 | } | ||
| 735 | |||
| 736 | static int find_subprog(struct bpf_verifier_env *env, int off) | ||
| 624 | { | 737 | { |
| 625 | struct bpf_verifier_state *parent = state->parent; | 738 | u32 *p; |
| 739 | |||
| 740 | p = bsearch(&off, env->subprog_starts, env->subprog_cnt, | ||
| 741 | sizeof(env->subprog_starts[0]), cmp_subprogs); | ||
| 742 | if (!p) | ||
| 743 | return -ENOENT; | ||
| 744 | return p - env->subprog_starts; | ||
| 745 | |||
| 746 | } | ||
| 747 | |||
| 748 | static int add_subprog(struct bpf_verifier_env *env, int off) | ||
| 749 | { | ||
| 750 | int insn_cnt = env->prog->len; | ||
| 751 | int ret; | ||
| 752 | |||
| 753 | if (off >= insn_cnt || off < 0) { | ||
| 754 | verbose(env, "call to invalid destination\n"); | ||
| 755 | return -EINVAL; | ||
| 756 | } | ||
| 757 | ret = find_subprog(env, off); | ||
| 758 | if (ret >= 0) | ||
| 759 | return 0; | ||
| 760 | if (env->subprog_cnt >= BPF_MAX_SUBPROGS) { | ||
| 761 | verbose(env, "too many subprograms\n"); | ||
| 762 | return -E2BIG; | ||
| 763 | } | ||
| 764 | env->subprog_starts[env->subprog_cnt++] = off; | ||
| 765 | sort(env->subprog_starts, env->subprog_cnt, | ||
| 766 | sizeof(env->subprog_starts[0]), cmp_subprogs, NULL); | ||
| 767 | return 0; | ||
| 768 | } | ||
| 769 | |||
| 770 | static int check_subprogs(struct bpf_verifier_env *env) | ||
| 771 | { | ||
| 772 | int i, ret, subprog_start, subprog_end, off, cur_subprog = 0; | ||
| 773 | struct bpf_insn *insn = env->prog->insnsi; | ||
| 774 | int insn_cnt = env->prog->len; | ||
| 775 | |||
| 776 | /* determine subprog starts. The end is one before the next starts */ | ||
| 777 | for (i = 0; i < insn_cnt; i++) { | ||
| 778 | if (insn[i].code != (BPF_JMP | BPF_CALL)) | ||
| 779 | continue; | ||
| 780 | if (insn[i].src_reg != BPF_PSEUDO_CALL) | ||
| 781 | continue; | ||
| 782 | if (!env->allow_ptr_leaks) { | ||
| 783 | verbose(env, "function calls to other bpf functions are allowed for root only\n"); | ||
| 784 | return -EPERM; | ||
| 785 | } | ||
| 786 | if (bpf_prog_is_dev_bound(env->prog->aux)) { | ||
| 787 | verbose(env, "function calls in offloaded programs are not supported yet\n"); | ||
| 788 | return -EINVAL; | ||
| 789 | } | ||
| 790 | ret = add_subprog(env, i + insn[i].imm + 1); | ||
| 791 | if (ret < 0) | ||
| 792 | return ret; | ||
| 793 | } | ||
| 794 | |||
| 795 | if (env->log.level > 1) | ||
| 796 | for (i = 0; i < env->subprog_cnt; i++) | ||
| 797 | verbose(env, "func#%d @%d\n", i, env->subprog_starts[i]); | ||
| 798 | |||
| 799 | /* now check that all jumps are within the same subprog */ | ||
| 800 | subprog_start = 0; | ||
| 801 | if (env->subprog_cnt == cur_subprog) | ||
| 802 | subprog_end = insn_cnt; | ||
| 803 | else | ||
| 804 | subprog_end = env->subprog_starts[cur_subprog++]; | ||
| 805 | for (i = 0; i < insn_cnt; i++) { | ||
| 806 | u8 code = insn[i].code; | ||
| 807 | |||
| 808 | if (BPF_CLASS(code) != BPF_JMP) | ||
| 809 | goto next; | ||
| 810 | if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL) | ||
| 811 | goto next; | ||
| 812 | off = i + insn[i].off + 1; | ||
| 813 | if (off < subprog_start || off >= subprog_end) { | ||
| 814 | verbose(env, "jump out of range from insn %d to %d\n", i, off); | ||
| 815 | return -EINVAL; | ||
| 816 | } | ||
| 817 | next: | ||
| 818 | if (i == subprog_end - 1) { | ||
| 819 | /* to avoid fall-through from one subprog into another | ||
| 820 | * the last insn of the subprog should be either exit | ||
| 821 | * or unconditional jump back | ||
| 822 | */ | ||
| 823 | if (code != (BPF_JMP | BPF_EXIT) && | ||
| 824 | code != (BPF_JMP | BPF_JA)) { | ||
| 825 | verbose(env, "last insn is not an exit or jmp\n"); | ||
| 826 | return -EINVAL; | ||
| 827 | } | ||
| 828 | subprog_start = subprog_end; | ||
| 829 | if (env->subprog_cnt == cur_subprog) | ||
| 830 | subprog_end = insn_cnt; | ||
| 831 | else | ||
| 832 | subprog_end = env->subprog_starts[cur_subprog++]; | ||
| 833 | } | ||
| 834 | } | ||
| 835 | return 0; | ||
| 836 | } | ||
| 837 | |||
| 838 | static | ||
| 839 | struct bpf_verifier_state *skip_callee(struct bpf_verifier_env *env, | ||
| 840 | const struct bpf_verifier_state *state, | ||
| 841 | struct bpf_verifier_state *parent, | ||
| 842 | u32 regno) | ||
| 843 | { | ||
| 844 | struct bpf_verifier_state *tmp = NULL; | ||
| 845 | |||
| 846 | /* 'parent' could be a state of caller and | ||
| 847 | * 'state' could be a state of callee. In such case | ||
| 848 | * parent->curframe < state->curframe | ||
| 849 | * and it's ok for r1 - r5 registers | ||
| 850 | * | ||
| 851 | * 'parent' could be a callee's state after it bpf_exit-ed. | ||
| 852 | * In such case parent->curframe > state->curframe | ||
| 853 | * and it's ok for r0 only | ||
| 854 | */ | ||
| 855 | if (parent->curframe == state->curframe || | ||
| 856 | (parent->curframe < state->curframe && | ||
| 857 | regno >= BPF_REG_1 && regno <= BPF_REG_5) || | ||
| 858 | (parent->curframe > state->curframe && | ||
| 859 | regno == BPF_REG_0)) | ||
| 860 | return parent; | ||
| 861 | |||
| 862 | if (parent->curframe > state->curframe && | ||
| 863 | regno >= BPF_REG_6) { | ||
| 864 | /* for callee saved regs we have to skip the whole chain | ||
| 865 | * of states that belong to callee and mark as LIVE_READ | ||
| 866 | * the registers before the call | ||
| 867 | */ | ||
| 868 | tmp = parent; | ||
| 869 | while (tmp && tmp->curframe != state->curframe) { | ||
| 870 | tmp = tmp->parent; | ||
| 871 | } | ||
| 872 | if (!tmp) | ||
| 873 | goto bug; | ||
| 874 | parent = tmp; | ||
| 875 | } else { | ||
| 876 | goto bug; | ||
| 877 | } | ||
| 878 | return parent; | ||
| 879 | bug: | ||
| 880 | verbose(env, "verifier bug regno %d tmp %p\n", regno, tmp); | ||
| 881 | verbose(env, "regno %d parent frame %d current frame %d\n", | ||
| 882 | regno, parent->curframe, state->curframe); | ||
| 883 | return NULL; | ||
| 884 | } | ||
| 885 | |||
| 886 | static int mark_reg_read(struct bpf_verifier_env *env, | ||
| 887 | const struct bpf_verifier_state *state, | ||
| 888 | struct bpf_verifier_state *parent, | ||
| 889 | u32 regno) | ||
| 890 | { | ||
| 891 | bool writes = parent == state->parent; /* Observe write marks */ | ||
| 626 | 892 | ||
| 627 | if (regno == BPF_REG_FP) | 893 | if (regno == BPF_REG_FP) |
| 628 | /* We don't need to worry about FP liveness because it's read-only */ | 894 | /* We don't need to worry about FP liveness because it's read-only */ |
| 629 | return; | 895 | return 0; |
| 630 | 896 | ||
| 631 | while (parent) { | 897 | while (parent) { |
| 632 | /* if read wasn't screened by an earlier write ... */ | 898 | /* if read wasn't screened by an earlier write ... */ |
| 633 | if (state->regs[regno].live & REG_LIVE_WRITTEN) | 899 | if (writes && state->frame[state->curframe]->regs[regno].live & REG_LIVE_WRITTEN) |
| 634 | break; | 900 | break; |
| 901 | parent = skip_callee(env, state, parent, regno); | ||
| 902 | if (!parent) | ||
| 903 | return -EFAULT; | ||
| 635 | /* ... then we depend on parent's value */ | 904 | /* ... then we depend on parent's value */ |
| 636 | parent->regs[regno].live |= REG_LIVE_READ; | 905 | parent->frame[parent->curframe]->regs[regno].live |= REG_LIVE_READ; |
| 637 | state = parent; | 906 | state = parent; |
| 638 | parent = state->parent; | 907 | parent = state->parent; |
| 908 | writes = true; | ||
| 639 | } | 909 | } |
| 910 | return 0; | ||
| 640 | } | 911 | } |
| 641 | 912 | ||
| 642 | static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, | 913 | static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, |
| 643 | enum reg_arg_type t) | 914 | enum reg_arg_type t) |
| 644 | { | 915 | { |
| 645 | struct bpf_reg_state *regs = env->cur_state->regs; | 916 | struct bpf_verifier_state *vstate = env->cur_state; |
| 917 | struct bpf_func_state *state = vstate->frame[vstate->curframe]; | ||
| 918 | struct bpf_reg_state *regs = state->regs; | ||
| 646 | 919 | ||
| 647 | if (regno >= MAX_BPF_REG) { | 920 | if (regno >= MAX_BPF_REG) { |
| 648 | verbose(env, "R%d is invalid\n", regno); | 921 | verbose(env, "R%d is invalid\n", regno); |
| @@ -655,7 +928,7 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, | |||
| 655 | verbose(env, "R%d !read_ok\n", regno); | 928 | verbose(env, "R%d !read_ok\n", regno); |
| 656 | return -EACCES; | 929 | return -EACCES; |
| 657 | } | 930 | } |
| 658 | mark_reg_read(env->cur_state, regno); | 931 | return mark_reg_read(env, vstate, vstate->parent, regno); |
| 659 | } else { | 932 | } else { |
| 660 | /* check whether register used as dest operand can be written to */ | 933 | /* check whether register used as dest operand can be written to */ |
| 661 | if (regno == BPF_REG_FP) { | 934 | if (regno == BPF_REG_FP) { |
| @@ -686,17 +959,25 @@ static bool is_spillable_regtype(enum bpf_reg_type type) | |||
| 686 | } | 959 | } |
| 687 | } | 960 | } |
| 688 | 961 | ||
| 962 | /* Does this register contain a constant zero? */ | ||
| 963 | static bool register_is_null(struct bpf_reg_state *reg) | ||
| 964 | { | ||
| 965 | return reg->type == SCALAR_VALUE && tnum_equals_const(reg->var_off, 0); | ||
| 966 | } | ||
| 967 | |||
| 689 | /* check_stack_read/write functions track spill/fill of registers, | 968 | /* check_stack_read/write functions track spill/fill of registers, |
| 690 | * stack boundary and alignment are checked in check_mem_access() | 969 | * stack boundary and alignment are checked in check_mem_access() |
| 691 | */ | 970 | */ |
| 692 | static int check_stack_write(struct bpf_verifier_env *env, | 971 | static int check_stack_write(struct bpf_verifier_env *env, |
| 693 | struct bpf_verifier_state *state, int off, | 972 | struct bpf_func_state *state, /* func where register points to */ |
| 694 | int size, int value_regno) | 973 | int off, int size, int value_regno) |
| 695 | { | 974 | { |
| 975 | struct bpf_func_state *cur; /* state of the current function */ | ||
| 696 | int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err; | 976 | int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err; |
| 977 | enum bpf_reg_type type; | ||
| 697 | 978 | ||
| 698 | err = realloc_verifier_state(state, round_up(slot + 1, BPF_REG_SIZE), | 979 | err = realloc_func_state(state, round_up(slot + 1, BPF_REG_SIZE), |
| 699 | true); | 980 | true); |
| 700 | if (err) | 981 | if (err) |
| 701 | return err; | 982 | return err; |
| 702 | /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0, | 983 | /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0, |
| @@ -709,8 +990,9 @@ static int check_stack_write(struct bpf_verifier_env *env, | |||
| 709 | return -EACCES; | 990 | return -EACCES; |
| 710 | } | 991 | } |
| 711 | 992 | ||
| 993 | cur = env->cur_state->frame[env->cur_state->curframe]; | ||
| 712 | if (value_regno >= 0 && | 994 | if (value_regno >= 0 && |
| 713 | is_spillable_regtype(state->regs[value_regno].type)) { | 995 | is_spillable_regtype((type = cur->regs[value_regno].type))) { |
| 714 | 996 | ||
| 715 | /* register containing pointer is being spilled into stack */ | 997 | /* register containing pointer is being spilled into stack */ |
| 716 | if (size != BPF_REG_SIZE) { | 998 | if (size != BPF_REG_SIZE) { |
| @@ -718,51 +1000,116 @@ static int check_stack_write(struct bpf_verifier_env *env, | |||
| 718 | return -EACCES; | 1000 | return -EACCES; |
| 719 | } | 1001 | } |
| 720 | 1002 | ||
| 1003 | if (state != cur && type == PTR_TO_STACK) { | ||
| 1004 | verbose(env, "cannot spill pointers to stack into stack frame of the caller\n"); | ||
| 1005 | return -EINVAL; | ||
| 1006 | } | ||
| 1007 | |||
| 721 | /* save register state */ | 1008 | /* save register state */ |
| 722 | state->stack[spi].spilled_ptr = state->regs[value_regno]; | 1009 | state->stack[spi].spilled_ptr = cur->regs[value_regno]; |
| 723 | state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; | 1010 | state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; |
| 724 | 1011 | ||
| 725 | for (i = 0; i < BPF_REG_SIZE; i++) | 1012 | for (i = 0; i < BPF_REG_SIZE; i++) |
| 726 | state->stack[spi].slot_type[i] = STACK_SPILL; | 1013 | state->stack[spi].slot_type[i] = STACK_SPILL; |
| 727 | } else { | 1014 | } else { |
| 1015 | u8 type = STACK_MISC; | ||
| 1016 | |||
| 728 | /* regular write of data into stack */ | 1017 | /* regular write of data into stack */ |
| 729 | state->stack[spi].spilled_ptr = (struct bpf_reg_state) {}; | 1018 | state->stack[spi].spilled_ptr = (struct bpf_reg_state) {}; |
| 730 | 1019 | ||
| 1020 | /* only mark the slot as written if all 8 bytes were written | ||
| 1021 | * otherwise read propagation may incorrectly stop too soon | ||
| 1022 | * when stack slots are partially written. | ||
| 1023 | * This heuristic means that read propagation will be | ||
| 1024 | * conservative, since it will add reg_live_read marks | ||
| 1025 | * to stack slots all the way to the first state when a program | ||
| 1026 | * writes+reads less than 8 bytes | ||
| 1027 | */ | ||
| 1028 | if (size == BPF_REG_SIZE) | ||
| 1029 | state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; | ||
| 1030 | |||
| 1031 | /* when we zero initialize stack slots mark them as such */ | ||
| 1032 | if (value_regno >= 0 && | ||
| 1033 | register_is_null(&cur->regs[value_regno])) | ||
| 1034 | type = STACK_ZERO; | ||
| 1035 | |||
| 731 | for (i = 0; i < size; i++) | 1036 | for (i = 0; i < size; i++) |
| 732 | state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] = | 1037 | state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] = |
| 733 | STACK_MISC; | 1038 | type; |
| 734 | } | 1039 | } |
| 735 | return 0; | 1040 | return 0; |
| 736 | } | 1041 | } |
| 737 | 1042 | ||
| 738 | static void mark_stack_slot_read(const struct bpf_verifier_state *state, int slot) | 1043 | /* registers of every function are unique and mark_reg_read() propagates |
| 1044 | * the liveness in the following cases: | ||
| 1045 | * - from callee into caller for R1 - R5 that were used as arguments | ||
| 1046 | * - from caller into callee for R0 that used as result of the call | ||
| 1047 | * - from caller to the same caller skipping states of the callee for R6 - R9, | ||
| 1048 | * since R6 - R9 are callee saved by implicit function prologue and | ||
| 1049 | * caller's R6 != callee's R6, so when we propagate liveness up to | ||
| 1050 | * parent states we need to skip callee states for R6 - R9. | ||
| 1051 | * | ||
| 1052 | * stack slot marking is different, since stacks of caller and callee are | ||
| 1053 | * accessible in both (since caller can pass a pointer to caller's stack to | ||
| 1054 | * callee which can pass it to another function), hence mark_stack_slot_read() | ||
| 1055 | * has to propagate the stack liveness to all parent states at given frame number. | ||
| 1056 | * Consider code: | ||
| 1057 | * f1() { | ||
| 1058 | * ptr = fp - 8; | ||
| 1059 | * *ptr = ctx; | ||
| 1060 | * call f2 { | ||
| 1061 | * .. = *ptr; | ||
| 1062 | * } | ||
| 1063 | * .. = *ptr; | ||
| 1064 | * } | ||
| 1065 | * First *ptr is reading from f1's stack and mark_stack_slot_read() has | ||
| 1066 | * to mark liveness at f1's frame and not f2's frame. | ||
| 1067 | * Second *ptr is also reading from f1's stack and mark_stack_slot_read() has | ||
| 1068 | * to propagate liveness to f2 states at f1's frame level and further into | ||
| 1069 | * f1 states at f1's frame level until write into that stack slot | ||
| 1070 | */ | ||
| 1071 | static void mark_stack_slot_read(struct bpf_verifier_env *env, | ||
| 1072 | const struct bpf_verifier_state *state, | ||
| 1073 | struct bpf_verifier_state *parent, | ||
| 1074 | int slot, int frameno) | ||
| 739 | { | 1075 | { |
| 740 | struct bpf_verifier_state *parent = state->parent; | 1076 | bool writes = parent == state->parent; /* Observe write marks */ |
| 741 | 1077 | ||
| 742 | while (parent) { | 1078 | while (parent) { |
| 1079 | if (parent->frame[frameno]->allocated_stack <= slot * BPF_REG_SIZE) | ||
| 1080 | /* since LIVE_WRITTEN mark is only done for full 8-byte | ||
| 1081 | * write the read marks are conservative and parent | ||
| 1082 | * state may not even have the stack allocated. In such case | ||
| 1083 | * end the propagation, since the loop reached beginning | ||
| 1084 | * of the function | ||
| 1085 | */ | ||
| 1086 | break; | ||
| 743 | /* if read wasn't screened by an earlier write ... */ | 1087 | /* if read wasn't screened by an earlier write ... */ |
| 744 | if (state->stack[slot].spilled_ptr.live & REG_LIVE_WRITTEN) | 1088 | if (writes && state->frame[frameno]->stack[slot].spilled_ptr.live & REG_LIVE_WRITTEN) |
| 745 | break; | 1089 | break; |
| 746 | /* ... then we depend on parent's value */ | 1090 | /* ... then we depend on parent's value */ |
| 747 | parent->stack[slot].spilled_ptr.live |= REG_LIVE_READ; | 1091 | parent->frame[frameno]->stack[slot].spilled_ptr.live |= REG_LIVE_READ; |
| 748 | state = parent; | 1092 | state = parent; |
| 749 | parent = state->parent; | 1093 | parent = state->parent; |
| 1094 | writes = true; | ||
| 750 | } | 1095 | } |
| 751 | } | 1096 | } |
| 752 | 1097 | ||
| 753 | static int check_stack_read(struct bpf_verifier_env *env, | 1098 | static int check_stack_read(struct bpf_verifier_env *env, |
| 754 | struct bpf_verifier_state *state, int off, int size, | 1099 | struct bpf_func_state *reg_state /* func where register points to */, |
| 755 | int value_regno) | 1100 | int off, int size, int value_regno) |
| 756 | { | 1101 | { |
| 1102 | struct bpf_verifier_state *vstate = env->cur_state; | ||
| 1103 | struct bpf_func_state *state = vstate->frame[vstate->curframe]; | ||
| 757 | int i, slot = -off - 1, spi = slot / BPF_REG_SIZE; | 1104 | int i, slot = -off - 1, spi = slot / BPF_REG_SIZE; |
| 758 | u8 *stype; | 1105 | u8 *stype; |
| 759 | 1106 | ||
| 760 | if (state->allocated_stack <= slot) { | 1107 | if (reg_state->allocated_stack <= slot) { |
| 761 | verbose(env, "invalid read from stack off %d+0 size %d\n", | 1108 | verbose(env, "invalid read from stack off %d+0 size %d\n", |
| 762 | off, size); | 1109 | off, size); |
| 763 | return -EACCES; | 1110 | return -EACCES; |
| 764 | } | 1111 | } |
| 765 | stype = state->stack[spi].slot_type; | 1112 | stype = reg_state->stack[spi].slot_type; |
| 766 | 1113 | ||
| 767 | if (stype[0] == STACK_SPILL) { | 1114 | if (stype[0] == STACK_SPILL) { |
| 768 | if (size != BPF_REG_SIZE) { | 1115 | if (size != BPF_REG_SIZE) { |
| @@ -778,21 +1125,44 @@ static int check_stack_read(struct bpf_verifier_env *env, | |||
| 778 | 1125 | ||
| 779 | if (value_regno >= 0) { | 1126 | if (value_regno >= 0) { |
| 780 | /* restore register state from stack */ | 1127 | /* restore register state from stack */ |
| 781 | state->regs[value_regno] = state->stack[spi].spilled_ptr; | 1128 | state->regs[value_regno] = reg_state->stack[spi].spilled_ptr; |
| 782 | mark_stack_slot_read(state, spi); | 1129 | /* mark reg as written since spilled pointer state likely |
| 1130 | * has its liveness marks cleared by is_state_visited() | ||
| 1131 | * which resets stack/reg liveness for state transitions | ||
| 1132 | */ | ||
| 1133 | state->regs[value_regno].live |= REG_LIVE_WRITTEN; | ||
| 783 | } | 1134 | } |
| 1135 | mark_stack_slot_read(env, vstate, vstate->parent, spi, | ||
| 1136 | reg_state->frameno); | ||
| 784 | return 0; | 1137 | return 0; |
| 785 | } else { | 1138 | } else { |
| 1139 | int zeros = 0; | ||
| 1140 | |||
| 786 | for (i = 0; i < size; i++) { | 1141 | for (i = 0; i < size; i++) { |
| 787 | if (stype[(slot - i) % BPF_REG_SIZE] != STACK_MISC) { | 1142 | if (stype[(slot - i) % BPF_REG_SIZE] == STACK_MISC) |
| 788 | verbose(env, "invalid read from stack off %d+%d size %d\n", | 1143 | continue; |
| 789 | off, i, size); | 1144 | if (stype[(slot - i) % BPF_REG_SIZE] == STACK_ZERO) { |
| 790 | return -EACCES; | 1145 | zeros++; |
| 1146 | continue; | ||
| 1147 | } | ||
| 1148 | verbose(env, "invalid read from stack off %d+%d size %d\n", | ||
| 1149 | off, i, size); | ||
| 1150 | return -EACCES; | ||
| 1151 | } | ||
| 1152 | mark_stack_slot_read(env, vstate, vstate->parent, spi, | ||
| 1153 | reg_state->frameno); | ||
| 1154 | if (value_regno >= 0) { | ||
| 1155 | if (zeros == size) { | ||
| 1156 | /* any size read into register is zero extended, | ||
| 1157 | * so the whole register == const_zero | ||
| 1158 | */ | ||
| 1159 | __mark_reg_const_zero(&state->regs[value_regno]); | ||
| 1160 | } else { | ||
| 1161 | /* have read misc data from the stack */ | ||
| 1162 | mark_reg_unknown(env, state->regs, value_regno); | ||
| 791 | } | 1163 | } |
| 1164 | state->regs[value_regno].live |= REG_LIVE_WRITTEN; | ||
| 792 | } | 1165 | } |
| 793 | if (value_regno >= 0) | ||
| 794 | /* have read misc data from the stack */ | ||
| 795 | mark_reg_unknown(env, state->regs, value_regno); | ||
| 796 | return 0; | 1166 | return 0; |
| 797 | } | 1167 | } |
| 798 | } | 1168 | } |
| @@ -817,7 +1187,8 @@ static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off, | |||
| 817 | static int check_map_access(struct bpf_verifier_env *env, u32 regno, | 1187 | static int check_map_access(struct bpf_verifier_env *env, u32 regno, |
| 818 | int off, int size, bool zero_size_allowed) | 1188 | int off, int size, bool zero_size_allowed) |
| 819 | { | 1189 | { |
| 820 | struct bpf_verifier_state *state = env->cur_state; | 1190 | struct bpf_verifier_state *vstate = env->cur_state; |
| 1191 | struct bpf_func_state *state = vstate->frame[vstate->curframe]; | ||
| 821 | struct bpf_reg_state *reg = &state->regs[regno]; | 1192 | struct bpf_reg_state *reg = &state->regs[regno]; |
| 822 | int err; | 1193 | int err; |
| 823 | 1194 | ||
| @@ -1079,6 +1450,103 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, | |||
| 1079 | strict); | 1450 | strict); |
| 1080 | } | 1451 | } |
| 1081 | 1452 | ||
| 1453 | static int update_stack_depth(struct bpf_verifier_env *env, | ||
| 1454 | const struct bpf_func_state *func, | ||
| 1455 | int off) | ||
| 1456 | { | ||
| 1457 | u16 stack = env->subprog_stack_depth[func->subprogno]; | ||
| 1458 | |||
| 1459 | if (stack >= -off) | ||
| 1460 | return 0; | ||
| 1461 | |||
| 1462 | /* update known max for given subprogram */ | ||
| 1463 | env->subprog_stack_depth[func->subprogno] = -off; | ||
| 1464 | return 0; | ||
| 1465 | } | ||
| 1466 | |||
| 1467 | /* starting from main bpf function walk all instructions of the function | ||
| 1468 | * and recursively walk all callees that given function can call. | ||
| 1469 | * Ignore jump and exit insns. | ||
| 1470 | * Since recursion is prevented by check_cfg() this algorithm | ||
| 1471 | * only needs a local stack of MAX_CALL_FRAMES to remember callsites | ||
| 1472 | */ | ||
| 1473 | static int check_max_stack_depth(struct bpf_verifier_env *env) | ||
| 1474 | { | ||
| 1475 | int depth = 0, frame = 0, subprog = 0, i = 0, subprog_end; | ||
| 1476 | struct bpf_insn *insn = env->prog->insnsi; | ||
| 1477 | int insn_cnt = env->prog->len; | ||
| 1478 | int ret_insn[MAX_CALL_FRAMES]; | ||
| 1479 | int ret_prog[MAX_CALL_FRAMES]; | ||
| 1480 | |||
| 1481 | process_func: | ||
| 1482 | /* round up to 32-bytes, since this is granularity | ||
| 1483 | * of interpreter stack size | ||
| 1484 | */ | ||
| 1485 | depth += round_up(max_t(u32, env->subprog_stack_depth[subprog], 1), 32); | ||
| 1486 | if (depth > MAX_BPF_STACK) { | ||
| 1487 | verbose(env, "combined stack size of %d calls is %d. Too large\n", | ||
| 1488 | frame + 1, depth); | ||
| 1489 | return -EACCES; | ||
| 1490 | } | ||
| 1491 | continue_func: | ||
| 1492 | if (env->subprog_cnt == subprog) | ||
| 1493 | subprog_end = insn_cnt; | ||
| 1494 | else | ||
| 1495 | subprog_end = env->subprog_starts[subprog]; | ||
| 1496 | for (; i < subprog_end; i++) { | ||
| 1497 | if (insn[i].code != (BPF_JMP | BPF_CALL)) | ||
| 1498 | continue; | ||
| 1499 | if (insn[i].src_reg != BPF_PSEUDO_CALL) | ||
| 1500 | continue; | ||
| 1501 | /* remember insn and function to return to */ | ||
| 1502 | ret_insn[frame] = i + 1; | ||
| 1503 | ret_prog[frame] = subprog; | ||
| 1504 | |||
| 1505 | /* find the callee */ | ||
| 1506 | i = i + insn[i].imm + 1; | ||
| 1507 | subprog = find_subprog(env, i); | ||
| 1508 | if (subprog < 0) { | ||
| 1509 | WARN_ONCE(1, "verifier bug. No program starts at insn %d\n", | ||
| 1510 | i); | ||
| 1511 | return -EFAULT; | ||
| 1512 | } | ||
| 1513 | subprog++; | ||
| 1514 | frame++; | ||
| 1515 | if (frame >= MAX_CALL_FRAMES) { | ||
| 1516 | WARN_ONCE(1, "verifier bug. Call stack is too deep\n"); | ||
| 1517 | return -EFAULT; | ||
| 1518 | } | ||
| 1519 | goto process_func; | ||
| 1520 | } | ||
| 1521 | /* end of for() loop means the last insn of the 'subprog' | ||
| 1522 | * was reached. Doesn't matter whether it was JA or EXIT | ||
| 1523 | */ | ||
| 1524 | if (frame == 0) | ||
| 1525 | return 0; | ||
| 1526 | depth -= round_up(max_t(u32, env->subprog_stack_depth[subprog], 1), 32); | ||
| 1527 | frame--; | ||
| 1528 | i = ret_insn[frame]; | ||
| 1529 | subprog = ret_prog[frame]; | ||
| 1530 | goto continue_func; | ||
| 1531 | } | ||
| 1532 | |||
| 1533 | #ifndef CONFIG_BPF_JIT_ALWAYS_ON | ||
| 1534 | static int get_callee_stack_depth(struct bpf_verifier_env *env, | ||
| 1535 | const struct bpf_insn *insn, int idx) | ||
| 1536 | { | ||
| 1537 | int start = idx + insn->imm + 1, subprog; | ||
| 1538 | |||
| 1539 | subprog = find_subprog(env, start); | ||
| 1540 | if (subprog < 0) { | ||
| 1541 | WARN_ONCE(1, "verifier bug. No program starts at insn %d\n", | ||
| 1542 | start); | ||
| 1543 | return -EFAULT; | ||
| 1544 | } | ||
| 1545 | subprog++; | ||
| 1546 | return env->subprog_stack_depth[subprog]; | ||
| 1547 | } | ||
| 1548 | #endif | ||
| 1549 | |||
| 1082 | /* truncate register to smaller size (in bytes) | 1550 | /* truncate register to smaller size (in bytes) |
| 1083 | * must be called with size < BPF_REG_SIZE | 1551 | * must be called with size < BPF_REG_SIZE |
| 1084 | */ | 1552 | */ |
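check_max_stack_depth() in the hunk above walks the call chain iteratively with an explicit return stack, adds each frame's usage rounded up to the interpreter's 32-byte granularity, and rejects any chain whose running total exceeds MAX_BPF_STACK (512 bytes). A small standalone illustration of that arithmetic (the per-frame sizes are invented):

```c
#include <stdio.h>

#define MAX_BPF_STACK	512
#define ROUND_UP_32(x)	((((x) < 1 ? 1u : (x)) + 31) & ~31u)

int main(void)
{
	/* hypothetical per-subprogram stack usage found by the verifier */
	const unsigned int frames[] = { 40, 120, 200 };	/* main, f1, f2 */
	unsigned int depth = 0, i;

	for (i = 0; i < sizeof(frames) / sizeof(frames[0]); i++) {
		depth += ROUND_UP_32(frames[i]);	/* 64 + 128 + 224 */
		printf("after frame %u: depth = %u bytes\n", i, depth);
	}
	if (depth > MAX_BPF_STACK)
		printf("combined stack size of %u calls is %u. Too large\n",
		       i, depth);
	else
		printf("ok: %u <= %u\n", depth, MAX_BPF_STACK);
	return 0;
}
```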
| @@ -1112,9 +1580,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn | |||
| 1112 | int bpf_size, enum bpf_access_type t, | 1580 | int bpf_size, enum bpf_access_type t, |
| 1113 | int value_regno) | 1581 | int value_regno) |
| 1114 | { | 1582 | { |
| 1115 | struct bpf_verifier_state *state = env->cur_state; | ||
| 1116 | struct bpf_reg_state *regs = cur_regs(env); | 1583 | struct bpf_reg_state *regs = cur_regs(env); |
| 1117 | struct bpf_reg_state *reg = regs + regno; | 1584 | struct bpf_reg_state *reg = regs + regno; |
| 1585 | struct bpf_func_state *state; | ||
| 1118 | int size, err = 0; | 1586 | int size, err = 0; |
| 1119 | 1587 | ||
| 1120 | size = bpf_size_to_bytes(bpf_size); | 1588 | size = bpf_size_to_bytes(bpf_size); |
| @@ -1203,8 +1671,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn | |||
| 1203 | return -EACCES; | 1671 | return -EACCES; |
| 1204 | } | 1672 | } |
| 1205 | 1673 | ||
| 1206 | if (env->prog->aux->stack_depth < -off) | 1674 | state = func(env, reg); |
| 1207 | env->prog->aux->stack_depth = -off; | 1675 | err = update_stack_depth(env, state, off); |
| 1676 | if (err) | ||
| 1677 | return err; | ||
| 1208 | 1678 | ||
| 1209 | if (t == BPF_WRITE) | 1679 | if (t == BPF_WRITE) |
| 1210 | err = check_stack_write(env, state, off, size, | 1680 | err = check_stack_write(env, state, off, size, |
| @@ -1282,12 +1752,6 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins | |||
| 1282 | BPF_SIZE(insn->code), BPF_WRITE, -1); | 1752 | BPF_SIZE(insn->code), BPF_WRITE, -1); |
| 1283 | } | 1753 | } |
| 1284 | 1754 | ||
| 1285 | /* Does this register contain a constant zero? */ | ||
| 1286 | static bool register_is_null(struct bpf_reg_state reg) | ||
| 1287 | { | ||
| 1288 | return reg.type == SCALAR_VALUE && tnum_equals_const(reg.var_off, 0); | ||
| 1289 | } | ||
| 1290 | |||
| 1291 | /* when register 'regno' is passed into function that will read 'access_size' | 1755 | /* when register 'regno' is passed into function that will read 'access_size' |
| 1292 | * bytes from that pointer, make sure that it's within stack boundary | 1756 | * bytes from that pointer, make sure that it's within stack boundary |
| 1293 | * and all elements of stack are initialized. | 1757 | * and all elements of stack are initialized. |
| @@ -1298,32 +1762,32 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, | |||
| 1298 | int access_size, bool zero_size_allowed, | 1762 | int access_size, bool zero_size_allowed, |
| 1299 | struct bpf_call_arg_meta *meta) | 1763 | struct bpf_call_arg_meta *meta) |
| 1300 | { | 1764 | { |
| 1301 | struct bpf_verifier_state *state = env->cur_state; | 1765 | struct bpf_reg_state *reg = cur_regs(env) + regno; |
| 1302 | struct bpf_reg_state *regs = state->regs; | 1766 | struct bpf_func_state *state = func(env, reg); |
| 1303 | int off, i, slot, spi; | 1767 | int off, i, slot, spi; |
| 1304 | 1768 | ||
| 1305 | if (regs[regno].type != PTR_TO_STACK) { | 1769 | if (reg->type != PTR_TO_STACK) { |
| 1306 | /* Allow zero-byte read from NULL, regardless of pointer type */ | 1770 | /* Allow zero-byte read from NULL, regardless of pointer type */ |
| 1307 | if (zero_size_allowed && access_size == 0 && | 1771 | if (zero_size_allowed && access_size == 0 && |
| 1308 | register_is_null(regs[regno])) | 1772 | register_is_null(reg)) |
| 1309 | return 0; | 1773 | return 0; |
| 1310 | 1774 | ||
| 1311 | verbose(env, "R%d type=%s expected=%s\n", regno, | 1775 | verbose(env, "R%d type=%s expected=%s\n", regno, |
| 1312 | reg_type_str[regs[regno].type], | 1776 | reg_type_str[reg->type], |
| 1313 | reg_type_str[PTR_TO_STACK]); | 1777 | reg_type_str[PTR_TO_STACK]); |
| 1314 | return -EACCES; | 1778 | return -EACCES; |
| 1315 | } | 1779 | } |
| 1316 | 1780 | ||
| 1317 | /* Only allow fixed-offset stack reads */ | 1781 | /* Only allow fixed-offset stack reads */ |
| 1318 | if (!tnum_is_const(regs[regno].var_off)) { | 1782 | if (!tnum_is_const(reg->var_off)) { |
| 1319 | char tn_buf[48]; | 1783 | char tn_buf[48]; |
| 1320 | 1784 | ||
| 1321 | tnum_strn(tn_buf, sizeof(tn_buf), regs[regno].var_off); | 1785 | tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); |
| 1322 | verbose(env, "invalid variable stack read R%d var_off=%s\n", | 1786 | verbose(env, "invalid variable stack read R%d var_off=%s\n", |
| 1323 | regno, tn_buf); | 1787 | regno, tn_buf); |
| 1324 | return -EACCES; | 1788 | return -EACCES; |
| 1325 | } | 1789 | } |
| 1326 | off = regs[regno].off + regs[regno].var_off.value; | 1790 | off = reg->off + reg->var_off.value; |
| 1327 | if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || | 1791 | if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || |
| 1328 | access_size < 0 || (access_size == 0 && !zero_size_allowed)) { | 1792 | access_size < 0 || (access_size == 0 && !zero_size_allowed)) { |
| 1329 | verbose(env, "invalid stack type R%d off=%d access_size=%d\n", | 1793 | verbose(env, "invalid stack type R%d off=%d access_size=%d\n", |
| @@ -1331,9 +1795,6 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, | |||
| 1331 | return -EACCES; | 1795 | return -EACCES; |
| 1332 | } | 1796 | } |
| 1333 | 1797 | ||
| 1334 | if (env->prog->aux->stack_depth < -off) | ||
| 1335 | env->prog->aux->stack_depth = -off; | ||
| 1336 | |||
| 1337 | if (meta && meta->raw_mode) { | 1798 | if (meta && meta->raw_mode) { |
| 1338 | meta->access_size = access_size; | 1799 | meta->access_size = access_size; |
| 1339 | meta->regno = regno; | 1800 | meta->regno = regno; |
| @@ -1341,17 +1802,32 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, | |||
| 1341 | } | 1802 | } |
| 1342 | 1803 | ||
| 1343 | for (i = 0; i < access_size; i++) { | 1804 | for (i = 0; i < access_size; i++) { |
| 1805 | u8 *stype; | ||
| 1806 | |||
| 1344 | slot = -(off + i) - 1; | 1807 | slot = -(off + i) - 1; |
| 1345 | spi = slot / BPF_REG_SIZE; | 1808 | spi = slot / BPF_REG_SIZE; |
| 1346 | if (state->allocated_stack <= slot || | 1809 | if (state->allocated_stack <= slot) |
| 1347 | state->stack[spi].slot_type[slot % BPF_REG_SIZE] != | 1810 | goto err; |
| 1348 | STACK_MISC) { | 1811 | stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE]; |
| 1349 | verbose(env, "invalid indirect read from stack off %d+%d size %d\n", | 1812 | if (*stype == STACK_MISC) |
| 1350 | off, i, access_size); | 1813 | goto mark; |
| 1351 | return -EACCES; | 1814 | if (*stype == STACK_ZERO) { |
| 1815 | /* helper can write anything into the stack */ | ||
| 1816 | *stype = STACK_MISC; | ||
| 1817 | goto mark; | ||
| 1352 | } | 1818 | } |
| 1819 | err: | ||
| 1820 | verbose(env, "invalid indirect read from stack off %d+%d size %d\n", | ||
| 1821 | off, i, access_size); | ||
| 1822 | return -EACCES; | ||
| 1823 | mark: | ||
| 1824 | /* reading any byte out of 8-byte 'spill_slot' will cause | ||
| 1825 | * the whole slot to be marked as 'read' | ||
| 1826 | */ | ||
| 1827 | mark_stack_slot_read(env, env->cur_state, env->cur_state->parent, | ||
| 1828 | spi, state->frameno); | ||
| 1353 | } | 1829 | } |
| 1354 | return 0; | 1830 | return update_stack_depth(env, state, off); |
| 1355 | } | 1831 | } |
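
The stack-boundary loop above converts a negative frame-pointer offset plus a byte index into a byte slot and an 8-byte spill-slot index (spi) before inspecting the slot type. A minimal user-space sketch of just that arithmetic, with a locally defined constant standing in for BPF_REG_SIZE (the function name is illustrative, not the kernel's):

```c
#include <stdio.h>

#define REG_SIZE 8              /* stands in for BPF_REG_SIZE */

/* Map a frame-pointer-relative offset plus a byte index to the
 * byte slot and the 8-byte spill slot that the verifier checks.
 */
static void off_to_slot(int off, int i, int *slot, int *spi)
{
	*slot = -(off + i) - 1;     /* 0-based byte index below fp */
	*spi  = *slot / REG_SIZE;   /* which 8-byte stack slot */
}

int main(void)
{
	int slot, spi;

	/* e.g. a helper reading 16 bytes starting at fp-16 */
	for (int i = 0; i < 16; i++) {
		off_to_slot(-16, i, &slot, &spi);
		printf("byte %2d -> slot %2d spi %d\n", i, slot, spi);
	}
	return 0;
}
```

Because all bytes of one spi share a spill slot, reading any of them is enough to mark the whole slot as read, which is what the `mark:` label above does.
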
| 1356 | 1832 | ||
| 1357 | static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, | 1833 | static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, |
| @@ -1374,6 +1850,19 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, | |||
| 1374 | } | 1850 | } |
| 1375 | } | 1851 | } |
| 1376 | 1852 | ||
| 1853 | static bool arg_type_is_mem_ptr(enum bpf_arg_type type) | ||
| 1854 | { | ||
| 1855 | return type == ARG_PTR_TO_MEM || | ||
| 1856 | type == ARG_PTR_TO_MEM_OR_NULL || | ||
| 1857 | type == ARG_PTR_TO_UNINIT_MEM; | ||
| 1858 | } | ||
| 1859 | |||
| 1860 | static bool arg_type_is_mem_size(enum bpf_arg_type type) | ||
| 1861 | { | ||
| 1862 | return type == ARG_CONST_SIZE || | ||
| 1863 | type == ARG_CONST_SIZE_OR_ZERO; | ||
| 1864 | } | ||
| 1865 | |||
| 1377 | static int check_func_arg(struct bpf_verifier_env *env, u32 regno, | 1866 | static int check_func_arg(struct bpf_verifier_env *env, u32 regno, |
| 1378 | enum bpf_arg_type arg_type, | 1867 | enum bpf_arg_type arg_type, |
| 1379 | struct bpf_call_arg_meta *meta) | 1868 | struct bpf_call_arg_meta *meta) |
| @@ -1423,15 +1912,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, | |||
| 1423 | expected_type = PTR_TO_CTX; | 1912 | expected_type = PTR_TO_CTX; |
| 1424 | if (type != expected_type) | 1913 | if (type != expected_type) |
| 1425 | goto err_type; | 1914 | goto err_type; |
| 1426 | } else if (arg_type == ARG_PTR_TO_MEM || | 1915 | } else if (arg_type_is_mem_ptr(arg_type)) { |
| 1427 | arg_type == ARG_PTR_TO_MEM_OR_NULL || | ||
| 1428 | arg_type == ARG_PTR_TO_UNINIT_MEM) { | ||
| 1429 | expected_type = PTR_TO_STACK; | 1916 | expected_type = PTR_TO_STACK; |
| 1430 | /* One exception here. In case function allows for NULL to be | 1917 | /* One exception here. In case function allows for NULL to be |
| 1431 | * passed in as argument, it's a SCALAR_VALUE type. Final test | 1918 | * passed in as argument, it's a SCALAR_VALUE type. Final test |
| 1432 | * happens during stack boundary checking. | 1919 | * happens during stack boundary checking. |
| 1433 | */ | 1920 | */ |
| 1434 | if (register_is_null(*reg) && | 1921 | if (register_is_null(reg) && |
| 1435 | arg_type == ARG_PTR_TO_MEM_OR_NULL) | 1922 | arg_type == ARG_PTR_TO_MEM_OR_NULL) |
| 1436 | /* final test in check_stack_boundary() */; | 1923 | /* final test in check_stack_boundary() */; |
| 1437 | else if (!type_is_pkt_pointer(type) && | 1924 | else if (!type_is_pkt_pointer(type) && |
| @@ -1486,25 +1973,12 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, | |||
| 1486 | err = check_stack_boundary(env, regno, | 1973 | err = check_stack_boundary(env, regno, |
| 1487 | meta->map_ptr->value_size, | 1974 | meta->map_ptr->value_size, |
| 1488 | false, NULL); | 1975 | false, NULL); |
| 1489 | } else if (arg_type == ARG_CONST_SIZE || | 1976 | } else if (arg_type_is_mem_size(arg_type)) { |
| 1490 | arg_type == ARG_CONST_SIZE_OR_ZERO) { | ||
| 1491 | bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO); | 1977 | bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO); |
| 1492 | 1978 | ||
| 1493 | /* bpf_xxx(..., buf, len) call will access 'len' bytes | ||
| 1494 | * from stack pointer 'buf'. Check it | ||
| 1495 | * note: regno == len, regno - 1 == buf | ||
| 1496 | */ | ||
| 1497 | if (regno == 0) { | ||
| 1498 | /* kernel subsystem misconfigured verifier */ | ||
| 1499 | verbose(env, | ||
| 1500 | "ARG_CONST_SIZE cannot be first argument\n"); | ||
| 1501 | return -EACCES; | ||
| 1502 | } | ||
| 1503 | |||
| 1504 | /* The register is SCALAR_VALUE; the access check | 1979 | /* The register is SCALAR_VALUE; the access check |
| 1505 | * happens using its boundaries. | 1980 | * happens using its boundaries. |
| 1506 | */ | 1981 | */ |
| 1507 | |||
| 1508 | if (!tnum_is_const(reg->var_off)) | 1982 | if (!tnum_is_const(reg->var_off)) |
| 1509 | /* For unprivileged variable accesses, disable raw | 1983 | /* For unprivileged variable accesses, disable raw |
| 1510 | * mode so that the program is required to | 1984 | * mode so that the program is required to |
| @@ -1604,6 +2078,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, | |||
| 1604 | case BPF_FUNC_tail_call: | 2078 | case BPF_FUNC_tail_call: |
| 1605 | if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY) | 2079 | if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY) |
| 1606 | goto error; | 2080 | goto error; |
| 2081 | if (env->subprog_cnt) { | ||
| 2082 | verbose(env, "tail_calls are not allowed in programs with bpf-to-bpf calls\n"); | ||
| 2083 | return -EINVAL; | ||
| 2084 | } | ||
| 1607 | break; | 2085 | break; |
| 1608 | case BPF_FUNC_perf_event_read: | 2086 | case BPF_FUNC_perf_event_read: |
| 1609 | case BPF_FUNC_perf_event_output: | 2087 | case BPF_FUNC_perf_event_output: |
| @@ -1644,7 +2122,7 @@ error: | |||
| 1644 | return -EINVAL; | 2122 | return -EINVAL; |
| 1645 | } | 2123 | } |
| 1646 | 2124 | ||
| 1647 | static int check_raw_mode(const struct bpf_func_proto *fn) | 2125 | static bool check_raw_mode_ok(const struct bpf_func_proto *fn) |
| 1648 | { | 2126 | { |
| 1649 | int count = 0; | 2127 | int count = 0; |
| 1650 | 2128 | ||
| @@ -1659,15 +2137,52 @@ static int check_raw_mode(const struct bpf_func_proto *fn) | |||
| 1659 | if (fn->arg5_type == ARG_PTR_TO_UNINIT_MEM) | 2137 | if (fn->arg5_type == ARG_PTR_TO_UNINIT_MEM) |
| 1660 | count++; | 2138 | count++; |
| 1661 | 2139 | ||
| 1662 | return count > 1 ? -EINVAL : 0; | 2140 | /* We only support one arg being in raw mode at the moment, |
| 2141 | * which is sufficient for the helper functions we have | ||
| 2142 | * right now. | ||
| 2143 | */ | ||
| 2144 | return count <= 1; | ||
| 2145 | } | ||
| 2146 | |||
| 2147 | static bool check_args_pair_invalid(enum bpf_arg_type arg_curr, | ||
| 2148 | enum bpf_arg_type arg_next) | ||
| 2149 | { | ||
| 2150 | return (arg_type_is_mem_ptr(arg_curr) && | ||
| 2151 | !arg_type_is_mem_size(arg_next)) || | ||
| 2152 | (!arg_type_is_mem_ptr(arg_curr) && | ||
| 2153 | arg_type_is_mem_size(arg_next)); | ||
| 2154 | } | ||
| 2155 | |||
| 2156 | static bool check_arg_pair_ok(const struct bpf_func_proto *fn) | ||
| 2157 | { | ||
| 2158 | /* bpf_xxx(..., buf, len) call will access 'len' | ||
| 2159 | * bytes from memory 'buf'. Both arg types need | ||
| 2160 | * to be paired, so make sure there's no buggy | ||
| 2161 | * helper function specification. | ||
| 2162 | */ | ||
| 2163 | if (arg_type_is_mem_size(fn->arg1_type) || | ||
| 2164 | arg_type_is_mem_ptr(fn->arg5_type) || | ||
| 2165 | check_args_pair_invalid(fn->arg1_type, fn->arg2_type) || | ||
| 2166 | check_args_pair_invalid(fn->arg2_type, fn->arg3_type) || | ||
| 2167 | check_args_pair_invalid(fn->arg3_type, fn->arg4_type) || | ||
| 2168 | check_args_pair_invalid(fn->arg4_type, fn->arg5_type)) | ||
| 2169 | return false; | ||
| 2170 | |||
| 2171 | return true; | ||
| 2172 | } | ||
| 2173 | |||
| 2174 | static int check_func_proto(const struct bpf_func_proto *fn) | ||
| 2175 | { | ||
| 2176 | return check_raw_mode_ok(fn) && | ||
| 2177 | check_arg_pair_ok(fn) ? 0 : -EINVAL; | ||
| 1663 | } | 2178 | } |
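
check_func_proto() now rejects helper prototypes where a memory-pointer argument is not immediately followed by a size argument (and vice versa). The sketch below reimplements that pairing rule in plain C with a reduced, hypothetical arg-type enum so it can be compiled and tested on its own:

```c
#include <stdbool.h>
#include <stdio.h>

/* Reduced stand-in for enum bpf_arg_type */
enum arg_type {
	ARG_DONTCARE,
	ARG_PTR_TO_MEM,
	ARG_PTR_TO_MEM_OR_NULL,
	ARG_PTR_TO_UNINIT_MEM,
	ARG_CONST_SIZE,
	ARG_CONST_SIZE_OR_ZERO,
};

static bool is_mem_ptr(enum arg_type t)
{
	return t == ARG_PTR_TO_MEM || t == ARG_PTR_TO_MEM_OR_NULL ||
	       t == ARG_PTR_TO_UNINIT_MEM;
}

static bool is_mem_size(enum arg_type t)
{
	return t == ARG_CONST_SIZE || t == ARG_CONST_SIZE_OR_ZERO;
}

/* A mem pointer must be directly followed by its size, and a size
 * must be directly preceded by a mem pointer.
 */
static bool pair_invalid(enum arg_type cur, enum arg_type next)
{
	return (is_mem_ptr(cur) && !is_mem_size(next)) ||
	       (!is_mem_ptr(cur) && is_mem_size(next));
}

static bool proto_ok(const enum arg_type arg[5])
{
	if (is_mem_size(arg[0]) || is_mem_ptr(arg[4]))
		return false;           /* size first or ptr last is bogus */
	for (int i = 0; i < 4; i++)
		if (pair_invalid(arg[i], arg[i + 1]))
			return false;
	return true;
}

int main(void)
{
	enum arg_type good[5] = { ARG_PTR_TO_MEM, ARG_CONST_SIZE,
				  ARG_DONTCARE, ARG_DONTCARE, ARG_DONTCARE };
	enum arg_type bad[5]  = { ARG_PTR_TO_MEM, ARG_DONTCARE,
				  ARG_DONTCARE, ARG_DONTCARE, ARG_DONTCARE };

	printf("good proto: %s\n", proto_ok(good) ? "ok" : "rejected");
	printf("bad proto:  %s\n", proto_ok(bad) ? "ok" : "rejected");
	return 0;
}
```

This moves the old per-call "ARG_CONST_SIZE cannot be first argument" check into a one-time sanity check of the helper specification itself.
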
| 1664 | 2179 | ||
| 1665 | /* Packet data might have moved, any old PTR_TO_PACKET[_META,_END] | 2180 | /* Packet data might have moved, any old PTR_TO_PACKET[_META,_END] |
| 1666 | * are now invalid, so turn them into unknown SCALAR_VALUE. | 2181 | * are now invalid, so turn them into unknown SCALAR_VALUE. |
| 1667 | */ | 2182 | */ |
| 1668 | static void clear_all_pkt_pointers(struct bpf_verifier_env *env) | 2183 | static void __clear_all_pkt_pointers(struct bpf_verifier_env *env, |
| 2184 | struct bpf_func_state *state) | ||
| 1669 | { | 2185 | { |
| 1670 | struct bpf_verifier_state *state = env->cur_state; | ||
| 1671 | struct bpf_reg_state *regs = state->regs, *reg; | 2186 | struct bpf_reg_state *regs = state->regs, *reg; |
| 1672 | int i; | 2187 | int i; |
| 1673 | 2188 | ||
| @@ -1684,7 +2199,121 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) | |||
| 1684 | } | 2199 | } |
| 1685 | } | 2200 | } |
| 1686 | 2201 | ||
| 1687 | static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) | 2202 | static void clear_all_pkt_pointers(struct bpf_verifier_env *env) |
| 2203 | { | ||
| 2204 | struct bpf_verifier_state *vstate = env->cur_state; | ||
| 2205 | int i; | ||
| 2206 | |||
| 2207 | for (i = 0; i <= vstate->curframe; i++) | ||
| 2208 | __clear_all_pkt_pointers(env, vstate->frame[i]); | ||
| 2209 | } | ||
| 2210 | |||
| 2211 | static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, | ||
| 2212 | int *insn_idx) | ||
| 2213 | { | ||
| 2214 | struct bpf_verifier_state *state = env->cur_state; | ||
| 2215 | struct bpf_func_state *caller, *callee; | ||
| 2216 | int i, subprog, target_insn; | ||
| 2217 | |||
| 2218 | if (state->curframe + 1 >= MAX_CALL_FRAMES) { | ||
| 2219 | verbose(env, "the call stack of %d frames is too deep\n", | ||
| 2220 | state->curframe + 2); | ||
| 2221 | return -E2BIG; | ||
| 2222 | } | ||
| 2223 | |||
| 2224 | target_insn = *insn_idx + insn->imm; | ||
| 2225 | subprog = find_subprog(env, target_insn + 1); | ||
| 2226 | if (subprog < 0) { | ||
| 2227 | verbose(env, "verifier bug. No program starts at insn %d\n", | ||
| 2228 | target_insn + 1); | ||
| 2229 | return -EFAULT; | ||
| 2230 | } | ||
| 2231 | |||
| 2232 | caller = state->frame[state->curframe]; | ||
| 2233 | if (state->frame[state->curframe + 1]) { | ||
| 2234 | verbose(env, "verifier bug. Frame %d already allocated\n", | ||
| 2235 | state->curframe + 1); | ||
| 2236 | return -EFAULT; | ||
| 2237 | } | ||
| 2238 | |||
| 2239 | callee = kzalloc(sizeof(*callee), GFP_KERNEL); | ||
| 2240 | if (!callee) | ||
| 2241 | return -ENOMEM; | ||
| 2242 | state->frame[state->curframe + 1] = callee; | ||
| 2243 | |||
| 2244 | /* callee cannot access r0, r6 - r9 for reading and has to write | ||
| 2245 | * into its own stack before reading from it. | ||
| 2246 | * callee can read/write into caller's stack | ||
| 2247 | */ | ||
| 2248 | init_func_state(env, callee, | ||
| 2249 | /* remember the callsite, it will be used by bpf_exit */ | ||
| 2250 | *insn_idx /* callsite */, | ||
| 2251 | state->curframe + 1 /* frameno within this callchain */, | ||
| 2252 | subprog + 1 /* subprog number within this prog */); | ||
| 2253 | |||
| 2254 | /* copy r1 - r5 args that callee can access */ | ||
| 2255 | for (i = BPF_REG_1; i <= BPF_REG_5; i++) | ||
| 2256 | callee->regs[i] = caller->regs[i]; | ||
| 2257 | |||
| 2258 | /* after the call registers r0 - r5 were scratched | ||
| 2259 | for (i = 0; i < CALLER_SAVED_REGS; i++) { | ||
| 2260 | mark_reg_not_init(env, caller->regs, caller_saved[i]); | ||
| 2261 | check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); | ||
| 2262 | } | ||
| 2263 | |||
| 2264 | /* only increment it after check_reg_arg() finished */ | ||
| 2265 | state->curframe++; | ||
| 2266 | |||
| 2267 | /* and go analyze first insn of the callee */ | ||
| 2268 | *insn_idx = target_insn; | ||
| 2269 | |||
| 2270 | if (env->log.level) { | ||
| 2271 | verbose(env, "caller:\n"); | ||
| 2272 | print_verifier_state(env, caller); | ||
| 2273 | verbose(env, "callee:\n"); | ||
| 2274 | print_verifier_state(env, callee); | ||
| 2275 | } | ||
| 2276 | return 0; | ||
| 2277 | } | ||
| 2278 | |||
| 2279 | static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) | ||
| 2280 | { | ||
| 2281 | struct bpf_verifier_state *state = env->cur_state; | ||
| 2282 | struct bpf_func_state *caller, *callee; | ||
| 2283 | struct bpf_reg_state *r0; | ||
| 2284 | |||
| 2285 | callee = state->frame[state->curframe]; | ||
| 2286 | r0 = &callee->regs[BPF_REG_0]; | ||
| 2287 | if (r0->type == PTR_TO_STACK) { | ||
| 2288 | /* technically it's ok to return caller's stack pointer | ||
| 2289 | * (or caller's caller's pointer) back to the caller, | ||
| 2290 | * since these pointers are valid. Only current stack | ||
| 2291 | * pointer will be invalid as soon as function exits, | ||
| 2292 | * but let's be conservative | ||
| 2293 | */ | ||
| 2294 | verbose(env, "cannot return stack pointer to the caller\n"); | ||
| 2295 | return -EINVAL; | ||
| 2296 | } | ||
| 2297 | |||
| 2298 | state->curframe--; | ||
| 2299 | caller = state->frame[state->curframe]; | ||
| 2300 | /* return to the caller whatever r0 had in the callee */ | ||
| 2301 | caller->regs[BPF_REG_0] = *r0; | ||
| 2302 | |||
| 2303 | *insn_idx = callee->callsite + 1; | ||
| 2304 | if (env->log.level) { | ||
| 2305 | verbose(env, "returning from callee:\n"); | ||
| 2306 | print_verifier_state(env, callee); | ||
| 2307 | verbose(env, "to caller at %d:\n", *insn_idx); | ||
| 2308 | print_verifier_state(env, caller); | ||
| 2309 | } | ||
| 2310 | /* clear everything in the callee */ | ||
| 2311 | free_func_state(callee); | ||
| 2312 | state->frame[state->curframe + 1] = NULL; | ||
| 2313 | return 0; | ||
| 2314 | } | ||
| 2315 | |||
| 2316 | static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx) | ||
| 1688 | { | 2317 | { |
| 1689 | const struct bpf_func_proto *fn = NULL; | 2318 | const struct bpf_func_proto *fn = NULL; |
| 1690 | struct bpf_reg_state *regs; | 2319 | struct bpf_reg_state *regs; |
| @@ -1701,7 +2330,6 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) | |||
| 1701 | 2330 | ||
| 1702 | if (env->ops->get_func_proto) | 2331 | if (env->ops->get_func_proto) |
| 1703 | fn = env->ops->get_func_proto(func_id); | 2332 | fn = env->ops->get_func_proto(func_id); |
| 1704 | |||
| 1705 | if (!fn) { | 2333 | if (!fn) { |
| 1706 | verbose(env, "unknown func %s#%d\n", func_id_name(func_id), | 2334 | verbose(env, "unknown func %s#%d\n", func_id_name(func_id), |
| 1707 | func_id); | 2335 | func_id); |
| @@ -1725,10 +2353,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) | |||
| 1725 | memset(&meta, 0, sizeof(meta)); | 2353 | memset(&meta, 0, sizeof(meta)); |
| 1726 | meta.pkt_access = fn->pkt_access; | 2354 | meta.pkt_access = fn->pkt_access; |
| 1727 | 2355 | ||
| 1728 | /* We only support one arg being in raw mode at the moment, which | 2356 | err = check_func_proto(fn); |
| 1729 | * is sufficient for the helper functions we have right now. | ||
| 1730 | */ | ||
| 1731 | err = check_raw_mode(fn); | ||
| 1732 | if (err) { | 2357 | if (err) { |
| 1733 | verbose(env, "kernel subsystem misconfigured func %s#%d\n", | 2358 | verbose(env, "kernel subsystem misconfigured func %s#%d\n", |
| 1734 | func_id_name(func_id), func_id); | 2359 | func_id_name(func_id), func_id); |
| @@ -1884,7 +2509,9 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, | |||
| 1884 | const struct bpf_reg_state *ptr_reg, | 2509 | const struct bpf_reg_state *ptr_reg, |
| 1885 | const struct bpf_reg_state *off_reg) | 2510 | const struct bpf_reg_state *off_reg) |
| 1886 | { | 2511 | { |
| 1887 | struct bpf_reg_state *regs = cur_regs(env), *dst_reg; | 2512 | struct bpf_verifier_state *vstate = env->cur_state; |
| 2513 | struct bpf_func_state *state = vstate->frame[vstate->curframe]; | ||
| 2514 | struct bpf_reg_state *regs = state->regs, *dst_reg; | ||
| 1888 | bool known = tnum_is_const(off_reg->var_off); | 2515 | bool known = tnum_is_const(off_reg->var_off); |
| 1889 | s64 smin_val = off_reg->smin_value, smax_val = off_reg->smax_value, | 2516 | s64 smin_val = off_reg->smin_value, smax_val = off_reg->smax_value, |
| 1890 | smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value; | 2517 | smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value; |
| @@ -2319,7 +2946,9 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, | |||
| 2319 | static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, | 2946 | static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, |
| 2320 | struct bpf_insn *insn) | 2947 | struct bpf_insn *insn) |
| 2321 | { | 2948 | { |
| 2322 | struct bpf_reg_state *regs = cur_regs(env), *dst_reg, *src_reg; | 2949 | struct bpf_verifier_state *vstate = env->cur_state; |
| 2950 | struct bpf_func_state *state = vstate->frame[vstate->curframe]; | ||
| 2951 | struct bpf_reg_state *regs = state->regs, *dst_reg, *src_reg; | ||
| 2323 | struct bpf_reg_state *ptr_reg = NULL, off_reg = {0}; | 2952 | struct bpf_reg_state *ptr_reg = NULL, off_reg = {0}; |
| 2324 | u8 opcode = BPF_OP(insn->code); | 2953 | u8 opcode = BPF_OP(insn->code); |
| 2325 | 2954 | ||
| @@ -2370,12 +2999,12 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, | |||
| 2370 | 2999 | ||
| 2371 | /* Got here implies adding two SCALAR_VALUEs */ | 3000 | /* Got here implies adding two SCALAR_VALUEs */ |
| 2372 | if (WARN_ON_ONCE(ptr_reg)) { | 3001 | if (WARN_ON_ONCE(ptr_reg)) { |
| 2373 | print_verifier_state(env, env->cur_state); | 3002 | print_verifier_state(env, state); |
| 2374 | verbose(env, "verifier internal error: unexpected ptr_reg\n"); | 3003 | verbose(env, "verifier internal error: unexpected ptr_reg\n"); |
| 2375 | return -EINVAL; | 3004 | return -EINVAL; |
| 2376 | } | 3005 | } |
| 2377 | if (WARN_ON(!src_reg)) { | 3006 | if (WARN_ON(!src_reg)) { |
| 2378 | print_verifier_state(env, env->cur_state); | 3007 | print_verifier_state(env, state); |
| 2379 | verbose(env, "verifier internal error: no src_reg\n"); | 3008 | verbose(env, "verifier internal error: no src_reg\n"); |
| 2380 | return -EINVAL; | 3009 | return -EINVAL; |
| 2381 | } | 3010 | } |
| @@ -2537,14 +3166,15 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) | |||
| 2537 | return 0; | 3166 | return 0; |
| 2538 | } | 3167 | } |
| 2539 | 3168 | ||
| 2540 | static void find_good_pkt_pointers(struct bpf_verifier_state *state, | 3169 | static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, |
| 2541 | struct bpf_reg_state *dst_reg, | 3170 | struct bpf_reg_state *dst_reg, |
| 2542 | enum bpf_reg_type type, | 3171 | enum bpf_reg_type type, |
| 2543 | bool range_right_open) | 3172 | bool range_right_open) |
| 2544 | { | 3173 | { |
| 3174 | struct bpf_func_state *state = vstate->frame[vstate->curframe]; | ||
| 2545 | struct bpf_reg_state *regs = state->regs, *reg; | 3175 | struct bpf_reg_state *regs = state->regs, *reg; |
| 2546 | u16 new_range; | 3176 | u16 new_range; |
| 2547 | int i; | 3177 | int i, j; |
| 2548 | 3178 | ||
| 2549 | if (dst_reg->off < 0 || | 3179 | if (dst_reg->off < 0 || |
| 2550 | (dst_reg->off == 0 && range_right_open)) | 3180 | (dst_reg->off == 0 && range_right_open)) |
| @@ -2614,12 +3244,15 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state, | |||
| 2614 | /* keep the maximum range already checked */ | 3244 | /* keep the maximum range already checked */ |
| 2615 | regs[i].range = max(regs[i].range, new_range); | 3245 | regs[i].range = max(regs[i].range, new_range); |
| 2616 | 3246 | ||
| 2617 | for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { | 3247 | for (j = 0; j <= vstate->curframe; j++) { |
| 2618 | if (state->stack[i].slot_type[0] != STACK_SPILL) | 3248 | state = vstate->frame[j]; |
| 2619 | continue; | 3249 | for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { |
| 2620 | reg = &state->stack[i].spilled_ptr; | 3250 | if (state->stack[i].slot_type[0] != STACK_SPILL) |
| 2621 | if (reg->type == type && reg->id == dst_reg->id) | 3251 | continue; |
| 2622 | reg->range = max(reg->range, new_range); | 3252 | reg = &state->stack[i].spilled_ptr; |
| 3253 | if (reg->type == type && reg->id == dst_reg->id) | ||
| 3254 | reg->range = max(reg->range, new_range); | ||
| 3255 | } | ||
| 2623 | } | 3256 | } |
| 2624 | } | 3257 | } |
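
With multiple frames in play, find_good_pkt_pointers() now widens the verified range of matching spilled packet pointers in every frame up to curframe, not just the current one. The loop structure can be pictured as below (reduced, hypothetical types):

```c
#include <stdio.h>

#define MAX_FRAMES 8
#define SLOTS 4

/* Reduced model: each frame holds a few spilled "pointers" with an
 * id and a verified range.
 */
struct spilled { int id; int range; };
struct frame   { struct spilled slot[SLOTS]; };
struct vstate  { int curframe; struct frame frame[MAX_FRAMES]; };

static int max_int(int a, int b) { return a > b ? a : b; }

/* Widen the range of every spilled pointer that shares dst_id,
 * in every frame of the call chain.
 */
static void widen_ranges(struct vstate *vs, int dst_id, int new_range)
{
	for (int j = 0; j <= vs->curframe; j++)
		for (int i = 0; i < SLOTS; i++) {
			struct spilled *s = &vs->frame[j].slot[i];

			if (s->id == dst_id)
				s->range = max_int(s->range, new_range);
		}
}

int main(void)
{
	struct vstate vs = { .curframe = 1 };

	vs.frame[0].slot[0] = (struct spilled){ .id = 3, .range = 8 };
	vs.frame[1].slot[2] = (struct spilled){ .id = 3, .range = 0 };

	widen_ranges(&vs, 3, 14);
	printf("frame0 slot0 range=%d, frame1 slot2 range=%d\n",
	       vs.frame[0].slot[0].range, vs.frame[1].slot[2].range);
	return 0;
}
```

The same all-frames walk is applied to mark_map_regs() further down, since a map pointer spilled in a caller frame can be used after the callee returns.
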
| 2625 | 3258 | ||
| @@ -2857,20 +3490,24 @@ static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id, | |||
| 2857 | /* The logic is similar to find_good_pkt_pointers(), both could eventually | 3490 | /* The logic is similar to find_good_pkt_pointers(), both could eventually |
| 2858 | * be folded together at some point. | 3491 | * be folded together at some point. |
| 2859 | */ | 3492 | */ |
| 2860 | static void mark_map_regs(struct bpf_verifier_state *state, u32 regno, | 3493 | static void mark_map_regs(struct bpf_verifier_state *vstate, u32 regno, |
| 2861 | bool is_null) | 3494 | bool is_null) |
| 2862 | { | 3495 | { |
| 3496 | struct bpf_func_state *state = vstate->frame[vstate->curframe]; | ||
| 2863 | struct bpf_reg_state *regs = state->regs; | 3497 | struct bpf_reg_state *regs = state->regs; |
| 2864 | u32 id = regs[regno].id; | 3498 | u32 id = regs[regno].id; |
| 2865 | int i; | 3499 | int i, j; |
| 2866 | 3500 | ||
| 2867 | for (i = 0; i < MAX_BPF_REG; i++) | 3501 | for (i = 0; i < MAX_BPF_REG; i++) |
| 2868 | mark_map_reg(regs, i, id, is_null); | 3502 | mark_map_reg(regs, i, id, is_null); |
| 2869 | 3503 | ||
| 2870 | for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { | 3504 | for (j = 0; j <= vstate->curframe; j++) { |
| 2871 | if (state->stack[i].slot_type[0] != STACK_SPILL) | 3505 | state = vstate->frame[j]; |
| 2872 | continue; | 3506 | for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { |
| 2873 | mark_map_reg(&state->stack[i].spilled_ptr, 0, id, is_null); | 3507 | if (state->stack[i].slot_type[0] != STACK_SPILL) |
| 3508 | continue; | ||
| 3509 | mark_map_reg(&state->stack[i].spilled_ptr, 0, id, is_null); | ||
| 3510 | } | ||
| 2874 | } | 3511 | } |
| 2875 | } | 3512 | } |
| 2876 | 3513 | ||
| @@ -2970,8 +3607,10 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn, | |||
| 2970 | static int check_cond_jmp_op(struct bpf_verifier_env *env, | 3607 | static int check_cond_jmp_op(struct bpf_verifier_env *env, |
| 2971 | struct bpf_insn *insn, int *insn_idx) | 3608 | struct bpf_insn *insn, int *insn_idx) |
| 2972 | { | 3609 | { |
| 2973 | struct bpf_verifier_state *other_branch, *this_branch = env->cur_state; | 3610 | struct bpf_verifier_state *this_branch = env->cur_state; |
| 2974 | struct bpf_reg_state *regs = this_branch->regs, *dst_reg; | 3611 | struct bpf_verifier_state *other_branch; |
| 3612 | struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs; | ||
| 3613 | struct bpf_reg_state *dst_reg, *other_branch_regs; | ||
| 2975 | u8 opcode = BPF_OP(insn->code); | 3614 | u8 opcode = BPF_OP(insn->code); |
| 2976 | int err; | 3615 | int err; |
| 2977 | 3616 | ||
| @@ -3014,8 +3653,9 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, | |||
| 3014 | if (BPF_SRC(insn->code) == BPF_K && | 3653 | if (BPF_SRC(insn->code) == BPF_K && |
| 3015 | (opcode == BPF_JEQ || opcode == BPF_JNE) && | 3654 | (opcode == BPF_JEQ || opcode == BPF_JNE) && |
| 3016 | dst_reg->type == SCALAR_VALUE && | 3655 | dst_reg->type == SCALAR_VALUE && |
| 3017 | tnum_equals_const(dst_reg->var_off, insn->imm)) { | 3656 | tnum_is_const(dst_reg->var_off)) { |
| 3018 | if (opcode == BPF_JEQ) { | 3657 | if ((opcode == BPF_JEQ && dst_reg->var_off.value == insn->imm) || |
| 3658 | (opcode == BPF_JNE && dst_reg->var_off.value != insn->imm)) { | ||
| 3019 | /* if (imm == imm) goto pc+off; | 3659 | /* if (imm == imm) goto pc+off; |
| 3020 | * only follow the goto, ignore fall-through | 3660 | * only follow the goto, ignore fall-through |
| 3021 | */ | 3661 | */ |
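
The hunk above generalizes dead-branch detection: instead of only recognizing "known value equals imm", the verifier first checks that the register is a known constant and then decides for both JEQ and JNE which successor must be explored. A sketch with a minimal tnum (value/mask pair, where mask == 0 means fully known); the tnum layout matches the kernel's concept, the rest is illustrative:

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Minimal tracked-number: bits set in mask are unknown */
struct tnum { uint64_t value; uint64_t mask; };

static bool tnum_is_const(struct tnum t) { return t.mask == 0; }

enum branch { TAKE_BOTH, ONLY_GOTO, ONLY_FALLTHROUGH };

/* Decide which successors of a JEQ/JNE against an immediate must be
 * explored when the register value is fully known.
 */
static enum branch decide(struct tnum reg, bool is_jeq, uint64_t imm)
{
	if (!tnum_is_const(reg))
		return TAKE_BOTH;
	if ((is_jeq && reg.value == imm) || (!is_jeq && reg.value != imm))
		return ONLY_GOTO;          /* condition always true */
	return ONLY_FALLTHROUGH;           /* condition always false */
}

int main(void)
{
	struct tnum known = { .value = 7, .mask = 0 };
	struct tnum fuzzy = { .value = 4, .mask = 3 };

	printf("JEQ 7 on const 7 -> %d\n", decide(known, true, 7));
	printf("JNE 7 on const 7 -> %d\n", decide(known, false, 7));
	printf("JEQ 7 on unknown -> %d\n", decide(fuzzy, true, 7));
	return 0;
}
```
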
| @@ -3033,6 +3673,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, | |||
| 3033 | other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx); | 3673 | other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx); |
| 3034 | if (!other_branch) | 3674 | if (!other_branch) |
| 3035 | return -EFAULT; | 3675 | return -EFAULT; |
| 3676 | other_branch_regs = other_branch->frame[other_branch->curframe]->regs; | ||
| 3036 | 3677 | ||
| 3037 | /* detect if we are comparing against a constant value so we can adjust | 3678 | /* detect if we are comparing against a constant value so we can adjust |
| 3038 | * our min/max values for our dst register. | 3679 | * our min/max values for our dst register. |
| @@ -3045,22 +3686,22 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, | |||
| 3045 | if (dst_reg->type == SCALAR_VALUE && | 3686 | if (dst_reg->type == SCALAR_VALUE && |
| 3046 | regs[insn->src_reg].type == SCALAR_VALUE) { | 3687 | regs[insn->src_reg].type == SCALAR_VALUE) { |
| 3047 | if (tnum_is_const(regs[insn->src_reg].var_off)) | 3688 | if (tnum_is_const(regs[insn->src_reg].var_off)) |
| 3048 | reg_set_min_max(&other_branch->regs[insn->dst_reg], | 3689 | reg_set_min_max(&other_branch_regs[insn->dst_reg], |
| 3049 | dst_reg, regs[insn->src_reg].var_off.value, | 3690 | dst_reg, regs[insn->src_reg].var_off.value, |
| 3050 | opcode); | 3691 | opcode); |
| 3051 | else if (tnum_is_const(dst_reg->var_off)) | 3692 | else if (tnum_is_const(dst_reg->var_off)) |
| 3052 | reg_set_min_max_inv(&other_branch->regs[insn->src_reg], | 3693 | reg_set_min_max_inv(&other_branch_regs[insn->src_reg], |
| 3053 | ®s[insn->src_reg], | 3694 | ®s[insn->src_reg], |
| 3054 | dst_reg->var_off.value, opcode); | 3695 | dst_reg->var_off.value, opcode); |
| 3055 | else if (opcode == BPF_JEQ || opcode == BPF_JNE) | 3696 | else if (opcode == BPF_JEQ || opcode == BPF_JNE) |
| 3056 | /* Comparing for equality, we can combine knowledge */ | 3697 | /* Comparing for equality, we can combine knowledge */ |
| 3057 | reg_combine_min_max(&other_branch->regs[insn->src_reg], | 3698 | reg_combine_min_max(&other_branch_regs[insn->src_reg], |
| 3058 | &other_branch->regs[insn->dst_reg], | 3699 | &other_branch_regs[insn->dst_reg], |
| 3059 | ®s[insn->src_reg], | 3700 | ®s[insn->src_reg], |
| 3060 | ®s[insn->dst_reg], opcode); | 3701 | ®s[insn->dst_reg], opcode); |
| 3061 | } | 3702 | } |
| 3062 | } else if (dst_reg->type == SCALAR_VALUE) { | 3703 | } else if (dst_reg->type == SCALAR_VALUE) { |
| 3063 | reg_set_min_max(&other_branch->regs[insn->dst_reg], | 3704 | reg_set_min_max(&other_branch_regs[insn->dst_reg], |
| 3064 | dst_reg, insn->imm, opcode); | 3705 | dst_reg, insn->imm, opcode); |
| 3065 | } | 3706 | } |
| 3066 | 3707 | ||
| @@ -3081,7 +3722,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, | |||
| 3081 | return -EACCES; | 3722 | return -EACCES; |
| 3082 | } | 3723 | } |
| 3083 | if (env->log.level) | 3724 | if (env->log.level) |
| 3084 | print_verifier_state(env, this_branch); | 3725 | print_verifier_state(env, this_branch->frame[this_branch->curframe]); |
| 3085 | return 0; | 3726 | return 0; |
| 3086 | } | 3727 | } |
| 3087 | 3728 | ||
| @@ -3166,6 +3807,18 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) | |||
| 3166 | return -EINVAL; | 3807 | return -EINVAL; |
| 3167 | } | 3808 | } |
| 3168 | 3809 | ||
| 3810 | if (env->subprog_cnt) { | ||
| 3811 | /* when program has LD_ABS insn JITs and interpreter assume | ||
| 3812 | * that r1 == ctx == skb which is not the case for callees | ||
| 3813 | * that can have arbitrary arguments. It's problematic | ||
| 3814 | * for main prog as well since JITs would need to analyze | ||
| 3815 | * all functions in order to make proper register save/restore | ||
| 3816 | * decisions in the main prog. Hence disallow LD_ABS with calls | ||
| 3817 | */ | ||
| 3818 | verbose(env, "BPF_LD_[ABS|IND] instructions cannot be mixed with bpf-to-bpf calls\n"); | ||
| 3819 | return -EINVAL; | ||
| 3820 | } | ||
| 3821 | |||
| 3169 | if (insn->dst_reg != BPF_REG_0 || insn->off != 0 || | 3822 | if (insn->dst_reg != BPF_REG_0 || insn->off != 0 || |
| 3170 | BPF_SIZE(insn->code) == BPF_DW || | 3823 | BPF_SIZE(insn->code) == BPF_DW || |
| 3171 | (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) { | 3824 | (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) { |
| @@ -3342,6 +3995,10 @@ static int check_cfg(struct bpf_verifier_env *env) | |||
| 3342 | int ret = 0; | 3995 | int ret = 0; |
| 3343 | int i, t; | 3996 | int i, t; |
| 3344 | 3997 | ||
| 3998 | ret = check_subprogs(env); | ||
| 3999 | if (ret < 0) | ||
| 4000 | return ret; | ||
| 4001 | |||
| 3345 | insn_state = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL); | 4002 | insn_state = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL); |
| 3346 | if (!insn_state) | 4003 | if (!insn_state) |
| 3347 | return -ENOMEM; | 4004 | return -ENOMEM; |
| @@ -3374,6 +4031,14 @@ peek_stack: | |||
| 3374 | goto err_free; | 4031 | goto err_free; |
| 3375 | if (t + 1 < insn_cnt) | 4032 | if (t + 1 < insn_cnt) |
| 3376 | env->explored_states[t + 1] = STATE_LIST_MARK; | 4033 | env->explored_states[t + 1] = STATE_LIST_MARK; |
| 4034 | if (insns[t].src_reg == BPF_PSEUDO_CALL) { | ||
| 4035 | env->explored_states[t] = STATE_LIST_MARK; | ||
| 4036 | ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env); | ||
| 4037 | if (ret == 1) | ||
| 4038 | goto peek_stack; | ||
| 4039 | else if (ret < 0) | ||
| 4040 | goto err_free; | ||
| 4041 | } | ||
| 3377 | } else if (opcode == BPF_JA) { | 4042 | } else if (opcode == BPF_JA) { |
| 3378 | if (BPF_SRC(insns[t].code) != BPF_K) { | 4043 | if (BPF_SRC(insns[t].code) != BPF_K) { |
| 3379 | ret = -EINVAL; | 4044 | ret = -EINVAL; |
| @@ -3492,11 +4157,21 @@ static bool check_ids(u32 old_id, u32 cur_id, struct idpair *idmap) | |||
| 3492 | static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, | 4157 | static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, |
| 3493 | struct idpair *idmap) | 4158 | struct idpair *idmap) |
| 3494 | { | 4159 | { |
| 4160 | bool equal; | ||
| 4161 | |||
| 3495 | if (!(rold->live & REG_LIVE_READ)) | 4162 | if (!(rold->live & REG_LIVE_READ)) |
| 3496 | /* explored state didn't use this */ | 4163 | /* explored state didn't use this */ |
| 3497 | return true; | 4164 | return true; |
| 3498 | 4165 | ||
| 3499 | if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, live)) == 0) | 4166 | equal = memcmp(rold, rcur, offsetof(struct bpf_reg_state, frameno)) == 0; |
| 4167 | |||
| 4168 | if (rold->type == PTR_TO_STACK) | ||
| 4169 | /* two stack pointers are equal only if they're pointing to | ||
| 4170 | * the same stack frame, since fp-8 in foo != fp-8 in bar | ||
| 4171 | */ | ||
| 4172 | return equal && rold->frameno == rcur->frameno; | ||
| 4173 | |||
| 4174 | if (equal) | ||
| 3500 | return true; | 4175 | return true; |
| 3501 | 4176 | ||
| 3502 | if (rold->type == NOT_INIT) | 4177 | if (rold->type == NOT_INIT) |
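
regsafe() now byte-compares registers only up to the frameno field and then, for PTR_TO_STACK, additionally requires the same frame number: fp-8 in one function has nothing to do with fp-8 in another. A compact illustration of that two-step comparison with a hypothetical, reduced register struct:

```c
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>

enum reg_type { SCALAR_VALUE, PTR_TO_STACK };

/* Reduced register state; 'frameno' and everything after it are
 * excluded from the byte-wise comparison, as in the hunk above.
 */
struct reg {
	enum reg_type type;
	int off;
	int frameno;
	int live;
};

static bool regsafe_sketch(const struct reg *old, const struct reg *cur)
{
	bool equal = memcmp(old, cur, offsetof(struct reg, frameno)) == 0;

	if (old->type == PTR_TO_STACK)
		/* identical stack pointers must live in the same frame */
		return equal && old->frameno == cur->frameno;

	return equal;
}

int main(void)
{
	struct reg a = { PTR_TO_STACK, -8, 0, 0 };
	struct reg b = { PTR_TO_STACK, -8, 1, 0 };   /* same offset, other frame */

	printf("same frame:  %d\n", regsafe_sketch(&a, &a));
	printf("other frame: %d\n", regsafe_sketch(&a, &b));
	return 0;
}
```

This is also why PTR_TO_STACK is removed from the exact-match switch cases further down: the memcmp alone is no longer sufficient for it.
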
| @@ -3568,7 +4243,6 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, | |||
| 3568 | tnum_in(rold->var_off, rcur->var_off); | 4243 | tnum_in(rold->var_off, rcur->var_off); |
| 3569 | case PTR_TO_CTX: | 4244 | case PTR_TO_CTX: |
| 3570 | case CONST_PTR_TO_MAP: | 4245 | case CONST_PTR_TO_MAP: |
| 3571 | case PTR_TO_STACK: | ||
| 3572 | case PTR_TO_PACKET_END: | 4246 | case PTR_TO_PACKET_END: |
| 3573 | /* Only valid matches are exact, which memcmp() above | 4247 | /* Only valid matches are exact, which memcmp() above |
| 3574 | * would have accepted | 4248 | * would have accepted |
| @@ -3583,8 +4257,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, | |||
| 3583 | return false; | 4257 | return false; |
| 3584 | } | 4258 | } |
| 3585 | 4259 | ||
| 3586 | static bool stacksafe(struct bpf_verifier_state *old, | 4260 | static bool stacksafe(struct bpf_func_state *old, |
| 3587 | struct bpf_verifier_state *cur, | 4261 | struct bpf_func_state *cur, |
| 3588 | struct idpair *idmap) | 4262 | struct idpair *idmap) |
| 3589 | { | 4263 | { |
| 3590 | int i, spi; | 4264 | int i, spi; |
| @@ -3602,8 +4276,19 @@ static bool stacksafe(struct bpf_verifier_state *old, | |||
| 3602 | for (i = 0; i < old->allocated_stack; i++) { | 4276 | for (i = 0; i < old->allocated_stack; i++) { |
| 3603 | spi = i / BPF_REG_SIZE; | 4277 | spi = i / BPF_REG_SIZE; |
| 3604 | 4278 | ||
| 4279 | if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ)) | ||
| 4280 | /* explored state didn't use this */ | ||
| 4281 | continue; | ||
| 4282 | |||
| 3605 | if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID) | 4283 | if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID) |
| 3606 | continue; | 4284 | continue; |
| 4285 | /* if old state was safe with misc data in the stack | ||
| 4286 | * it will be safe with zero-initialized stack. | ||
| 4287 | * The opposite is not true | ||
| 4288 | */ | ||
| 4289 | if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_MISC && | ||
| 4290 | cur->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_ZERO) | ||
| 4291 | continue; | ||
| 3607 | if (old->stack[spi].slot_type[i % BPF_REG_SIZE] != | 4292 | if (old->stack[spi].slot_type[i % BPF_REG_SIZE] != |
| 3608 | cur->stack[spi].slot_type[i % BPF_REG_SIZE]) | 4293 | cur->stack[spi].slot_type[i % BPF_REG_SIZE]) |
| 3609 | /* Ex: old explored (safe) state has STACK_SPILL in | 4294 | /* Ex: old explored (safe) state has STACK_SPILL in |
| @@ -3660,9 +4345,8 @@ static bool stacksafe(struct bpf_verifier_state *old, | |||
| 3660 | * whereas register type in current state is meaningful, it means that | 4345 | * whereas register type in current state is meaningful, it means that |
| 3661 | * the current state will reach 'bpf_exit' instruction safely | 4346 | * the current state will reach 'bpf_exit' instruction safely |
| 3662 | */ | 4347 | */ |
| 3663 | static bool states_equal(struct bpf_verifier_env *env, | 4348 | static bool func_states_equal(struct bpf_func_state *old, |
| 3664 | struct bpf_verifier_state *old, | 4349 | struct bpf_func_state *cur) |
| 3665 | struct bpf_verifier_state *cur) | ||
| 3666 | { | 4350 | { |
| 3667 | struct idpair *idmap; | 4351 | struct idpair *idmap; |
| 3668 | bool ret = false; | 4352 | bool ret = false; |
| @@ -3686,71 +4370,72 @@ out_free: | |||
| 3686 | return ret; | 4370 | return ret; |
| 3687 | } | 4371 | } |
| 3688 | 4372 | ||
| 4373 | static bool states_equal(struct bpf_verifier_env *env, | ||
| 4374 | struct bpf_verifier_state *old, | ||
| 4375 | struct bpf_verifier_state *cur) | ||
| 4376 | { | ||
| 4377 | int i; | ||
| 4378 | |||
| 4379 | if (old->curframe != cur->curframe) | ||
| 4380 | return false; | ||
| 4381 | |||
| 4382 | /* for states to be equal callsites have to be the same | ||
| 4383 | * and all frame states need to be equivalent | ||
| 4384 | */ | ||
| 4385 | for (i = 0; i <= old->curframe; i++) { | ||
| 4386 | if (old->frame[i]->callsite != cur->frame[i]->callsite) | ||
| 4387 | return false; | ||
| 4388 | if (!func_states_equal(old->frame[i], cur->frame[i])) | ||
| 4389 | return false; | ||
| 4390 | } | ||
| 4391 | return true; | ||
| 4392 | } | ||
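
With call frames, two verifier states are interchangeable for pruning only if they sit at the same call depth, their callsite chains match, and every per-frame state is equivalent. The control flow of the new states_equal() reduces to the loop below (the per-frame register/stack comparison is stubbed out as a hypothetical func_states_equal()):

```c
#include <stdbool.h>
#include <stdio.h>

#define MAX_FRAMES 8

/* Reduced per-frame state: just the callsite for this sketch */
struct func_state { int callsite; };

struct verifier_state {
	int curframe;
	struct func_state frame[MAX_FRAMES];
};

/* Stand-in for the real per-frame register/stack comparison */
static bool func_states_equal(const struct func_state *a,
			      const struct func_state *b)
{
	(void)a; (void)b;      /* registers/stack assumed to match here */
	return true;
}

static bool states_equal(const struct verifier_state *old,
			 const struct verifier_state *cur)
{
	if (old->curframe != cur->curframe)
		return false;
	/* equal states must share the whole callsite chain */
	for (int i = 0; i <= old->curframe; i++) {
		if (old->frame[i].callsite != cur->frame[i].callsite)
			return false;
		if (!func_states_equal(&old->frame[i], &cur->frame[i]))
			return false;
	}
	return true;
}

int main(void)
{
	struct verifier_state a = { 1, { { 10 }, { 42 } } };
	struct verifier_state b = { 1, { { 10 }, { 99 } } };  /* called from elsewhere */

	printf("same chain:      %d\n", states_equal(&a, &a));
	printf("different chain: %d\n", states_equal(&a, &b));
	return 0;
}
```
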
| 4393 | |||
| 3689 | /* A write screens off any subsequent reads; but write marks come from the | 4394 | /* A write screens off any subsequent reads; but write marks come from the |
| 3690 | * straight-line code between a state and its parent. When we arrive at a | 4395 | * straight-line code between a state and its parent. When we arrive at an |
| 3691 | * jump target (in the first iteration of the propagate_liveness() loop), | 4396 | * equivalent state (jump target or such) we didn't arrive by the straight-line |
| 3692 | * we didn't arrive by the straight-line code, so read marks in state must | 4397 | * code, so read marks in the state must propagate to the parent regardless |
| 3693 | * propagate to parent regardless of state's write marks. | 4398 | * of the state's write marks. That's what 'parent == state->parent' comparison |
| 4399 | * in mark_reg_read() and mark_stack_slot_read() is for. | ||
| 3694 | */ | 4400 | */ |
| 3695 | static bool do_propagate_liveness(const struct bpf_verifier_state *state, | 4401 | static int propagate_liveness(struct bpf_verifier_env *env, |
| 3696 | struct bpf_verifier_state *parent) | 4402 | const struct bpf_verifier_state *vstate, |
| 4403 | struct bpf_verifier_state *vparent) | ||
| 3697 | { | 4404 | { |
| 3698 | bool writes = parent == state->parent; /* Observe write marks */ | 4405 | int i, frame, err = 0; |
| 3699 | bool touched = false; /* any changes made? */ | 4406 | struct bpf_func_state *state, *parent; |
| 3700 | int i; | ||
| 3701 | 4407 | ||
| 3702 | if (!parent) | 4408 | if (vparent->curframe != vstate->curframe) { |
| 3703 | return touched; | 4409 | WARN(1, "propagate_live: parent frame %d current frame %d\n", |
| 4410 | vparent->curframe, vstate->curframe); | ||
| 4411 | return -EFAULT; | ||
| 4412 | } | ||
| 3704 | /* Propagate read liveness of registers... */ | 4413 | /* Propagate read liveness of registers... */ |
| 3705 | BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG); | 4414 | BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG); |
| 3706 | /* We don't need to worry about FP liveness because it's read-only */ | 4415 | /* We don't need to worry about FP liveness because it's read-only */ |
| 3707 | for (i = 0; i < BPF_REG_FP; i++) { | 4416 | for (i = 0; i < BPF_REG_FP; i++) { |
| 3708 | if (parent->regs[i].live & REG_LIVE_READ) | 4417 | if (vparent->frame[vparent->curframe]->regs[i].live & REG_LIVE_READ) |
| 3709 | continue; | 4418 | continue; |
| 3710 | if (writes && (state->regs[i].live & REG_LIVE_WRITTEN)) | 4419 | if (vstate->frame[vstate->curframe]->regs[i].live & REG_LIVE_READ) { |
| 3711 | continue; | 4420 | err = mark_reg_read(env, vstate, vparent, i); |
| 3712 | if (state->regs[i].live & REG_LIVE_READ) { | 4421 | if (err) |
| 3713 | parent->regs[i].live |= REG_LIVE_READ; | 4422 | return err; |
| 3714 | touched = true; | ||
| 3715 | } | 4423 | } |
| 3716 | } | 4424 | } |
| 4425 | |||
| 3717 | /* ... and stack slots */ | 4426 | /* ... and stack slots */ |
| 3718 | for (i = 0; i < state->allocated_stack / BPF_REG_SIZE && | 4427 | for (frame = 0; frame <= vstate->curframe; frame++) { |
| 3719 | i < parent->allocated_stack / BPF_REG_SIZE; i++) { | 4428 | state = vstate->frame[frame]; |
| 3720 | if (parent->stack[i].slot_type[0] != STACK_SPILL) | 4429 | parent = vparent->frame[frame]; |
| 3721 | continue; | 4430 | for (i = 0; i < state->allocated_stack / BPF_REG_SIZE && |
| 3722 | if (state->stack[i].slot_type[0] != STACK_SPILL) | 4431 | i < parent->allocated_stack / BPF_REG_SIZE; i++) { |
| 3723 | continue; | 4432 | if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ) |
| 3724 | if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ) | 4433 | continue; |
| 3725 | continue; | 4434 | if (state->stack[i].spilled_ptr.live & REG_LIVE_READ) |
| 3726 | if (writes && | 4435 | mark_stack_slot_read(env, vstate, vparent, i, frame); |
| 3727 | (state->stack[i].spilled_ptr.live & REG_LIVE_WRITTEN)) | ||
| 3728 | continue; | ||
| 3729 | if (state->stack[i].spilled_ptr.live & REG_LIVE_READ) { | ||
| 3730 | parent->stack[i].spilled_ptr.live |= REG_LIVE_READ; | ||
| 3731 | touched = true; | ||
| 3732 | } | 4436 | } |
| 3733 | } | 4437 | } |
| 3734 | return touched; | 4438 | return err; |
| 3735 | } | ||
| 3736 | |||
| 3737 | /* "parent" is "a state from which we reach the current state", but initially | ||
| 3738 | * it is not the state->parent (i.e. "the state whose straight-line code leads | ||
| 3739 | * to the current state"), instead it is the state that happened to arrive at | ||
| 3740 | * a (prunable) equivalent of the current state. See comment above | ||
| 3741 | * do_propagate_liveness() for consequences of this. | ||
| 3742 | * This function is just a more efficient way of calling mark_reg_read() or | ||
| 3743 | * mark_stack_slot_read() on each reg in "parent" that is read in "state", | ||
| 3744 | * though it requires that parent != state->parent in the call arguments. | ||
| 3745 | */ | ||
| 3746 | static void propagate_liveness(const struct bpf_verifier_state *state, | ||
| 3747 | struct bpf_verifier_state *parent) | ||
| 3748 | { | ||
| 3749 | while (do_propagate_liveness(state, parent)) { | ||
| 3750 | /* Something changed, so we need to feed those changes onward */ | ||
| 3751 | state = parent; | ||
| 3752 | parent = state->parent; | ||
| 3753 | } | ||
| 3754 | } | 4439 | } |
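
propagate_liveness() now pushes read marks from the pruned-away state into the matching registers and stack slots of every frame of the parent chain, delegating the upward walk to mark_reg_read()/mark_stack_slot_read() instead of the old fixed-point loop. A toy version of the register half, with a plain bit flag standing in for REG_LIVE_READ:

```c
#include <stdio.h>

#define NR_REGS        10   /* r0..r9; fp is read-only and skipped */
#define REG_LIVE_READ  0x1

struct frame { int live[NR_REGS]; };

/* If the discarded state read a register that the parent has no read
 * mark for yet, the parent (and its ancestors, in the real verifier)
 * must learn about that read.
 */
static void propagate_regs(const struct frame *state, struct frame *parent)
{
	for (int i = 0; i < NR_REGS; i++) {
		if (parent->live[i] & REG_LIVE_READ)
			continue;                    /* already known */
		if (state->live[i] & REG_LIVE_READ)
			parent->live[i] |= REG_LIVE_READ;
	}
}

int main(void)
{
	struct frame parent = { { 0 } };
	struct frame state  = { { 0 } };

	state.live[2] = REG_LIVE_READ;               /* pruned state read r2 */
	propagate_regs(&state, &parent);
	printf("parent r2 read mark: %d\n", parent.live[2] & REG_LIVE_READ);
	return 0;
}
```
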
| 3755 | 4440 | ||
| 3756 | static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) | 4441 | static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) |
| @@ -3758,7 +4443,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) | |||
| 3758 | struct bpf_verifier_state_list *new_sl; | 4443 | struct bpf_verifier_state_list *new_sl; |
| 3759 | struct bpf_verifier_state_list *sl; | 4444 | struct bpf_verifier_state_list *sl; |
| 3760 | struct bpf_verifier_state *cur = env->cur_state; | 4445 | struct bpf_verifier_state *cur = env->cur_state; |
| 3761 | int i, err; | 4446 | int i, j, err; |
| 3762 | 4447 | ||
| 3763 | sl = env->explored_states[insn_idx]; | 4448 | sl = env->explored_states[insn_idx]; |
| 3764 | if (!sl) | 4449 | if (!sl) |
| @@ -3779,7 +4464,9 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) | |||
| 3779 | * they'll be immediately forgotten as we're pruning | 4464 | * they'll be immediately forgotten as we're pruning |
| 3780 | * this state and will pop a new one. | 4465 | * this state and will pop a new one. |
| 3781 | */ | 4466 | */ |
| 3782 | propagate_liveness(&sl->state, cur); | 4467 | err = propagate_liveness(env, &sl->state, cur); |
| 4468 | if (err) | ||
| 4469 | return err; | ||
| 3783 | return 1; | 4470 | return 1; |
| 3784 | } | 4471 | } |
| 3785 | sl = sl->next; | 4472 | sl = sl->next; |
| @@ -3787,9 +4474,10 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) | |||
| 3787 | 4474 | ||
| 3788 | /* there were no equivalent states, remember current one. | 4475 | /* there were no equivalent states, remember current one. |
| 3789 | * technically the current state is not proven to be safe yet, | 4476 | * technically the current state is not proven to be safe yet, |
| 3790 | * but it will either reach bpf_exit (which means it's safe) or | 4477 | * but it will either reach outer most bpf_exit (which means it's safe) |
| 3791 | * it will be rejected. Since there are no loops, we won't be | 4478 | * or it will be rejected. Since there are no loops, we won't be |
| 3792 | * seeing this 'insn_idx' instruction again on the way to bpf_exit | 4479 | * seeing this tuple (frame[0].callsite, frame[1].callsite, .. insn_idx) |
| 4480 | * again on the way to bpf_exit | ||
| 3793 | */ | 4481 | */ |
| 3794 | new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL); | 4482 | new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL); |
| 3795 | if (!new_sl) | 4483 | if (!new_sl) |
| @@ -3813,19 +4501,15 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) | |||
| 3813 | * explored_states can get read marks.) | 4501 | * explored_states can get read marks.) |
| 3814 | */ | 4502 | */ |
| 3815 | for (i = 0; i < BPF_REG_FP; i++) | 4503 | for (i = 0; i < BPF_REG_FP; i++) |
| 3816 | cur->regs[i].live = REG_LIVE_NONE; | 4504 | cur->frame[cur->curframe]->regs[i].live = REG_LIVE_NONE; |
| 3817 | for (i = 0; i < cur->allocated_stack / BPF_REG_SIZE; i++) | ||
| 3818 | if (cur->stack[i].slot_type[0] == STACK_SPILL) | ||
| 3819 | cur->stack[i].spilled_ptr.live = REG_LIVE_NONE; | ||
| 3820 | return 0; | ||
| 3821 | } | ||
| 3822 | 4505 | ||
| 3823 | static int ext_analyzer_insn_hook(struct bpf_verifier_env *env, | 4506 | /* all stack frames are accessible from callee, clear them all */ |
| 3824 | int insn_idx, int prev_insn_idx) | 4507 | for (j = 0; j <= cur->curframe; j++) { |
| 3825 | { | 4508 | struct bpf_func_state *frame = cur->frame[j]; |
| 3826 | if (env->dev_ops && env->dev_ops->insn_hook) | ||
| 3827 | return env->dev_ops->insn_hook(env, insn_idx, prev_insn_idx); | ||
| 3828 | 4509 | ||
| 4510 | for (i = 0; i < frame->allocated_stack / BPF_REG_SIZE; i++) | ||
| 4511 | frame->stack[i].spilled_ptr.live = REG_LIVE_NONE; | ||
| 4512 | } | ||
| 3829 | return 0; | 4513 | return 0; |
| 3830 | } | 4514 | } |
| 3831 | 4515 | ||
| @@ -3834,7 +4518,7 @@ static int do_check(struct bpf_verifier_env *env) | |||
| 3834 | struct bpf_verifier_state *state; | 4518 | struct bpf_verifier_state *state; |
| 3835 | struct bpf_insn *insns = env->prog->insnsi; | 4519 | struct bpf_insn *insns = env->prog->insnsi; |
| 3836 | struct bpf_reg_state *regs; | 4520 | struct bpf_reg_state *regs; |
| 3837 | int insn_cnt = env->prog->len; | 4521 | int insn_cnt = env->prog->len, i; |
| 3838 | int insn_idx, prev_insn_idx = 0; | 4522 | int insn_idx, prev_insn_idx = 0; |
| 3839 | int insn_processed = 0; | 4523 | int insn_processed = 0; |
| 3840 | bool do_print_state = false; | 4524 | bool do_print_state = false; |
| @@ -3842,9 +4526,18 @@ static int do_check(struct bpf_verifier_env *env) | |||
| 3842 | state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL); | 4526 | state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL); |
| 3843 | if (!state) | 4527 | if (!state) |
| 3844 | return -ENOMEM; | 4528 | return -ENOMEM; |
| 3845 | env->cur_state = state; | 4529 | state->curframe = 0; |
| 3846 | init_reg_state(env, state->regs); | ||
| 3847 | state->parent = NULL; | 4530 | state->parent = NULL; |
| 4531 | state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL); | ||
| 4532 | if (!state->frame[0]) { | ||
| 4533 | kfree(state); | ||
| 4534 | return -ENOMEM; | ||
| 4535 | } | ||
| 4536 | env->cur_state = state; | ||
| 4537 | init_func_state(env, state->frame[0], | ||
| 4538 | BPF_MAIN_FUNC /* callsite */, | ||
| 4539 | 0 /* frameno */, | ||
| 4540 | 0 /* subprogno, zero == main subprog */); | ||
| 3848 | insn_idx = 0; | 4541 | insn_idx = 0; |
| 3849 | for (;;) { | 4542 | for (;;) { |
| 3850 | struct bpf_insn *insn; | 4543 | struct bpf_insn *insn; |
| @@ -3891,19 +4584,25 @@ static int do_check(struct bpf_verifier_env *env) | |||
| 3891 | else | 4584 | else |
| 3892 | verbose(env, "\nfrom %d to %d:", | 4585 | verbose(env, "\nfrom %d to %d:", |
| 3893 | prev_insn_idx, insn_idx); | 4586 | prev_insn_idx, insn_idx); |
| 3894 | print_verifier_state(env, state); | 4587 | print_verifier_state(env, state->frame[state->curframe]); |
| 3895 | do_print_state = false; | 4588 | do_print_state = false; |
| 3896 | } | 4589 | } |
| 3897 | 4590 | ||
| 3898 | if (env->log.level) { | 4591 | if (env->log.level) { |
| 4592 | const struct bpf_insn_cbs cbs = { | ||
| 4593 | .cb_print = verbose, | ||
| 4594 | }; | ||
| 4595 | |||
| 3899 | verbose(env, "%d: ", insn_idx); | 4596 | verbose(env, "%d: ", insn_idx); |
| 3900 | print_bpf_insn(verbose, env, insn, | 4597 | print_bpf_insn(&cbs, env, insn, env->allow_ptr_leaks); |
| 3901 | env->allow_ptr_leaks); | ||
| 3902 | } | 4598 | } |
| 3903 | 4599 | ||
| 3904 | err = ext_analyzer_insn_hook(env, insn_idx, prev_insn_idx); | 4600 | if (bpf_prog_is_dev_bound(env->prog->aux)) { |
| 3905 | if (err) | 4601 | err = bpf_prog_offload_verify_insn(env, insn_idx, |
| 3906 | return err; | 4602 | prev_insn_idx); |
| 4603 | if (err) | ||
| 4604 | return err; | ||
| 4605 | } | ||
| 3907 | 4606 | ||
| 3908 | regs = cur_regs(env); | 4607 | regs = cur_regs(env); |
| 3909 | env->insn_aux_data[insn_idx].seen = true; | 4608 | env->insn_aux_data[insn_idx].seen = true; |
| @@ -4030,13 +4729,17 @@ static int do_check(struct bpf_verifier_env *env) | |||
| 4030 | if (opcode == BPF_CALL) { | 4729 | if (opcode == BPF_CALL) { |
| 4031 | if (BPF_SRC(insn->code) != BPF_K || | 4730 | if (BPF_SRC(insn->code) != BPF_K || |
| 4032 | insn->off != 0 || | 4731 | insn->off != 0 || |
| 4033 | insn->src_reg != BPF_REG_0 || | 4732 | (insn->src_reg != BPF_REG_0 && |
| 4733 | insn->src_reg != BPF_PSEUDO_CALL) || | ||
| 4034 | insn->dst_reg != BPF_REG_0) { | 4734 | insn->dst_reg != BPF_REG_0) { |
| 4035 | verbose(env, "BPF_CALL uses reserved fields\n"); | 4735 | verbose(env, "BPF_CALL uses reserved fields\n"); |
| 4036 | return -EINVAL; | 4736 | return -EINVAL; |
| 4037 | } | 4737 | } |
| 4038 | 4738 | ||
| 4039 | err = check_call(env, insn->imm, insn_idx); | 4739 | if (insn->src_reg == BPF_PSEUDO_CALL) |
| 4740 | err = check_func_call(env, insn, &insn_idx); | ||
| 4741 | else | ||
| 4742 | err = check_helper_call(env, insn->imm, insn_idx); | ||
| 4040 | if (err) | 4743 | if (err) |
| 4041 | return err; | 4744 | return err; |
| 4042 | 4745 | ||
| @@ -4061,6 +4764,16 @@ static int do_check(struct bpf_verifier_env *env) | |||
| 4061 | return -EINVAL; | 4764 | return -EINVAL; |
| 4062 | } | 4765 | } |
| 4063 | 4766 | ||
| 4767 | if (state->curframe) { | ||
| 4768 | /* exit from nested function */ | ||
| 4769 | prev_insn_idx = insn_idx; | ||
| 4770 | err = prepare_func_exit(env, &insn_idx); | ||
| 4771 | if (err) | ||
| 4772 | return err; | ||
| 4773 | do_print_state = true; | ||
| 4774 | continue; | ||
| 4775 | } | ||
| 4776 | |||
| 4064 | /* eBPF calling convention is such that R0 is used | 4777 | /* eBPF calling convention is such that R0 is used |
| 4065 | * to return the value from eBPF program. | 4778 | * to return the value from eBPF program. |
| 4066 | * Make sure that it's readable at this time | 4779 | * Make sure that it's readable at this time |
| @@ -4121,8 +4834,17 @@ process_bpf_exit: | |||
| 4121 | insn_idx++; | 4834 | insn_idx++; |
| 4122 | } | 4835 | } |
| 4123 | 4836 | ||
| 4124 | verbose(env, "processed %d insns, stack depth %d\n", insn_processed, | 4837 | verbose(env, "processed %d insns (limit %d), stack depth ", |
| 4125 | env->prog->aux->stack_depth); | 4838 | insn_processed, BPF_COMPLEXITY_LIMIT_INSNS); |
| 4839 | for (i = 0; i < env->subprog_cnt + 1; i++) { | ||
| 4840 | u32 depth = env->subprog_stack_depth[i]; | ||
| 4841 | |||
| 4842 | verbose(env, "%d", depth); | ||
| 4843 | if (i + 1 < env->subprog_cnt + 1) | ||
| 4844 | verbose(env, "+"); | ||
| 4845 | } | ||
| 4846 | verbose(env, "\n"); | ||
| 4847 | env->prog->aux->stack_depth = env->subprog_stack_depth[0]; | ||
| 4126 | return 0; | 4848 | return 0; |
| 4127 | } | 4849 | } |
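
The final log line now reports one stack depth per subprogram joined with '+', e.g. "stack depth 64+0+32". The formatting loop in the hunk is equivalent to this stand-alone snippet (the array contents are made up):

```c
#include <stdio.h>

int main(void)
{
	/* hypothetical per-subprog stack depths: main prog + 2 callees */
	unsigned int depth[] = { 64, 0, 32 };
	int n = sizeof(depth) / sizeof(depth[0]);

	printf("stack depth ");
	for (int i = 0; i < n; i++) {
		printf("%u", depth[i]);
		if (i + 1 < n)
			printf("+");   /* separator between subprogs */
	}
	printf("\n");
	return 0;
}
```
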
| 4128 | 4850 | ||
| @@ -4155,6 +4877,13 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, | |||
| 4155 | return -EINVAL; | 4877 | return -EINVAL; |
| 4156 | } | 4878 | } |
| 4157 | } | 4879 | } |
| 4880 | |||
| 4881 | if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) && | ||
| 4882 | !bpf_offload_dev_match(prog, map)) { | ||
| 4883 | verbose(env, "offload device mismatch between prog and map\n"); | ||
| 4884 | return -EINVAL; | ||
| 4885 | } | ||
| 4886 | |||
| 4158 | return 0; | 4887 | return 0; |
| 4159 | } | 4888 | } |
| 4160 | 4889 | ||
| @@ -4252,6 +4981,13 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) | |||
| 4252 | next_insn: | 4981 | next_insn: |
| 4253 | insn++; | 4982 | insn++; |
| 4254 | i++; | 4983 | i++; |
| 4984 | continue; | ||
| 4985 | } | ||
| 4986 | |||
| 4987 | /* Basic sanity check before we invest more work here. */ | ||
| 4988 | if (!bpf_opcode_in_insntable(insn->code)) { | ||
| 4989 | verbose(env, "unknown opcode %02x\n", insn->code); | ||
| 4990 | return -EINVAL; | ||
| 4255 | } | 4991 | } |
| 4256 | } | 4992 | } |
| 4257 | 4993 | ||
| @@ -4308,6 +5044,19 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len, | |||
| 4308 | return 0; | 5044 | return 0; |
| 4309 | } | 5045 | } |
| 4310 | 5046 | ||
| 5047 | static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len) | ||
| 5048 | { | ||
| 5049 | int i; | ||
| 5050 | |||
| 5051 | if (len == 1) | ||
| 5052 | return; | ||
| 5053 | for (i = 0; i < env->subprog_cnt; i++) { | ||
| 5054 | if (env->subprog_starts[i] < off) | ||
| 5055 | continue; | ||
| 5056 | env->subprog_starts[i] += len - 1; | ||
| 5057 | } | ||
| 5058 | } | ||
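
When bpf_patch_insn_data() replaces one instruction with len instructions, every recorded subprogram start at or after the patch offset must move down by len - 1. A user-space rendering of that bookkeeping:

```c
#include <stdio.h>

/* Shift subprog start offsets that sit at or after the patched
 * instruction by the number of newly inserted instructions.
 */
static void adjust_starts(int *starts, int cnt, int off, int len)
{
	if (len == 1)           /* 1-for-1 replacement: nothing moved */
		return;
	for (int i = 0; i < cnt; i++) {
		if (starts[i] < off)
			continue;
		starts[i] += len - 1;
	}
}

int main(void)
{
	int starts[] = { 10, 25, 40 };

	/* patch at insn 20, replacing 1 insn with 3 */
	adjust_starts(starts, 3, 20, 3);
	printf("%d %d %d\n", starts[0], starts[1], starts[2]);  /* 10 27 42 */
	return 0;
}
```
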
| 5059 | |||
| 4311 | static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off, | 5060 | static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off, |
| 4312 | const struct bpf_insn *patch, u32 len) | 5061 | const struct bpf_insn *patch, u32 len) |
| 4313 | { | 5062 | { |
| @@ -4318,17 +5067,25 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of | |||
| 4318 | return NULL; | 5067 | return NULL; |
| 4319 | if (adjust_insn_aux_data(env, new_prog->len, off, len)) | 5068 | if (adjust_insn_aux_data(env, new_prog->len, off, len)) |
| 4320 | return NULL; | 5069 | return NULL; |
| 5070 | adjust_subprog_starts(env, off, len); | ||
| 4321 | return new_prog; | 5071 | return new_prog; |
| 4322 | } | 5072 | } |
| 4323 | 5073 | ||
| 4324 | /* The verifier does more data flow analysis than llvm and will not explore | 5074 | /* The verifier does more data flow analysis than llvm and will not |
| 4325 | * branches that are dead at run time. Malicious programs can have dead code | 5075 | * explore branches that are dead at run time. Malicious programs can |
| 4326 | * too. Therefore replace all dead at-run-time code with nops. | 5076 | * have dead code too. Therefore replace all dead at-run-time code |
| 5077 | * with 'ja -1'. | ||
| 5078 | * | ||
| 5079 | * Just nops are not optimal, e.g. if they would sit at the end of the | ||
| 5080 | * program and through another bug we would manage to jump there, then | ||
| 5081 | * we'd execute beyond program memory otherwise. Returning exception | ||
| 5082 | * code also wouldn't work since we can have subprogs where the dead | ||
| 5083 | * code could be located. | ||
| 4327 | */ | 5084 | */ |
| 4328 | static void sanitize_dead_code(struct bpf_verifier_env *env) | 5085 | static void sanitize_dead_code(struct bpf_verifier_env *env) |
| 4329 | { | 5086 | { |
| 4330 | struct bpf_insn_aux_data *aux_data = env->insn_aux_data; | 5087 | struct bpf_insn_aux_data *aux_data = env->insn_aux_data; |
| 4331 | struct bpf_insn nop = BPF_MOV64_REG(BPF_REG_0, BPF_REG_0); | 5088 | struct bpf_insn trap = BPF_JMP_IMM(BPF_JA, 0, 0, -1); |
| 4332 | struct bpf_insn *insn = env->prog->insnsi; | 5089 | struct bpf_insn *insn = env->prog->insnsi; |
| 4333 | const int insn_cnt = env->prog->len; | 5090 | const int insn_cnt = env->prog->len; |
| 4334 | int i; | 5091 | int i; |
| @@ -4336,7 +5093,7 @@ static void sanitize_dead_code(struct bpf_verifier_env *env) | |||
| 4336 | for (i = 0; i < insn_cnt; i++) { | 5093 | for (i = 0; i < insn_cnt; i++) { |
| 4337 | if (aux_data[i].seen) | 5094 | if (aux_data[i].seen) |
| 4338 | continue; | 5095 | continue; |
| 4339 | memcpy(insn + i, &nop, sizeof(nop)); | 5096 | memcpy(insn + i, &trap, sizeof(trap)); |
| 4340 | } | 5097 | } |
| 4341 | } | 5098 | } |
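
Dead instructions are now overwritten with an unconditional backward jump ('ja -1') rather than a mov-based nop, so that accidentally reaching patched-out code spins in place instead of running off the end of the program or into a subprog. A minimal sketch of the trap encoding with a hand-rolled instruction struct (the layout mirrors the idea of struct bpf_insn, but the struct and opcode constants here are defined locally for illustration):

```c
#include <stdint.h>
#include <stdio.h>

/* Hand-rolled instruction layout: opcode, two 4-bit register fields,
 * 16-bit jump offset, 32-bit immediate.
 */
struct insn {
	uint8_t  code;
	uint8_t  dst_reg:4;
	uint8_t  src_reg:4;
	int16_t  off;
	int32_t  imm;
};

#define OP_JMP   0x05   /* jump instruction class */
#define OP_JA    0x00   /* unconditional jump */

int main(void)
{
	/* 'ja -1': jump back onto itself, so stray execution loops
	 * inside the dead region instead of escaping it.
	 */
	struct insn trap = { .code = OP_JMP | OP_JA, .off = -1, .imm = 0 };

	printf("trap: code=0x%02x off=%d\n", trap.code, trap.off);
	return 0;
}
```
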
| 4342 | 5099 | ||
| @@ -4452,6 +5209,180 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) | |||
| 4452 | return 0; | 5209 | return 0; |
| 4453 | } | 5210 | } |
| 4454 | 5211 | ||
| 5212 | static int jit_subprogs(struct bpf_verifier_env *env) | ||
| 5213 | { | ||
| 5214 | struct bpf_prog *prog = env->prog, **func, *tmp; | ||
| 5215 | int i, j, subprog_start, subprog_end = 0, len, subprog; | ||
| 5216 | struct bpf_insn *insn; | ||
| 5217 | void *old_bpf_func; | ||
| 5218 | int err = -ENOMEM; | ||
| 5219 | |||
| 5220 | if (env->subprog_cnt == 0) | ||
| 5221 | return 0; | ||
| 5222 | |||
| 5223 | for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { | ||
| 5224 | if (insn->code != (BPF_JMP | BPF_CALL) || | ||
| 5225 | insn->src_reg != BPF_PSEUDO_CALL) | ||
| 5226 | continue; | ||
| 5227 | subprog = find_subprog(env, i + insn->imm + 1); | ||
| 5228 | if (subprog < 0) { | ||
| 5229 | WARN_ONCE(1, "verifier bug. No program starts at insn %d\n", | ||
| 5230 | i + insn->imm + 1); | ||
| 5231 | return -EFAULT; | ||
| 5232 | } | ||
| 5233 | /* temporarily remember subprog id inside insn instead of | ||
| 5234 | * aux_data, since next loop will split up all insns into funcs | ||
| 5235 | */ | ||
| 5236 | insn->off = subprog + 1; | ||
| 5237 | /* remember original imm in case JIT fails and fallback | ||
| 5238 | * to interpreter will be needed | ||
| 5239 | */ | ||
| 5240 | env->insn_aux_data[i].call_imm = insn->imm; | ||
| 5241 | /* point imm to __bpf_call_base+1 from JITs point of view */ | ||
| 5242 | insn->imm = 1; | ||
| 5243 | } | ||
| 5244 | |||
| 5245 | func = kzalloc(sizeof(prog) * (env->subprog_cnt + 1), GFP_KERNEL); | ||
| 5246 | if (!func) | ||
| 5247 | return -ENOMEM; | ||
| 5248 | |||
| 5249 | for (i = 0; i <= env->subprog_cnt; i++) { | ||
| 5250 | subprog_start = subprog_end; | ||
| 5251 | if (env->subprog_cnt == i) | ||
| 5252 | subprog_end = prog->len; | ||
| 5253 | else | ||
| 5254 | subprog_end = env->subprog_starts[i]; | ||
| 5255 | |||
| 5256 | len = subprog_end - subprog_start; | ||
| 5257 | func[i] = bpf_prog_alloc(bpf_prog_size(len), GFP_USER); | ||
| 5258 | if (!func[i]) | ||
| 5259 | goto out_free; | ||
| 5260 | memcpy(func[i]->insnsi, &prog->insnsi[subprog_start], | ||
| 5261 | len * sizeof(struct bpf_insn)); | ||
| 5262 | func[i]->type = prog->type; | ||
| 5263 | func[i]->len = len; | ||
| 5264 | if (bpf_prog_calc_tag(func[i])) | ||
| 5265 | goto out_free; | ||
| 5266 | func[i]->is_func = 1; | ||
| 5267 | /* Use bpf_prog_F_tag to indicate functions in stack traces. | ||
| 5268 | * Long term would need debug info to populate names | ||
| 5269 | */ | ||
| 5270 | func[i]->aux->name[0] = 'F'; | ||
| 5271 | func[i]->aux->stack_depth = env->subprog_stack_depth[i]; | ||
| 5272 | func[i]->jit_requested = 1; | ||
| 5273 | func[i] = bpf_int_jit_compile(func[i]); | ||
| 5274 | if (!func[i]->jited) { | ||
| 5275 | err = -ENOTSUPP; | ||
| 5276 | goto out_free; | ||
| 5277 | } | ||
| 5278 | cond_resched(); | ||
| 5279 | } | ||
| 5280 | /* at this point all bpf functions were successfully JITed | ||
| 5281 | * now populate all bpf_calls with correct addresses and | ||
| 5282 | * run last pass of JIT | ||
| 5283 | */ | ||
| 5284 | for (i = 0; i <= env->subprog_cnt; i++) { | ||
| 5285 | insn = func[i]->insnsi; | ||
| 5286 | for (j = 0; j < func[i]->len; j++, insn++) { | ||
| 5287 | if (insn->code != (BPF_JMP | BPF_CALL) || | ||
| 5288 | insn->src_reg != BPF_PSEUDO_CALL) | ||
| 5289 | continue; | ||
| 5290 | subprog = insn->off; | ||
| 5291 | insn->off = 0; | ||
| 5292 | insn->imm = (u64 (*)(u64, u64, u64, u64, u64)) | ||
| 5293 | func[subprog]->bpf_func - | ||
| 5294 | __bpf_call_base; | ||
| 5295 | } | ||
| 5296 | } | ||
| 5297 | for (i = 0; i <= env->subprog_cnt; i++) { | ||
| 5298 | old_bpf_func = func[i]->bpf_func; | ||
| 5299 | tmp = bpf_int_jit_compile(func[i]); | ||
| 5300 | if (tmp != func[i] || func[i]->bpf_func != old_bpf_func) { | ||
| 5301 | verbose(env, "JIT doesn't support bpf-to-bpf calls\n"); | ||
| 5302 | err = -EFAULT; | ||
| 5303 | goto out_free; | ||
| 5304 | } | ||
| 5305 | cond_resched(); | ||
| 5306 | } | ||
| 5307 | |||
| 5308 | /* finally lock prog and jit images for all functions and | ||
| 5309 | * populate kallsyms | ||
| 5310 | */ | ||
| 5311 | for (i = 0; i <= env->subprog_cnt; i++) { | ||
| 5312 | bpf_prog_lock_ro(func[i]); | ||
| 5313 | bpf_prog_kallsyms_add(func[i]); | ||
| 5314 | } | ||
| 5315 | |||
| 5316 | /* Last step: make now unused interpreter insns from main | ||
| 5317 | * prog consistent for later dump requests, so they can | ||
| 5318 | * later look the same as if they were interpreted only. | ||
| 5319 | */ | ||
| 5320 | for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { | ||
| 5321 | unsigned long addr; | ||
| 5322 | |||
| 5323 | if (insn->code != (BPF_JMP | BPF_CALL) || | ||
| 5324 | insn->src_reg != BPF_PSEUDO_CALL) | ||
| 5325 | continue; | ||
| 5326 | insn->off = env->insn_aux_data[i].call_imm; | ||
| 5327 | subprog = find_subprog(env, i + insn->off + 1); | ||
| 5328 | addr = (unsigned long)func[subprog + 1]->bpf_func; | ||
| 5329 | addr &= PAGE_MASK; | ||
| 5330 | insn->imm = (u64 (*)(u64, u64, u64, u64, u64)) | ||
| 5331 | addr - __bpf_call_base; | ||
| 5332 | } | ||
| 5333 | |||
| 5334 | prog->jited = 1; | ||
| 5335 | prog->bpf_func = func[0]->bpf_func; | ||
| 5336 | prog->aux->func = func; | ||
| 5337 | prog->aux->func_cnt = env->subprog_cnt + 1; | ||
| 5338 | return 0; | ||
| 5339 | out_free: | ||
| 5340 | for (i = 0; i <= env->subprog_cnt; i++) | ||
| 5341 | if (func[i]) | ||
| 5342 | bpf_jit_free(func[i]); | ||
| 5343 | kfree(func); | ||
| 5344 | /* cleanup main prog to be interpreted */ | ||
| 5345 | prog->jit_requested = 0; | ||
| 5346 | for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { | ||
| 5347 | if (insn->code != (BPF_JMP | BPF_CALL) || | ||
| 5348 | insn->src_reg != BPF_PSEUDO_CALL) | ||
| 5349 | continue; | ||
| 5350 | insn->off = 0; | ||
| 5351 | insn->imm = env->insn_aux_data[i].call_imm; | ||
| 5352 | } | ||
| 5353 | return err; | ||
| 5354 | } | ||
| 5355 | |||
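jit_subprogs() above stashes each pseudo-call's target subprogram in insn->off, JITs every function body separately, and then rewrites insn->imm so the JIT can emit a direct call. A hedged sketch of that address arithmetic, using the convention visible in the hunk that call targets are encoded relative to __bpf_call_base:

```c
/* Sketch of the imm fixup: the JIT resolves a call to __bpf_call_base + imm,
 * so storing (subprog_image - __bpf_call_base) in imm makes the emitted call
 * land on the freshly JITed subprogram.
 */
#include <stdint.h>

static int32_t encode_call_imm(uintptr_t subprog_image, uintptr_t call_base)
{
	/* mirrors: insn->imm = func[subprog]->bpf_func - __bpf_call_base */
	return (int32_t)(subprog_image - call_base);
}

static uintptr_t resolve_call(int32_t imm, uintptr_t call_base)
{
	return call_base + (intptr_t)imm;   /* what the JIT calls at runtime */
}
```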
| 5356 | static int fixup_call_args(struct bpf_verifier_env *env) | ||
| 5357 | { | ||
| 5358 | #ifndef CONFIG_BPF_JIT_ALWAYS_ON | ||
| 5359 | struct bpf_prog *prog = env->prog; | ||
| 5360 | struct bpf_insn *insn = prog->insnsi; | ||
| 5361 | int i, depth; | ||
| 5362 | #endif | ||
| 5363 | int err; | ||
| 5364 | |||
| 5365 | err = 0; | ||
| 5366 | if (env->prog->jit_requested) { | ||
| 5367 | err = jit_subprogs(env); | ||
| 5368 | if (err == 0) | ||
| 5369 | return 0; | ||
| 5370 | } | ||
| 5371 | #ifndef CONFIG_BPF_JIT_ALWAYS_ON | ||
| 5372 | for (i = 0; i < prog->len; i++, insn++) { | ||
| 5373 | if (insn->code != (BPF_JMP | BPF_CALL) || | ||
| 5374 | insn->src_reg != BPF_PSEUDO_CALL) | ||
| 5375 | continue; | ||
| 5376 | depth = get_callee_stack_depth(env, insn, i); | ||
| 5377 | if (depth < 0) | ||
| 5378 | return depth; | ||
| 5379 | bpf_patch_call_args(insn, depth); | ||
| 5380 | } | ||
| 5381 | err = 0; | ||
| 5382 | #endif | ||
| 5383 | return err; | ||
| 5384 | } | ||
| 5385 | |||
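fixup_call_args() above prefers the JIT path; only when jit_subprogs() cannot be used, and only if an interpreter is built in (!CONFIG_BPF_JIT_ALWAYS_ON), does it fall back to patching each pseudo-call with the callee's stack depth via bpf_patch_call_args(). A stripped-down sketch of that decision, with the kernel specifics replaced by booleans:

```c
/* Sketch of the fallback policy, not the real control flow: JIT first,
 * interpreter patching second, otherwise give up.
 */
#include <stdbool.h>

enum call_fixup { CALLS_JITED, CALLS_PATCHED, CALLS_UNSUPPORTED };

static enum call_fixup fixup_policy(bool jit_requested, bool jit_succeeded,
				    bool interpreter_available)
{
	if (jit_requested && jit_succeeded)
		return CALLS_JITED;        /* jit_subprogs() handled everything */
	if (interpreter_available)
		return CALLS_PATCHED;      /* bpf_patch_call_args() per call site */
	return CALLS_UNSUPPORTED;          /* BPF_JIT_ALWAYS_ON with a failing JIT */
}
```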
| 4455 | /* fixup insn->imm field of bpf_call instructions | 5386 | /* fixup insn->imm field of bpf_call instructions |
| 4456 | * and inline eligible helpers as explicit sequence of BPF instructions | 5387 | * and inline eligible helpers as explicit sequence of BPF instructions |
| 4457 | * | 5388 | * |
| @@ -4469,15 +5400,37 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) | |||
| 4469 | int i, cnt, delta = 0; | 5400 | int i, cnt, delta = 0; |
| 4470 | 5401 | ||
| 4471 | for (i = 0; i < insn_cnt; i++, insn++) { | 5402 | for (i = 0; i < insn_cnt; i++, insn++) { |
| 4472 | if (insn->code == (BPF_ALU | BPF_MOD | BPF_X) || | 5403 | if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) || |
| 5404 | insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) || | ||
| 5405 | insn->code == (BPF_ALU | BPF_MOD | BPF_X) || | ||
| 4473 | insn->code == (BPF_ALU | BPF_DIV | BPF_X)) { | 5406 | insn->code == (BPF_ALU | BPF_DIV | BPF_X)) { |
| 4474 | /* due to JIT bugs clear upper 32-bits of src register | 5407 | bool is64 = BPF_CLASS(insn->code) == BPF_ALU64; |
| 4475 | * before div/mod operation | 5408 | struct bpf_insn mask_and_div[] = { |
| 4476 | */ | 5409 | BPF_MOV32_REG(insn->src_reg, insn->src_reg), |
| 4477 | insn_buf[0] = BPF_MOV32_REG(insn->src_reg, insn->src_reg); | 5410 | /* Rx div 0 -> 0 */ |
| 4478 | insn_buf[1] = *insn; | 5411 | BPF_JMP_IMM(BPF_JNE, insn->src_reg, 0, 2), |
| 4479 | cnt = 2; | 5412 | BPF_ALU32_REG(BPF_XOR, insn->dst_reg, insn->dst_reg), |
| 4480 | new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); | 5413 | BPF_JMP_IMM(BPF_JA, 0, 0, 1), |
| 5414 | *insn, | ||
| 5415 | }; | ||
| 5416 | struct bpf_insn mask_and_mod[] = { | ||
| 5417 | BPF_MOV32_REG(insn->src_reg, insn->src_reg), | ||
| 5418 | /* Rx mod 0 -> Rx */ | ||
| 5419 | BPF_JMP_IMM(BPF_JEQ, insn->src_reg, 0, 1), | ||
| 5420 | *insn, | ||
| 5421 | }; | ||
| 5422 | struct bpf_insn *patchlet; | ||
| 5423 | |||
| 5424 | if (insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) || | ||
| 5425 | insn->code == (BPF_ALU | BPF_DIV | BPF_X)) { | ||
| 5426 | patchlet = mask_and_div + (is64 ? 1 : 0); | ||
| 5427 | cnt = ARRAY_SIZE(mask_and_div) - (is64 ? 1 : 0); | ||
| 5428 | } else { | ||
| 5429 | patchlet = mask_and_mod + (is64 ? 1 : 0); | ||
| 5430 | cnt = ARRAY_SIZE(mask_and_mod) - (is64 ? 1 : 0); | ||
| 5431 | } | ||
| 5432 | |||
| 5433 | new_prog = bpf_patch_insn_data(env, i + delta, patchlet, cnt); | ||
| 4481 | if (!new_prog) | 5434 | if (!new_prog) |
| 4482 | return -ENOMEM; | 5435 | return -ENOMEM; |
| 4483 | 5436 | ||
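The patchlets above stop relying on the JIT for divide-by-zero handling: the verifier now emits an explicit runtime check so that a zero divisor yields 0 for div and leaves the dividend untouched for mod, and the 32-bit forms additionally clear the upper half of the source register first (the 64-bit forms drop that leading BPF_MOV32_REG). A plain-C model of the resulting semantics, not of the emitted instructions:

```c
/* Model of what the patched 32-bit sequences compute (sketch). */
#include <stdint.h>

static uint32_t bpf_div32(uint32_t dst, uint64_t src)
{
	uint32_t s = (uint32_t)src;   /* BPF_MOV32_REG: ignore the upper bits */

	return s ? dst / s : 0;       /* "Rx div 0 -> 0" */
}

static uint32_t bpf_mod32(uint32_t dst, uint64_t src)
{
	uint32_t s = (uint32_t)src;

	return s ? dst % s : dst;     /* "Rx mod 0 -> Rx" */
}
```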
| @@ -4489,11 +5442,15 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) | |||
| 4489 | 5442 | ||
| 4490 | if (insn->code != (BPF_JMP | BPF_CALL)) | 5443 | if (insn->code != (BPF_JMP | BPF_CALL)) |
| 4491 | continue; | 5444 | continue; |
| 5445 | if (insn->src_reg == BPF_PSEUDO_CALL) | ||
| 5446 | continue; | ||
| 4492 | 5447 | ||
| 4493 | if (insn->imm == BPF_FUNC_get_route_realm) | 5448 | if (insn->imm == BPF_FUNC_get_route_realm) |
| 4494 | prog->dst_needed = 1; | 5449 | prog->dst_needed = 1; |
| 4495 | if (insn->imm == BPF_FUNC_get_prandom_u32) | 5450 | if (insn->imm == BPF_FUNC_get_prandom_u32) |
| 4496 | bpf_user_rnd_init_once(); | 5451 | bpf_user_rnd_init_once(); |
| 5452 | if (insn->imm == BPF_FUNC_override_return) | ||
| 5453 | prog->kprobe_override = 1; | ||
| 4497 | if (insn->imm == BPF_FUNC_tail_call) { | 5454 | if (insn->imm == BPF_FUNC_tail_call) { |
| 4498 | /* If we tail call into other programs, we | 5455 | /* If we tail call into other programs, we |
| 4499 | * cannot make any assumptions since they can | 5456 | * cannot make any assumptions since they can |
| @@ -4545,7 +5502,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) | |||
| 4545 | /* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup | 5502 | /* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup |
| 4546 | * handlers are currently limited to 64 bit only. | 5503 | * handlers are currently limited to 64 bit only. |
| 4547 | */ | 5504 | */ |
| 4548 | if (ebpf_jit_enabled() && BITS_PER_LONG == 64 && | 5505 | if (prog->jit_requested && BITS_PER_LONG == 64 && |
| 4549 | insn->imm == BPF_FUNC_map_lookup_elem) { | 5506 | insn->imm == BPF_FUNC_map_lookup_elem) { |
| 4550 | map_ptr = env->insn_aux_data[i + delta].map_ptr; | 5507 | map_ptr = env->insn_aux_data[i + delta].map_ptr; |
| 4551 | if (map_ptr == BPF_MAP_PTR_POISON || | 5508 | if (map_ptr == BPF_MAP_PTR_POISON || |
| @@ -4680,7 +5637,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) | |||
| 4680 | if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) | 5637 | if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) |
| 4681 | env->strict_alignment = true; | 5638 | env->strict_alignment = true; |
| 4682 | 5639 | ||
| 4683 | if (env->prog->aux->offload) { | 5640 | if (bpf_prog_is_dev_bound(env->prog->aux)) { |
| 4684 | ret = bpf_prog_offload_verifier_prep(env); | 5641 | ret = bpf_prog_offload_verifier_prep(env); |
| 4685 | if (ret) | 5642 | if (ret) |
| 4686 | goto err_unlock; | 5643 | goto err_unlock; |
| @@ -4697,12 +5654,12 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) | |||
| 4697 | if (!env->explored_states) | 5654 | if (!env->explored_states) |
| 4698 | goto skip_full_check; | 5655 | goto skip_full_check; |
| 4699 | 5656 | ||
| 5657 | env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); | ||
| 5658 | |||
| 4700 | ret = check_cfg(env); | 5659 | ret = check_cfg(env); |
| 4701 | if (ret < 0) | 5660 | if (ret < 0) |
| 4702 | goto skip_full_check; | 5661 | goto skip_full_check; |
| 4703 | 5662 | ||
| 4704 | env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); | ||
| 4705 | |||
| 4706 | ret = do_check(env); | 5663 | ret = do_check(env); |
| 4707 | if (env->cur_state) { | 5664 | if (env->cur_state) { |
| 4708 | free_verifier_state(env->cur_state, true); | 5665 | free_verifier_state(env->cur_state, true); |
| @@ -4717,12 +5674,18 @@ skip_full_check: | |||
| 4717 | sanitize_dead_code(env); | 5674 | sanitize_dead_code(env); |
| 4718 | 5675 | ||
| 4719 | if (ret == 0) | 5676 | if (ret == 0) |
| 5677 | ret = check_max_stack_depth(env); | ||
| 5678 | |||
| 5679 | if (ret == 0) | ||
| 4720 | /* program is valid, convert *(u32*)(ctx + off) accesses */ | 5680 | /* program is valid, convert *(u32*)(ctx + off) accesses */ |
| 4721 | ret = convert_ctx_accesses(env); | 5681 | ret = convert_ctx_accesses(env); |
| 4722 | 5682 | ||
| 4723 | if (ret == 0) | 5683 | if (ret == 0) |
| 4724 | ret = fixup_bpf_calls(env); | 5684 | ret = fixup_bpf_calls(env); |
| 4725 | 5685 | ||
| 5686 | if (ret == 0) | ||
| 5687 | ret = fixup_call_args(env); | ||
| 5688 | |||
| 4726 | if (log->level && bpf_verifier_log_full(log)) | 5689 | if (log->level && bpf_verifier_log_full(log)) |
| 4727 | ret = -ENOSPC; | 5690 | ret = -ENOSPC; |
| 4728 | if (log->level && !log->ubuf) { | 5691 | if (log->level && !log->ubuf) { |
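Two smaller changes sit in the same bpf_check() hunk: allow_ptr_leaks is now computed before check_cfg() runs, and the rewrite passes at the tail gained two call-aware steps. A condensed restatement of that tail ordering; the declarations exist only so the fragment stands alone, and the functions are the ones named in the diff:

```c
/* Condensed view of the post-do_check() pipeline (sketch). */
struct bpf_verifier_env;

int check_max_stack_depth(struct bpf_verifier_env *env);
int convert_ctx_accesses(struct bpf_verifier_env *env);
int fixup_bpf_calls(struct bpf_verifier_env *env);
int fixup_call_args(struct bpf_verifier_env *env);

static int run_rewrite_passes(struct bpf_verifier_env *env, int ret)
{
	if (ret == 0)
		ret = check_max_stack_depth(env);  /* new: bound the combined call stack */
	if (ret == 0)
		ret = convert_ctx_accesses(env);   /* rewrite ctx loads/stores */
	if (ret == 0)
		ret = fixup_bpf_calls(env);        /* helpers, div/mod patchlets */
	if (ret == 0)
		ret = fixup_call_args(env);        /* new: JIT or patch bpf-to-bpf calls */
	return ret;
}
```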
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 7e4c44538119..8cda3bc3ae22 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c | |||
| @@ -1397,7 +1397,7 @@ static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, | |||
| 1397 | cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name, | 1397 | cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name, |
| 1398 | cft->name); | 1398 | cft->name); |
| 1399 | else | 1399 | else |
| 1400 | strlcpy(buf, cft->name, CGROUP_FILE_NAME_MAX); | 1400 | strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX); |
| 1401 | return buf; | 1401 | return buf; |
| 1402 | } | 1402 | } |
| 1403 | 1403 | ||
| @@ -1864,9 +1864,9 @@ void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts) | |||
| 1864 | 1864 | ||
| 1865 | root->flags = opts->flags; | 1865 | root->flags = opts->flags; |
| 1866 | if (opts->release_agent) | 1866 | if (opts->release_agent) |
| 1867 | strlcpy(root->release_agent_path, opts->release_agent, PATH_MAX); | 1867 | strscpy(root->release_agent_path, opts->release_agent, PATH_MAX); |
| 1868 | if (opts->name) | 1868 | if (opts->name) |
| 1869 | strlcpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN); | 1869 | strscpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN); |
| 1870 | if (opts->cpuset_clone_children) | 1870 | if (opts->cpuset_clone_children) |
| 1871 | set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); | 1871 | set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); |
| 1872 | } | 1872 | } |
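The cgroup hunk above is a straight strlcpy() to strscpy() conversion. The practical difference: strscpy() never scans the source past the destination size and returns the number of bytes copied, or -E2BIG when it had to truncate, so callers can actually detect truncation. A userspace model of that contract (the error constant is a stand-in for the kernel's -E2BIG):

```c
/* Userspace model of the strscpy() contract (sketch): bounded copy, always
 * NUL-terminated, negative return on truncation.
 */
#include <stddef.h>

#define TRUNCATED (-7L)                 /* stand-in for -E2BIG */

static long strscpy_model(char *dst, const char *src, size_t size)
{
	size_t i;

	if (size == 0)
		return TRUNCATED;
	for (i = 0; i < size - 1 && src[i] != '\0'; i++)
		dst[i] = src[i];
	dst[i] = '\0';
	return src[i] == '\0' ? (long)i : TRUNCATED;
}
```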
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index c8146d53ca67..dbb0781a0533 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c | |||
| @@ -2441,7 +2441,6 @@ static int kdb_kill(int argc, const char **argv) | |||
| 2441 | long sig, pid; | 2441 | long sig, pid; |
| 2442 | char *endp; | 2442 | char *endp; |
| 2443 | struct task_struct *p; | 2443 | struct task_struct *p; |
| 2444 | struct siginfo info; | ||
| 2445 | 2444 | ||
| 2446 | if (argc != 2) | 2445 | if (argc != 2) |
| 2447 | return KDB_ARGCOUNT; | 2446 | return KDB_ARGCOUNT; |
| @@ -2449,7 +2448,7 @@ static int kdb_kill(int argc, const char **argv) | |||
| 2449 | sig = simple_strtol(argv[1], &endp, 0); | 2448 | sig = simple_strtol(argv[1], &endp, 0); |
| 2450 | if (*endp) | 2449 | if (*endp) |
| 2451 | return KDB_BADINT; | 2450 | return KDB_BADINT; |
| 2452 | if (sig >= 0) { | 2451 | if ((sig >= 0) || !valid_signal(-sig)) { |
| 2453 | kdb_printf("Invalid signal parameter.<-signal>\n"); | 2452 | kdb_printf("Invalid signal parameter.<-signal>\n"); |
| 2454 | return 0; | 2453 | return 0; |
| 2455 | } | 2454 | } |
| @@ -2470,12 +2469,7 @@ static int kdb_kill(int argc, const char **argv) | |||
| 2470 | return 0; | 2469 | return 0; |
| 2471 | } | 2470 | } |
| 2472 | p = p->group_leader; | 2471 | p = p->group_leader; |
| 2473 | info.si_signo = sig; | 2472 | kdb_send_sig(p, sig); |
| 2474 | info.si_errno = 0; | ||
| 2475 | info.si_code = SI_USER; | ||
| 2476 | info.si_pid = pid; /* same capabilities as process being signalled */ | ||
| 2477 | info.si_uid = 0; /* kdb has root authority */ | ||
| 2478 | kdb_send_sig_info(p, &info); | ||
| 2479 | return 0; | 2473 | return 0; |
| 2480 | } | 2474 | } |
| 2481 | 2475 | ||
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index fc224fbcf954..1e5a502ba4a7 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h | |||
| @@ -208,7 +208,7 @@ extern unsigned long kdb_task_state(const struct task_struct *p, | |||
| 208 | extern void kdb_ps_suppressed(void); | 208 | extern void kdb_ps_suppressed(void); |
| 209 | extern void kdb_ps1(const struct task_struct *p); | 209 | extern void kdb_ps1(const struct task_struct *p); |
| 210 | extern void kdb_print_nameval(const char *name, unsigned long val); | 210 | extern void kdb_print_nameval(const char *name, unsigned long val); |
| 211 | extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info); | 211 | extern void kdb_send_sig(struct task_struct *p, int sig); |
| 212 | extern void kdb_meminfo_proc_show(void); | 212 | extern void kdb_meminfo_proc_show(void); |
| 213 | extern char *kdb_getstr(char *, size_t, const char *); | 213 | extern char *kdb_getstr(char *, size_t, const char *); |
| 214 | extern void kdb_gdb_state_pass(char *buf); | 214 | extern void kdb_gdb_state_pass(char *buf); |
diff --git a/kernel/events/core.c b/kernel/events/core.c index d99fe3fdec8a..f0549e79978b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c | |||
| @@ -4520,11 +4520,11 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) | |||
| 4520 | return ret; | 4520 | return ret; |
| 4521 | } | 4521 | } |
| 4522 | 4522 | ||
| 4523 | static unsigned int perf_poll(struct file *file, poll_table *wait) | 4523 | static __poll_t perf_poll(struct file *file, poll_table *wait) |
| 4524 | { | 4524 | { |
| 4525 | struct perf_event *event = file->private_data; | 4525 | struct perf_event *event = file->private_data; |
| 4526 | struct ring_buffer *rb; | 4526 | struct ring_buffer *rb; |
| 4527 | unsigned int events = POLLHUP; | 4527 | __poll_t events = POLLHUP; |
| 4528 | 4528 | ||
| 4529 | poll_wait(file, &event->waitq, wait); | 4529 | poll_wait(file, &event->waitq, wait); |
| 4530 | 4530 | ||
| @@ -4732,6 +4732,9 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon | |||
| 4732 | rcu_read_unlock(); | 4732 | rcu_read_unlock(); |
| 4733 | return 0; | 4733 | return 0; |
| 4734 | } | 4734 | } |
| 4735 | |||
| 4736 | case PERF_EVENT_IOC_QUERY_BPF: | ||
| 4737 | return perf_event_query_prog_array(event, (void __user *)arg); | ||
| 4735 | default: | 4738 | default: |
| 4736 | return -ENOTTY; | 4739 | return -ENOTTY; |
| 4737 | } | 4740 | } |
| @@ -4913,6 +4916,7 @@ void perf_event_update_userpage(struct perf_event *event) | |||
| 4913 | unlock: | 4916 | unlock: |
| 4914 | rcu_read_unlock(); | 4917 | rcu_read_unlock(); |
| 4915 | } | 4918 | } |
| 4919 | EXPORT_SYMBOL_GPL(perf_event_update_userpage); | ||
| 4916 | 4920 | ||
| 4917 | static int perf_mmap_fault(struct vm_fault *vmf) | 4921 | static int perf_mmap_fault(struct vm_fault *vmf) |
| 4918 | { | 4922 | { |
| @@ -8099,6 +8103,13 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) | |||
| 8099 | return -EINVAL; | 8103 | return -EINVAL; |
| 8100 | } | 8104 | } |
| 8101 | 8105 | ||
| 8106 | /* Kprobe override only works for kprobes, not uprobes. */ | ||
| 8107 | if (prog->kprobe_override && | ||
| 8108 | !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) { | ||
| 8109 | bpf_prog_put(prog); | ||
| 8110 | return -EINVAL; | ||
| 8111 | } | ||
| 8112 | |||
| 8102 | if (is_tracepoint || is_syscall_tp) { | 8113 | if (is_tracepoint || is_syscall_tp) { |
| 8103 | int off = trace_event_get_offsets(event->tp_event); | 8114 | int off = trace_event_get_offsets(event->tp_event); |
| 8104 | 8115 | ||
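Two perf-side pieces above: a new PERF_EVENT_IOC_QUERY_BPF ioctl that reports which BPF programs are attached to an event, and a guard so programs using bpf_override_return() can only attach to kprobe-backed events. A hedged userspace sketch of the query; the perf_event_query_bpf layout (ids_len in, prog_cnt and ids[] out) follows the UAPI added with this series, so double-check the field names against your headers:

```c
/* Sketch: list BPF program IDs attached to a perf event fd. */
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/perf_event.h>

static int query_attached_progs(int perf_fd)
{
	const unsigned int max_ids = 16;
	struct perf_event_query_bpf *query;
	unsigned int i;
	int err;

	query = calloc(1, sizeof(*query) + max_ids * sizeof(query->ids[0]));
	if (!query)
		return -1;
	query->ids_len = max_ids;

	err = ioctl(perf_fd, PERF_EVENT_IOC_QUERY_BPF, query);
	if (!err)
		for (i = 0; i < query->prog_cnt; i++)
			printf("attached BPF prog id: %u\n", query->ids[i]);
	free(query);
	return err;
}
```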
diff --git a/kernel/fail_function.c b/kernel/fail_function.c new file mode 100644 index 000000000000..21b0122cb39c --- /dev/null +++ b/kernel/fail_function.c | |||
| @@ -0,0 +1,349 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0 | ||
| 2 | /* | ||
| 3 | * fail_function.c: Function-based error injection | ||
| 4 | */ | ||
| 5 | #include <linux/error-injection.h> | ||
| 6 | #include <linux/debugfs.h> | ||
| 7 | #include <linux/fault-inject.h> | ||
| 8 | #include <linux/kallsyms.h> | ||
| 9 | #include <linux/kprobes.h> | ||
| 10 | #include <linux/module.h> | ||
| 11 | #include <linux/mutex.h> | ||
| 12 | #include <linux/slab.h> | ||
| 13 | #include <linux/uaccess.h> | ||
| 14 | |||
| 15 | static int fei_kprobe_handler(struct kprobe *kp, struct pt_regs *regs); | ||
| 16 | |||
| 17 | struct fei_attr { | ||
| 18 | struct list_head list; | ||
| 19 | struct kprobe kp; | ||
| 20 | unsigned long retval; | ||
| 21 | }; | ||
| 22 | static DEFINE_MUTEX(fei_lock); | ||
| 23 | static LIST_HEAD(fei_attr_list); | ||
| 24 | static DECLARE_FAULT_ATTR(fei_fault_attr); | ||
| 25 | static struct dentry *fei_debugfs_dir; | ||
| 26 | |||
| 27 | static unsigned long adjust_error_retval(unsigned long addr, unsigned long retv) | ||
| 28 | { | ||
| 29 | switch (get_injectable_error_type(addr)) { | ||
| 30 | case EI_ETYPE_NULL: | ||
| 31 | if (retv != 0) | ||
| 32 | return 0; | ||
| 33 | break; | ||
| 34 | case EI_ETYPE_ERRNO: | ||
| 35 | if (retv < (unsigned long)-MAX_ERRNO) | ||
| 36 | return (unsigned long)-EINVAL; | ||
| 37 | break; | ||
| 38 | case EI_ETYPE_ERRNO_NULL: | ||
| 39 | if (retv != 0 && retv < (unsigned long)-MAX_ERRNO) | ||
| 40 | return (unsigned long)-EINVAL; | ||
| 41 | break; | ||
| 42 | } | ||
| 43 | |||
| 44 | return retv; | ||
| 45 | } | ||
| 46 | |||
| 47 | static struct fei_attr *fei_attr_new(const char *sym, unsigned long addr) | ||
| 48 | { | ||
| 49 | struct fei_attr *attr; | ||
| 50 | |||
| 51 | attr = kzalloc(sizeof(*attr), GFP_KERNEL); | ||
| 52 | if (attr) { | ||
| 53 | attr->kp.symbol_name = kstrdup(sym, GFP_KERNEL); | ||
| 54 | if (!attr->kp.symbol_name) { | ||
| 55 | kfree(attr); | ||
| 56 | return NULL; | ||
| 57 | } | ||
| 58 | attr->kp.pre_handler = fei_kprobe_handler; | ||
| 59 | attr->retval = adjust_error_retval(addr, 0); | ||
| 60 | INIT_LIST_HEAD(&attr->list); | ||
| 61 | } | ||
| 62 | return attr; | ||
| 63 | } | ||
| 64 | |||
| 65 | static void fei_attr_free(struct fei_attr *attr) | ||
| 66 | { | ||
| 67 | if (attr) { | ||
| 68 | kfree(attr->kp.symbol_name); | ||
| 69 | kfree(attr); | ||
| 70 | } | ||
| 71 | } | ||
| 72 | |||
| 73 | static struct fei_attr *fei_attr_lookup(const char *sym) | ||
| 74 | { | ||
| 75 | struct fei_attr *attr; | ||
| 76 | |||
| 77 | list_for_each_entry(attr, &fei_attr_list, list) { | ||
| 78 | if (!strcmp(attr->kp.symbol_name, sym)) | ||
| 79 | return attr; | ||
| 80 | } | ||
| 81 | |||
| 82 | return NULL; | ||
| 83 | } | ||
| 84 | |||
| 85 | static bool fei_attr_is_valid(struct fei_attr *_attr) | ||
| 86 | { | ||
| 87 | struct fei_attr *attr; | ||
| 88 | |||
| 89 | list_for_each_entry(attr, &fei_attr_list, list) { | ||
| 90 | if (attr == _attr) | ||
| 91 | return true; | ||
| 92 | } | ||
| 93 | |||
| 94 | return false; | ||
| 95 | } | ||
| 96 | |||
| 97 | static int fei_retval_set(void *data, u64 val) | ||
| 98 | { | ||
| 99 | struct fei_attr *attr = data; | ||
| 100 | unsigned long retv = (unsigned long)val; | ||
| 101 | int err = 0; | ||
| 102 | |||
| 103 | mutex_lock(&fei_lock); | ||
| 104 | /* | ||
| 105 | * Since this operation can be done after the retval file is removed, | ||
| 106 | * it is safer to check that the attr is still valid before accessing | ||
| 107 | * its member. | ||
| 108 | */ | ||
| 109 | if (!fei_attr_is_valid(attr)) { | ||
| 110 | err = -ENOENT; | ||
| 111 | goto out; | ||
| 112 | } | ||
| 113 | |||
| 114 | if (attr->kp.addr) { | ||
| 115 | if (adjust_error_retval((unsigned long)attr->kp.addr, | ||
| 116 | val) != retv) | ||
| 117 | err = -EINVAL; | ||
| 118 | } | ||
| 119 | if (!err) | ||
| 120 | attr->retval = val; | ||
| 121 | out: | ||
| 122 | mutex_unlock(&fei_lock); | ||
| 123 | |||
| 124 | return err; | ||
| 125 | } | ||
| 126 | |||
| 127 | static int fei_retval_get(void *data, u64 *val) | ||
| 128 | { | ||
| 129 | struct fei_attr *attr = data; | ||
| 130 | int err = 0; | ||
| 131 | |||
| 132 | mutex_lock(&fei_lock); | ||
| 133 | /* Here we also validate @attr to ensure it still exists. */ | ||
| 134 | if (!fei_attr_is_valid(attr)) | ||
| 135 | err = -ENOENT; | ||
| 136 | else | ||
| 137 | *val = attr->retval; | ||
| 138 | mutex_unlock(&fei_lock); | ||
| 139 | |||
| 140 | return err; | ||
| 141 | } | ||
| 142 | DEFINE_DEBUGFS_ATTRIBUTE(fei_retval_ops, fei_retval_get, fei_retval_set, | ||
| 143 | "%llx\n"); | ||
| 144 | |||
| 145 | static int fei_debugfs_add_attr(struct fei_attr *attr) | ||
| 146 | { | ||
| 147 | struct dentry *dir; | ||
| 148 | |||
| 149 | dir = debugfs_create_dir(attr->kp.symbol_name, fei_debugfs_dir); | ||
| 150 | if (!dir) | ||
| 151 | return -ENOMEM; | ||
| 152 | |||
| 153 | if (!debugfs_create_file("retval", 0600, dir, attr, &fei_retval_ops)) { | ||
| 154 | debugfs_remove_recursive(dir); | ||
| 155 | return -ENOMEM; | ||
| 156 | } | ||
| 157 | |||
| 158 | return 0; | ||
| 159 | } | ||
| 160 | |||
| 161 | static void fei_debugfs_remove_attr(struct fei_attr *attr) | ||
| 162 | { | ||
| 163 | struct dentry *dir; | ||
| 164 | |||
| 165 | dir = debugfs_lookup(attr->kp.symbol_name, fei_debugfs_dir); | ||
| 166 | if (dir) | ||
| 167 | debugfs_remove_recursive(dir); | ||
| 168 | } | ||
| 169 | |||
| 170 | static int fei_kprobe_handler(struct kprobe *kp, struct pt_regs *regs) | ||
| 171 | { | ||
| 172 | struct fei_attr *attr = container_of(kp, struct fei_attr, kp); | ||
| 173 | |||
| 174 | if (should_fail(&fei_fault_attr, 1)) { | ||
| 175 | regs_set_return_value(regs, attr->retval); | ||
| 176 | override_function_with_return(regs); | ||
| 177 | /* Kprobe specific fixup */ | ||
| 178 | reset_current_kprobe(); | ||
| 179 | preempt_enable_no_resched(); | ||
| 180 | return 1; | ||
| 181 | } | ||
| 182 | |||
| 183 | return 0; | ||
| 184 | } | ||
| 185 | NOKPROBE_SYMBOL(fei_kprobe_handler) | ||
| 186 | |||
| 187 | static void *fei_seq_start(struct seq_file *m, loff_t *pos) | ||
| 188 | { | ||
| 189 | mutex_lock(&fei_lock); | ||
| 190 | return seq_list_start(&fei_attr_list, *pos); | ||
| 191 | } | ||
| 192 | |||
| 193 | static void fei_seq_stop(struct seq_file *m, void *v) | ||
| 194 | { | ||
| 195 | mutex_unlock(&fei_lock); | ||
| 196 | } | ||
| 197 | |||
| 198 | static void *fei_seq_next(struct seq_file *m, void *v, loff_t *pos) | ||
| 199 | { | ||
| 200 | return seq_list_next(v, &fei_attr_list, pos); | ||
| 201 | } | ||
| 202 | |||
| 203 | static int fei_seq_show(struct seq_file *m, void *v) | ||
| 204 | { | ||
| 205 | struct fei_attr *attr = list_entry(v, struct fei_attr, list); | ||
| 206 | |||
| 207 | seq_printf(m, "%pf\n", attr->kp.addr); | ||
| 208 | return 0; | ||
| 209 | } | ||
| 210 | |||
| 211 | static const struct seq_operations fei_seq_ops = { | ||
| 212 | .start = fei_seq_start, | ||
| 213 | .next = fei_seq_next, | ||
| 214 | .stop = fei_seq_stop, | ||
| 215 | .show = fei_seq_show, | ||
| 216 | }; | ||
| 217 | |||
| 218 | static int fei_open(struct inode *inode, struct file *file) | ||
| 219 | { | ||
| 220 | return seq_open(file, &fei_seq_ops); | ||
| 221 | } | ||
| 222 | |||
| 223 | static void fei_attr_remove(struct fei_attr *attr) | ||
| 224 | { | ||
| 225 | fei_debugfs_remove_attr(attr); | ||
| 226 | unregister_kprobe(&attr->kp); | ||
| 227 | list_del(&attr->list); | ||
| 228 | fei_attr_free(attr); | ||
| 229 | } | ||
| 230 | |||
| 231 | static void fei_attr_remove_all(void) | ||
| 232 | { | ||
| 233 | struct fei_attr *attr, *n; | ||
| 234 | |||
| 235 | list_for_each_entry_safe(attr, n, &fei_attr_list, list) { | ||
| 236 | fei_attr_remove(attr); | ||
| 237 | } | ||
| 238 | } | ||
| 239 | |||
| 240 | static ssize_t fei_write(struct file *file, const char __user *buffer, | ||
| 241 | size_t count, loff_t *ppos) | ||
| 242 | { | ||
| 243 | struct fei_attr *attr; | ||
| 244 | unsigned long addr; | ||
| 245 | char *buf, *sym; | ||
| 246 | int ret; | ||
| 247 | |||
| 248 | /* cut off if it is too long */ | ||
| 249 | if (count > KSYM_NAME_LEN) | ||
| 250 | count = KSYM_NAME_LEN; | ||
| 251 | buf = kmalloc(sizeof(char) * (count + 1), GFP_KERNEL); | ||
| 252 | if (!buf) | ||
| 253 | return -ENOMEM; | ||
| 254 | |||
| 255 | if (copy_from_user(buf, buffer, count)) { | ||
| 256 | ret = -EFAULT; | ||
| 257 | goto out; | ||
| 258 | } | ||
| 259 | buf[count] = '\0'; | ||
| 260 | sym = strstrip(buf); | ||
| 261 | |||
| 262 | mutex_lock(&fei_lock); | ||
| 263 | |||
| 264 | /* Writing just spaces will remove all injection points */ | ||
| 265 | if (sym[0] == '\0') { | ||
| 266 | fei_attr_remove_all(); | ||
| 267 | ret = count; | ||
| 268 | goto out; | ||
| 269 | } | ||
| 270 | /* Writing !function will remove one injection point */ | ||
| 271 | if (sym[0] == '!') { | ||
| 272 | attr = fei_attr_lookup(sym + 1); | ||
| 273 | if (!attr) { | ||
| 274 | ret = -ENOENT; | ||
| 275 | goto out; | ||
| 276 | } | ||
| 277 | fei_attr_remove(attr); | ||
| 278 | ret = count; | ||
| 279 | goto out; | ||
| 280 | } | ||
| 281 | |||
| 282 | addr = kallsyms_lookup_name(sym); | ||
| 283 | if (!addr) { | ||
| 284 | ret = -EINVAL; | ||
| 285 | goto out; | ||
| 286 | } | ||
| 287 | if (!within_error_injection_list(addr)) { | ||
| 288 | ret = -ERANGE; | ||
| 289 | goto out; | ||
| 290 | } | ||
| 291 | if (fei_attr_lookup(sym)) { | ||
| 292 | ret = -EBUSY; | ||
| 293 | goto out; | ||
| 294 | } | ||
| 295 | attr = fei_attr_new(sym, addr); | ||
| 296 | if (!attr) { | ||
| 297 | ret = -ENOMEM; | ||
| 298 | goto out; | ||
| 299 | } | ||
| 300 | |||
| 301 | ret = register_kprobe(&attr->kp); | ||
| 302 | if (!ret) | ||
| 303 | ret = fei_debugfs_add_attr(attr); | ||
| 304 | if (ret < 0) | ||
| 305 | fei_attr_remove(attr); | ||
| 306 | else { | ||
| 307 | list_add_tail(&attr->list, &fei_attr_list); | ||
| 308 | ret = count; | ||
| 309 | } | ||
| 310 | out: | ||
| 311 | kfree(buf); | ||
| 312 | mutex_unlock(&fei_lock); | ||
| 313 | return ret; | ||
| 314 | } | ||
| 315 | |||
| 316 | static const struct file_operations fei_ops = { | ||
| 317 | .open = fei_open, | ||
| 318 | .read = seq_read, | ||
| 319 | .write = fei_write, | ||
| 320 | .llseek = seq_lseek, | ||
| 321 | .release = seq_release, | ||
| 322 | }; | ||
| 323 | |||
| 324 | static int __init fei_debugfs_init(void) | ||
| 325 | { | ||
| 326 | struct dentry *dir; | ||
| 327 | |||
| 328 | dir = fault_create_debugfs_attr("fail_function", NULL, | ||
| 329 | &fei_fault_attr); | ||
| 330 | if (IS_ERR(dir)) | ||
| 331 | return PTR_ERR(dir); | ||
| 332 | |||
| 333 | /* injectable attribute is just a symlink of error_inject/list */ | ||
| 334 | if (!debugfs_create_symlink("injectable", dir, | ||
| 335 | "../error_injection/list")) | ||
| 336 | goto error; | ||
| 337 | |||
| 338 | if (!debugfs_create_file("inject", 0600, dir, NULL, &fei_ops)) | ||
| 339 | goto error; | ||
| 340 | |||
| 341 | fei_debugfs_dir = dir; | ||
| 342 | |||
| 343 | return 0; | ||
| 344 | error: | ||
| 345 | debugfs_remove_recursive(dir); | ||
| 346 | return -ENOMEM; | ||
| 347 | } | ||
| 348 | |||
| 349 | late_initcall(fei_debugfs_init); | ||
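The new kernel/fail_function.c exposes function-based error injection through debugfs: writing a symbol name to the inject file arms a kprobe on it (fei_write() above), a per-symbol retval file sets the injected return value, "!symbol" removes one entry, and writing only whitespace clears everything; the probability and times knobs come from the shared fault-injection attributes created by fault_create_debugfs_attr(). A hedged userspace sketch of driving that interface; it assumes debugfs is mounted at /sys/kernel/debug, and "open_ctree" is only an example target that must appear on the injectable list:

```c
/* Sketch: arm one injection point and make it always fail with -ENOMEM. */
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

static int write_str(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, val, strlen(val)) < 0) {
		close(fd);
		return -1;
	}
	return close(fd);
}

int main(void)
{
	/* arm the kprobe-based injection point */
	if (write_str("/sys/kernel/debug/fail_function/inject", "open_ctree"))
		return 1;
	/* injected return value: 0xff..f4 is -12 (-ENOMEM) on a 64-bit kernel */
	write_str("/sys/kernel/debug/fail_function/open_ctree/retval",
		  "0xfffffffffffffff4");
	/* generic fault-injection knobs created by fault_create_debugfs_attr() */
	write_str("/sys/kernel/debug/fail_function/probability", "100");
	write_str("/sys/kernel/debug/fail_function/times", "-1");
	return 0;
}
```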
diff --git a/kernel/fork.c b/kernel/fork.c index 2295fc69717f..c7c112391d79 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
| @@ -77,6 +77,7 @@ | |||
| 77 | #include <linux/blkdev.h> | 77 | #include <linux/blkdev.h> |
| 78 | #include <linux/fs_struct.h> | 78 | #include <linux/fs_struct.h> |
| 79 | #include <linux/magic.h> | 79 | #include <linux/magic.h> |
| 80 | #include <linux/sched/mm.h> | ||
| 80 | #include <linux/perf_event.h> | 81 | #include <linux/perf_event.h> |
| 81 | #include <linux/posix-timers.h> | 82 | #include <linux/posix-timers.h> |
| 82 | #include <linux/user-return-notifier.h> | 83 | #include <linux/user-return-notifier.h> |
| @@ -282,8 +283,9 @@ static void free_thread_stack(struct task_struct *tsk) | |||
| 282 | 283 | ||
| 283 | void thread_stack_cache_init(void) | 284 | void thread_stack_cache_init(void) |
| 284 | { | 285 | { |
| 285 | thread_stack_cache = kmem_cache_create("thread_stack", THREAD_SIZE, | 286 | thread_stack_cache = kmem_cache_create_usercopy("thread_stack", |
| 286 | THREAD_SIZE, 0, NULL); | 287 | THREAD_SIZE, THREAD_SIZE, 0, 0, |
| 288 | THREAD_SIZE, NULL); | ||
| 287 | BUG_ON(thread_stack_cache == NULL); | 289 | BUG_ON(thread_stack_cache == NULL); |
| 288 | } | 290 | } |
| 289 | # endif | 291 | # endif |
| @@ -390,6 +392,246 @@ void free_task(struct task_struct *tsk) | |||
| 390 | } | 392 | } |
| 391 | EXPORT_SYMBOL(free_task); | 393 | EXPORT_SYMBOL(free_task); |
| 392 | 394 | ||
| 395 | #ifdef CONFIG_MMU | ||
| 396 | static __latent_entropy int dup_mmap(struct mm_struct *mm, | ||
| 397 | struct mm_struct *oldmm) | ||
| 398 | { | ||
| 399 | struct vm_area_struct *mpnt, *tmp, *prev, **pprev; | ||
| 400 | struct rb_node **rb_link, *rb_parent; | ||
| 401 | int retval; | ||
| 402 | unsigned long charge; | ||
| 403 | LIST_HEAD(uf); | ||
| 404 | |||
| 405 | uprobe_start_dup_mmap(); | ||
| 406 | if (down_write_killable(&oldmm->mmap_sem)) { | ||
| 407 | retval = -EINTR; | ||
| 408 | goto fail_uprobe_end; | ||
| 409 | } | ||
| 410 | flush_cache_dup_mm(oldmm); | ||
| 411 | uprobe_dup_mmap(oldmm, mm); | ||
| 412 | /* | ||
| 413 | * Not linked in yet - no deadlock potential: | ||
| 414 | */ | ||
| 415 | down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); | ||
| 416 | |||
| 417 | /* No ordering required: file already has been exposed. */ | ||
| 418 | RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); | ||
| 419 | |||
| 420 | mm->total_vm = oldmm->total_vm; | ||
| 421 | mm->data_vm = oldmm->data_vm; | ||
| 422 | mm->exec_vm = oldmm->exec_vm; | ||
| 423 | mm->stack_vm = oldmm->stack_vm; | ||
| 424 | |||
| 425 | rb_link = &mm->mm_rb.rb_node; | ||
| 426 | rb_parent = NULL; | ||
| 427 | pprev = &mm->mmap; | ||
| 428 | retval = ksm_fork(mm, oldmm); | ||
| 429 | if (retval) | ||
| 430 | goto out; | ||
| 431 | retval = khugepaged_fork(mm, oldmm); | ||
| 432 | if (retval) | ||
| 433 | goto out; | ||
| 434 | |||
| 435 | prev = NULL; | ||
| 436 | for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { | ||
| 437 | struct file *file; | ||
| 438 | |||
| 439 | if (mpnt->vm_flags & VM_DONTCOPY) { | ||
| 440 | vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt)); | ||
| 441 | continue; | ||
| 442 | } | ||
| 443 | charge = 0; | ||
| 444 | if (mpnt->vm_flags & VM_ACCOUNT) { | ||
| 445 | unsigned long len = vma_pages(mpnt); | ||
| 446 | |||
| 447 | if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ | ||
| 448 | goto fail_nomem; | ||
| 449 | charge = len; | ||
| 450 | } | ||
| 451 | tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); | ||
| 452 | if (!tmp) | ||
| 453 | goto fail_nomem; | ||
| 454 | *tmp = *mpnt; | ||
| 455 | INIT_LIST_HEAD(&tmp->anon_vma_chain); | ||
| 456 | retval = vma_dup_policy(mpnt, tmp); | ||
| 457 | if (retval) | ||
| 458 | goto fail_nomem_policy; | ||
| 459 | tmp->vm_mm = mm; | ||
| 460 | retval = dup_userfaultfd(tmp, &uf); | ||
| 461 | if (retval) | ||
| 462 | goto fail_nomem_anon_vma_fork; | ||
| 463 | if (tmp->vm_flags & VM_WIPEONFORK) { | ||
| 464 | /* VM_WIPEONFORK gets a clean slate in the child. */ | ||
| 465 | tmp->anon_vma = NULL; | ||
| 466 | if (anon_vma_prepare(tmp)) | ||
| 467 | goto fail_nomem_anon_vma_fork; | ||
| 468 | } else if (anon_vma_fork(tmp, mpnt)) | ||
| 469 | goto fail_nomem_anon_vma_fork; | ||
| 470 | tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT); | ||
| 471 | tmp->vm_next = tmp->vm_prev = NULL; | ||
| 472 | file = tmp->vm_file; | ||
| 473 | if (file) { | ||
| 474 | struct inode *inode = file_inode(file); | ||
| 475 | struct address_space *mapping = file->f_mapping; | ||
| 476 | |||
| 477 | get_file(file); | ||
| 478 | if (tmp->vm_flags & VM_DENYWRITE) | ||
| 479 | atomic_dec(&inode->i_writecount); | ||
| 480 | i_mmap_lock_write(mapping); | ||
| 481 | if (tmp->vm_flags & VM_SHARED) | ||
| 482 | atomic_inc(&mapping->i_mmap_writable); | ||
| 483 | flush_dcache_mmap_lock(mapping); | ||
| 484 | /* insert tmp into the share list, just after mpnt */ | ||
| 485 | vma_interval_tree_insert_after(tmp, mpnt, | ||
| 486 | &mapping->i_mmap); | ||
| 487 | flush_dcache_mmap_unlock(mapping); | ||
| 488 | i_mmap_unlock_write(mapping); | ||
| 489 | } | ||
| 490 | |||
| 491 | /* | ||
| 492 | * Clear hugetlb-related page reserves for children. This only | ||
| 493 | * affects MAP_PRIVATE mappings. Faults generated by the child | ||
| 494 | * are not guaranteed to succeed, even if read-only | ||
| 495 | */ | ||
| 496 | if (is_vm_hugetlb_page(tmp)) | ||
| 497 | reset_vma_resv_huge_pages(tmp); | ||
| 498 | |||
| 499 | /* | ||
| 500 | * Link in the new vma and copy the page table entries. | ||
| 501 | */ | ||
| 502 | *pprev = tmp; | ||
| 503 | pprev = &tmp->vm_next; | ||
| 504 | tmp->vm_prev = prev; | ||
| 505 | prev = tmp; | ||
| 506 | |||
| 507 | __vma_link_rb(mm, tmp, rb_link, rb_parent); | ||
| 508 | rb_link = &tmp->vm_rb.rb_right; | ||
| 509 | rb_parent = &tmp->vm_rb; | ||
| 510 | |||
| 511 | mm->map_count++; | ||
| 512 | if (!(tmp->vm_flags & VM_WIPEONFORK)) | ||
| 513 | retval = copy_page_range(mm, oldmm, mpnt); | ||
| 514 | |||
| 515 | if (tmp->vm_ops && tmp->vm_ops->open) | ||
| 516 | tmp->vm_ops->open(tmp); | ||
| 517 | |||
| 518 | if (retval) | ||
| 519 | goto out; | ||
| 520 | } | ||
| 521 | /* a new mm has just been created */ | ||
| 522 | arch_dup_mmap(oldmm, mm); | ||
| 523 | retval = 0; | ||
| 524 | out: | ||
| 525 | up_write(&mm->mmap_sem); | ||
| 526 | flush_tlb_mm(oldmm); | ||
| 527 | up_write(&oldmm->mmap_sem); | ||
| 528 | dup_userfaultfd_complete(&uf); | ||
| 529 | fail_uprobe_end: | ||
| 530 | uprobe_end_dup_mmap(); | ||
| 531 | return retval; | ||
| 532 | fail_nomem_anon_vma_fork: | ||
| 533 | mpol_put(vma_policy(tmp)); | ||
| 534 | fail_nomem_policy: | ||
| 535 | kmem_cache_free(vm_area_cachep, tmp); | ||
| 536 | fail_nomem: | ||
| 537 | retval = -ENOMEM; | ||
| 538 | vm_unacct_memory(charge); | ||
| 539 | goto out; | ||
| 540 | } | ||
| 541 | |||
| 542 | static inline int mm_alloc_pgd(struct mm_struct *mm) | ||
| 543 | { | ||
| 544 | mm->pgd = pgd_alloc(mm); | ||
| 545 | if (unlikely(!mm->pgd)) | ||
| 546 | return -ENOMEM; | ||
| 547 | return 0; | ||
| 548 | } | ||
| 549 | |||
| 550 | static inline void mm_free_pgd(struct mm_struct *mm) | ||
| 551 | { | ||
| 552 | pgd_free(mm, mm->pgd); | ||
| 553 | } | ||
| 554 | #else | ||
| 555 | static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | ||
| 556 | { | ||
| 557 | down_write(&oldmm->mmap_sem); | ||
| 558 | RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); | ||
| 559 | up_write(&oldmm->mmap_sem); | ||
| 560 | return 0; | ||
| 561 | } | ||
| 562 | #define mm_alloc_pgd(mm) (0) | ||
| 563 | #define mm_free_pgd(mm) | ||
| 564 | #endif /* CONFIG_MMU */ | ||
| 565 | |||
| 566 | static void check_mm(struct mm_struct *mm) | ||
| 567 | { | ||
| 568 | int i; | ||
| 569 | |||
| 570 | for (i = 0; i < NR_MM_COUNTERS; i++) { | ||
| 571 | long x = atomic_long_read(&mm->rss_stat.count[i]); | ||
| 572 | |||
| 573 | if (unlikely(x)) | ||
| 574 | printk(KERN_ALERT "BUG: Bad rss-counter state " | ||
| 575 | "mm:%p idx:%d val:%ld\n", mm, i, x); | ||
| 576 | } | ||
| 577 | |||
| 578 | if (mm_pgtables_bytes(mm)) | ||
| 579 | pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n", | ||
| 580 | mm_pgtables_bytes(mm)); | ||
| 581 | |||
| 582 | #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS | ||
| 583 | VM_BUG_ON_MM(mm->pmd_huge_pte, mm); | ||
| 584 | #endif | ||
| 585 | } | ||
| 586 | |||
| 587 | #define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL)) | ||
| 588 | #define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) | ||
| 589 | |||
| 590 | /* | ||
| 591 | * Called when the last reference to the mm | ||
| 592 | * is dropped: either by a lazy thread or by | ||
| 593 | * mmput. Free the page directory and the mm. | ||
| 594 | */ | ||
| 595 | static void __mmdrop(struct mm_struct *mm) | ||
| 596 | { | ||
| 597 | BUG_ON(mm == &init_mm); | ||
| 598 | mm_free_pgd(mm); | ||
| 599 | destroy_context(mm); | ||
| 600 | hmm_mm_destroy(mm); | ||
| 601 | mmu_notifier_mm_destroy(mm); | ||
| 602 | check_mm(mm); | ||
| 603 | put_user_ns(mm->user_ns); | ||
| 604 | free_mm(mm); | ||
| 605 | } | ||
| 606 | |||
| 607 | void mmdrop(struct mm_struct *mm) | ||
| 608 | { | ||
| 609 | /* | ||
| 610 | * The implicit full barrier implied by atomic_dec_and_test() is | ||
| 611 | * required by the membarrier system call before returning to | ||
| 612 | * user-space, after storing to rq->curr. | ||
| 613 | */ | ||
| 614 | if (unlikely(atomic_dec_and_test(&mm->mm_count))) | ||
| 615 | __mmdrop(mm); | ||
| 616 | } | ||
| 617 | EXPORT_SYMBOL_GPL(mmdrop); | ||
| 618 | |||
| 619 | static void mmdrop_async_fn(struct work_struct *work) | ||
| 620 | { | ||
| 621 | struct mm_struct *mm; | ||
| 622 | |||
| 623 | mm = container_of(work, struct mm_struct, async_put_work); | ||
| 624 | __mmdrop(mm); | ||
| 625 | } | ||
| 626 | |||
| 627 | static void mmdrop_async(struct mm_struct *mm) | ||
| 628 | { | ||
| 629 | if (unlikely(atomic_dec_and_test(&mm->mm_count))) { | ||
| 630 | INIT_WORK(&mm->async_put_work, mmdrop_async_fn); | ||
| 631 | schedule_work(&mm->async_put_work); | ||
| 632 | } | ||
| 633 | } | ||
| 634 | |||
| 393 | static inline void free_signal_struct(struct signal_struct *sig) | 635 | static inline void free_signal_struct(struct signal_struct *sig) |
| 394 | { | 636 | { |
| 395 | taskstats_tgid_free(sig); | 637 | taskstats_tgid_free(sig); |
| @@ -457,6 +699,21 @@ static void set_max_threads(unsigned int max_threads_suggested) | |||
| 457 | int arch_task_struct_size __read_mostly; | 699 | int arch_task_struct_size __read_mostly; |
| 458 | #endif | 700 | #endif |
| 459 | 701 | ||
| 702 | static void task_struct_whitelist(unsigned long *offset, unsigned long *size) | ||
| 703 | { | ||
| 704 | /* Fetch thread_struct whitelist for the architecture. */ | ||
| 705 | arch_thread_struct_whitelist(offset, size); | ||
| 706 | |||
| 707 | /* | ||
| 708 | * Handle zero-sized whitelist or empty thread_struct, otherwise | ||
| 709 | * adjust offset to position of thread_struct in task_struct. | ||
| 710 | */ | ||
| 711 | if (unlikely(*size == 0)) | ||
| 712 | *offset = 0; | ||
| 713 | else | ||
| 714 | *offset += offsetof(struct task_struct, thread); | ||
| 715 | } | ||
| 716 | |||
| 460 | void __init fork_init(void) | 717 | void __init fork_init(void) |
| 461 | { | 718 | { |
| 462 | int i; | 719 | int i; |
| @@ -465,11 +722,14 @@ void __init fork_init(void) | |||
| 465 | #define ARCH_MIN_TASKALIGN 0 | 722 | #define ARCH_MIN_TASKALIGN 0 |
| 466 | #endif | 723 | #endif |
| 467 | int align = max_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN); | 724 | int align = max_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN); |
| 725 | unsigned long useroffset, usersize; | ||
| 468 | 726 | ||
| 469 | /* create a slab on which task_structs can be allocated */ | 727 | /* create a slab on which task_structs can be allocated */ |
| 470 | task_struct_cachep = kmem_cache_create("task_struct", | 728 | task_struct_whitelist(&useroffset, &usersize); |
| 729 | task_struct_cachep = kmem_cache_create_usercopy("task_struct", | ||
| 471 | arch_task_struct_size, align, | 730 | arch_task_struct_size, align, |
| 472 | SLAB_PANIC|SLAB_ACCOUNT, NULL); | 731 | SLAB_PANIC|SLAB_ACCOUNT, |
| 732 | useroffset, usersize, NULL); | ||
| 473 | #endif | 733 | #endif |
| 474 | 734 | ||
| 475 | /* do the arch specific task caches init */ | 735 | /* do the arch specific task caches init */ |
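fork_init() above now tells the slab allocator which slice of task_struct may legally be copied to or from userspace: the architecture reports the whitelisted region of thread_struct, task_struct_whitelist() shifts it to its offset inside task_struct, and kmem_cache_create_usercopy() records it so hardened usercopy can reject anything outside that window. A hedged sketch of the same pattern for a made-up cache (struct foo and its buf field are not from the patch):

```c
/* Sketch: whitelist a single field of a slab object for user copies,
 * mirroring the kmem_cache_create_usercopy() call in the hunk above.
 */
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/stddef.h>

struct foo {
	unsigned long flags;            /* never copied to/from userspace */
	char buf[64];                   /* the only user-accessible region */
};

static struct kmem_cache *foo_cachep;

static int __init foo_cache_init(void)
{
	foo_cachep = kmem_cache_create_usercopy("foo", sizeof(struct foo), 0, 0,
						offsetof(struct foo, buf),
						sizeof_field(struct foo, buf),
						NULL);
	return foo_cachep ? 0 : -ENOMEM;
}
core_initcall(foo_cache_init);
```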
| @@ -594,181 +854,8 @@ free_tsk: | |||
| 594 | return NULL; | 854 | return NULL; |
| 595 | } | 855 | } |
| 596 | 856 | ||
| 597 | #ifdef CONFIG_MMU | ||
| 598 | static __latent_entropy int dup_mmap(struct mm_struct *mm, | ||
| 599 | struct mm_struct *oldmm) | ||
| 600 | { | ||
| 601 | struct vm_area_struct *mpnt, *tmp, *prev, **pprev; | ||
| 602 | struct rb_node **rb_link, *rb_parent; | ||
| 603 | int retval; | ||
| 604 | unsigned long charge; | ||
| 605 | LIST_HEAD(uf); | ||
| 606 | |||
| 607 | uprobe_start_dup_mmap(); | ||
| 608 | if (down_write_killable(&oldmm->mmap_sem)) { | ||
| 609 | retval = -EINTR; | ||
| 610 | goto fail_uprobe_end; | ||
| 611 | } | ||
| 612 | flush_cache_dup_mm(oldmm); | ||
| 613 | uprobe_dup_mmap(oldmm, mm); | ||
| 614 | /* | ||
| 615 | * Not linked in yet - no deadlock potential: | ||
| 616 | */ | ||
| 617 | down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); | ||
| 618 | |||
| 619 | /* No ordering required: file already has been exposed. */ | ||
| 620 | RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); | ||
| 621 | |||
| 622 | mm->total_vm = oldmm->total_vm; | ||
| 623 | mm->data_vm = oldmm->data_vm; | ||
| 624 | mm->exec_vm = oldmm->exec_vm; | ||
| 625 | mm->stack_vm = oldmm->stack_vm; | ||
| 626 | |||
| 627 | rb_link = &mm->mm_rb.rb_node; | ||
| 628 | rb_parent = NULL; | ||
| 629 | pprev = &mm->mmap; | ||
| 630 | retval = ksm_fork(mm, oldmm); | ||
| 631 | if (retval) | ||
| 632 | goto out; | ||
| 633 | retval = khugepaged_fork(mm, oldmm); | ||
| 634 | if (retval) | ||
| 635 | goto out; | ||
| 636 | |||
| 637 | prev = NULL; | ||
| 638 | for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { | ||
| 639 | struct file *file; | ||
| 640 | |||
| 641 | if (mpnt->vm_flags & VM_DONTCOPY) { | ||
| 642 | vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt)); | ||
| 643 | continue; | ||
| 644 | } | ||
| 645 | charge = 0; | ||
| 646 | if (mpnt->vm_flags & VM_ACCOUNT) { | ||
| 647 | unsigned long len = vma_pages(mpnt); | ||
| 648 | |||
| 649 | if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ | ||
| 650 | goto fail_nomem; | ||
| 651 | charge = len; | ||
| 652 | } | ||
| 653 | tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); | ||
| 654 | if (!tmp) | ||
| 655 | goto fail_nomem; | ||
| 656 | *tmp = *mpnt; | ||
| 657 | INIT_LIST_HEAD(&tmp->anon_vma_chain); | ||
| 658 | retval = vma_dup_policy(mpnt, tmp); | ||
| 659 | if (retval) | ||
| 660 | goto fail_nomem_policy; | ||
| 661 | tmp->vm_mm = mm; | ||
| 662 | retval = dup_userfaultfd(tmp, &uf); | ||
| 663 | if (retval) | ||
| 664 | goto fail_nomem_anon_vma_fork; | ||
| 665 | if (tmp->vm_flags & VM_WIPEONFORK) { | ||
| 666 | /* VM_WIPEONFORK gets a clean slate in the child. */ | ||
| 667 | tmp->anon_vma = NULL; | ||
| 668 | if (anon_vma_prepare(tmp)) | ||
| 669 | goto fail_nomem_anon_vma_fork; | ||
| 670 | } else if (anon_vma_fork(tmp, mpnt)) | ||
| 671 | goto fail_nomem_anon_vma_fork; | ||
| 672 | tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT); | ||
| 673 | tmp->vm_next = tmp->vm_prev = NULL; | ||
| 674 | file = tmp->vm_file; | ||
| 675 | if (file) { | ||
| 676 | struct inode *inode = file_inode(file); | ||
| 677 | struct address_space *mapping = file->f_mapping; | ||
| 678 | |||
| 679 | get_file(file); | ||
| 680 | if (tmp->vm_flags & VM_DENYWRITE) | ||
| 681 | atomic_dec(&inode->i_writecount); | ||
| 682 | i_mmap_lock_write(mapping); | ||
| 683 | if (tmp->vm_flags & VM_SHARED) | ||
| 684 | atomic_inc(&mapping->i_mmap_writable); | ||
| 685 | flush_dcache_mmap_lock(mapping); | ||
| 686 | /* insert tmp into the share list, just after mpnt */ | ||
| 687 | vma_interval_tree_insert_after(tmp, mpnt, | ||
| 688 | &mapping->i_mmap); | ||
| 689 | flush_dcache_mmap_unlock(mapping); | ||
| 690 | i_mmap_unlock_write(mapping); | ||
| 691 | } | ||
| 692 | |||
| 693 | /* | ||
| 694 | * Clear hugetlb-related page reserves for children. This only | ||
| 695 | * affects MAP_PRIVATE mappings. Faults generated by the child | ||
| 696 | * are not guaranteed to succeed, even if read-only | ||
| 697 | */ | ||
| 698 | if (is_vm_hugetlb_page(tmp)) | ||
| 699 | reset_vma_resv_huge_pages(tmp); | ||
| 700 | |||
| 701 | /* | ||
| 702 | * Link in the new vma and copy the page table entries. | ||
| 703 | */ | ||
| 704 | *pprev = tmp; | ||
| 705 | pprev = &tmp->vm_next; | ||
| 706 | tmp->vm_prev = prev; | ||
| 707 | prev = tmp; | ||
| 708 | |||
| 709 | __vma_link_rb(mm, tmp, rb_link, rb_parent); | ||
| 710 | rb_link = &tmp->vm_rb.rb_right; | ||
| 711 | rb_parent = &tmp->vm_rb; | ||
| 712 | |||
| 713 | mm->map_count++; | ||
| 714 | if (!(tmp->vm_flags & VM_WIPEONFORK)) | ||
| 715 | retval = copy_page_range(mm, oldmm, mpnt); | ||
| 716 | |||
| 717 | if (tmp->vm_ops && tmp->vm_ops->open) | ||
| 718 | tmp->vm_ops->open(tmp); | ||
| 719 | |||
| 720 | if (retval) | ||
| 721 | goto out; | ||
| 722 | } | ||
| 723 | /* a new mm has just been created */ | ||
| 724 | retval = arch_dup_mmap(oldmm, mm); | ||
| 725 | out: | ||
| 726 | up_write(&mm->mmap_sem); | ||
| 727 | flush_tlb_mm(oldmm); | ||
| 728 | up_write(&oldmm->mmap_sem); | ||
| 729 | dup_userfaultfd_complete(&uf); | ||
| 730 | fail_uprobe_end: | ||
| 731 | uprobe_end_dup_mmap(); | ||
| 732 | return retval; | ||
| 733 | fail_nomem_anon_vma_fork: | ||
| 734 | mpol_put(vma_policy(tmp)); | ||
| 735 | fail_nomem_policy: | ||
| 736 | kmem_cache_free(vm_area_cachep, tmp); | ||
| 737 | fail_nomem: | ||
| 738 | retval = -ENOMEM; | ||
| 739 | vm_unacct_memory(charge); | ||
| 740 | goto out; | ||
| 741 | } | ||
| 742 | |||
| 743 | static inline int mm_alloc_pgd(struct mm_struct *mm) | ||
| 744 | { | ||
| 745 | mm->pgd = pgd_alloc(mm); | ||
| 746 | if (unlikely(!mm->pgd)) | ||
| 747 | return -ENOMEM; | ||
| 748 | return 0; | ||
| 749 | } | ||
| 750 | |||
| 751 | static inline void mm_free_pgd(struct mm_struct *mm) | ||
| 752 | { | ||
| 753 | pgd_free(mm, mm->pgd); | ||
| 754 | } | ||
| 755 | #else | ||
| 756 | static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | ||
| 757 | { | ||
| 758 | down_write(&oldmm->mmap_sem); | ||
| 759 | RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); | ||
| 760 | up_write(&oldmm->mmap_sem); | ||
| 761 | return 0; | ||
| 762 | } | ||
| 763 | #define mm_alloc_pgd(mm) (0) | ||
| 764 | #define mm_free_pgd(mm) | ||
| 765 | #endif /* CONFIG_MMU */ | ||
| 766 | |||
| 767 | __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock); | 857 | __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock); |
| 768 | 858 | ||
| 769 | #define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL)) | ||
| 770 | #define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) | ||
| 771 | |||
| 772 | static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT; | 859 | static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT; |
| 773 | 860 | ||
| 774 | static int __init coredump_filter_setup(char *s) | 861 | static int __init coredump_filter_setup(char *s) |
| @@ -858,27 +945,6 @@ fail_nopgd: | |||
| 858 | return NULL; | 945 | return NULL; |
| 859 | } | 946 | } |
| 860 | 947 | ||
| 861 | static void check_mm(struct mm_struct *mm) | ||
| 862 | { | ||
| 863 | int i; | ||
| 864 | |||
| 865 | for (i = 0; i < NR_MM_COUNTERS; i++) { | ||
| 866 | long x = atomic_long_read(&mm->rss_stat.count[i]); | ||
| 867 | |||
| 868 | if (unlikely(x)) | ||
| 869 | printk(KERN_ALERT "BUG: Bad rss-counter state " | ||
| 870 | "mm:%p idx:%d val:%ld\n", mm, i, x); | ||
| 871 | } | ||
| 872 | |||
| 873 | if (mm_pgtables_bytes(mm)) | ||
| 874 | pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n", | ||
| 875 | mm_pgtables_bytes(mm)); | ||
| 876 | |||
| 877 | #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS | ||
| 878 | VM_BUG_ON_MM(mm->pmd_huge_pte, mm); | ||
| 879 | #endif | ||
| 880 | } | ||
| 881 | |||
| 882 | /* | 948 | /* |
| 883 | * Allocate and initialize an mm_struct. | 949 | * Allocate and initialize an mm_struct. |
| 884 | */ | 950 | */ |
| @@ -894,24 +960,6 @@ struct mm_struct *mm_alloc(void) | |||
| 894 | return mm_init(mm, current, current_user_ns()); | 960 | return mm_init(mm, current, current_user_ns()); |
| 895 | } | 961 | } |
| 896 | 962 | ||
| 897 | /* | ||
| 898 | * Called when the last reference to the mm | ||
| 899 | * is dropped: either by a lazy thread or by | ||
| 900 | * mmput. Free the page directory and the mm. | ||
| 901 | */ | ||
| 902 | void __mmdrop(struct mm_struct *mm) | ||
| 903 | { | ||
| 904 | BUG_ON(mm == &init_mm); | ||
| 905 | mm_free_pgd(mm); | ||
| 906 | destroy_context(mm); | ||
| 907 | hmm_mm_destroy(mm); | ||
| 908 | mmu_notifier_mm_destroy(mm); | ||
| 909 | check_mm(mm); | ||
| 910 | put_user_ns(mm->user_ns); | ||
| 911 | free_mm(mm); | ||
| 912 | } | ||
| 913 | EXPORT_SYMBOL_GPL(__mmdrop); | ||
| 914 | |||
| 915 | static inline void __mmput(struct mm_struct *mm) | 963 | static inline void __mmput(struct mm_struct *mm) |
| 916 | { | 964 | { |
| 917 | VM_BUG_ON(atomic_read(&mm->mm_users)); | 965 | VM_BUG_ON(atomic_read(&mm->mm_users)); |
| @@ -2224,9 +2272,11 @@ void __init proc_caches_init(void) | |||
| 2224 | * maximum number of CPU's we can ever have. The cpumask_allocation | 2272 | * maximum number of CPU's we can ever have. The cpumask_allocation |
| 2225 | * is at the end of the structure, exactly for that reason. | 2273 | * is at the end of the structure, exactly for that reason. |
| 2226 | */ | 2274 | */ |
| 2227 | mm_cachep = kmem_cache_create("mm_struct", | 2275 | mm_cachep = kmem_cache_create_usercopy("mm_struct", |
| 2228 | sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, | 2276 | sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, |
| 2229 | SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, | 2277 | SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, |
| 2278 | offsetof(struct mm_struct, saved_auxv), | ||
| 2279 | sizeof_field(struct mm_struct, saved_auxv), | ||
| 2230 | NULL); | 2280 | NULL); |
| 2231 | vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT); | 2281 | vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT); |
| 2232 | mmap_init(); | 2282 | mmap_init(); |
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 4e8089b319ae..8c82ea26e837 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c | |||
| @@ -71,7 +71,7 @@ unsigned long probe_irq_on(void) | |||
| 71 | raw_spin_lock_irq(&desc->lock); | 71 | raw_spin_lock_irq(&desc->lock); |
| 72 | if (!desc->action && irq_settings_can_probe(desc)) { | 72 | if (!desc->action && irq_settings_can_probe(desc)) { |
| 73 | desc->istate |= IRQS_AUTODETECT | IRQS_WAITING; | 73 | desc->istate |= IRQS_AUTODETECT | IRQS_WAITING; |
| 74 | if (irq_startup(desc, IRQ_NORESEND, IRQ_START_FORCE)) | 74 | if (irq_activate_and_startup(desc, IRQ_NORESEND)) |
| 75 | desc->istate |= IRQS_PENDING; | 75 | desc->istate |= IRQS_PENDING; |
| 76 | } | 76 | } |
| 77 | raw_spin_unlock_irq(&desc->lock); | 77 | raw_spin_unlock_irq(&desc->lock); |
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 043bfc35b353..c69357a43849 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c | |||
| @@ -294,11 +294,11 @@ int irq_activate(struct irq_desc *desc) | |||
| 294 | return 0; | 294 | return 0; |
| 295 | } | 295 | } |
| 296 | 296 | ||
| 297 | void irq_activate_and_startup(struct irq_desc *desc, bool resend) | 297 | int irq_activate_and_startup(struct irq_desc *desc, bool resend) |
| 298 | { | 298 | { |
| 299 | if (WARN_ON(irq_activate(desc))) | 299 | if (WARN_ON(irq_activate(desc))) |
| 300 | return; | 300 | return 0; |
| 301 | irq_startup(desc, resend, IRQ_START_FORCE); | 301 | return irq_startup(desc, resend, IRQ_START_FORCE); |
| 302 | } | 302 | } |
| 303 | 303 | ||
| 304 | static void __irq_disable(struct irq_desc *desc, bool mask); | 304 | static void __irq_disable(struct irq_desc *desc, bool mask); |
diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h index e4d3819a91cc..8ccb326d2977 100644 --- a/kernel/irq/debug.h +++ b/kernel/irq/debug.h | |||
| @@ -3,8 +3,6 @@ | |||
| 3 | * Debugging printout: | 3 | * Debugging printout: |
| 4 | */ | 4 | */ |
| 5 | 5 | ||
| 6 | #include <linux/kallsyms.h> | ||
| 7 | |||
| 8 | #define ___P(f) if (desc->status_use_accessors & f) printk("%14s set\n", #f) | 6 | #define ___P(f) if (desc->status_use_accessors & f) printk("%14s set\n", #f) |
| 9 | #define ___PS(f) if (desc->istate & f) printk("%14s set\n", #f) | 7 | #define ___PS(f) if (desc->istate & f) printk("%14s set\n", #f) |
| 10 | /* FIXME */ | 8 | /* FIXME */ |
| @@ -19,14 +17,14 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) | |||
| 19 | 17 | ||
| 20 | printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n", | 18 | printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n", |
| 21 | irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled); | 19 | irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled); |
| 22 | printk("->handle_irq(): %p, ", desc->handle_irq); | 20 | printk("->handle_irq(): %p, %pS\n", |
| 23 | print_symbol("%s\n", (unsigned long)desc->handle_irq); | 21 | desc->handle_irq, desc->handle_irq); |
| 24 | printk("->irq_data.chip(): %p, ", desc->irq_data.chip); | 22 | printk("->irq_data.chip(): %p, %pS\n", |
| 25 | print_symbol("%s\n", (unsigned long)desc->irq_data.chip); | 23 | desc->irq_data.chip, desc->irq_data.chip); |
| 26 | printk("->action(): %p\n", desc->action); | 24 | printk("->action(): %p\n", desc->action); |
| 27 | if (desc->action) { | 25 | if (desc->action) { |
| 28 | printk("->action->handler(): %p, ", desc->action->handler); | 26 | printk("->action->handler(): %p, %pS\n", |
| 29 | print_symbol("%s\n", (unsigned long)desc->action->handler); | 27 | desc->action->handler, desc->action->handler); |
| 30 | } | 28 | } |
| 31 | 29 | ||
| 32 | ___P(IRQ_LEVEL); | 30 | ___P(IRQ_LEVEL); |
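The kernel/irq/debug.h hunk above drops print_symbol() in favour of the %pS printk format, which resolves a code pointer to its symbol in the same call. A one-function sketch (my_handler is an arbitrary example, not from the patch):

```c
/* Sketch: %pS folds the old print_symbol() step into a single printk. */
#include <linux/printk.h>

static void my_handler(void)
{
}

static void show_handler(void)
{
	pr_info("handler: %p, %pS\n", my_handler, my_handler);
}
```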
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index ab19371eab9b..ca6afa267070 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h | |||
| @@ -76,7 +76,7 @@ extern void __enable_irq(struct irq_desc *desc); | |||
| 76 | #define IRQ_START_COND false | 76 | #define IRQ_START_COND false |
| 77 | 77 | ||
| 78 | extern int irq_activate(struct irq_desc *desc); | 78 | extern int irq_activate(struct irq_desc *desc); |
| 79 | extern void irq_activate_and_startup(struct irq_desc *desc, bool resend); | 79 | extern int irq_activate_and_startup(struct irq_desc *desc, bool resend); |
| 80 | extern int irq_startup(struct irq_desc *desc, bool resend, bool force); | 80 | extern int irq_startup(struct irq_desc *desc, bool resend, bool force); |
| 81 | 81 | ||
| 82 | extern void irq_shutdown(struct irq_desc *desc); | 82 | extern void irq_shutdown(struct irq_desc *desc); |
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index d5fa4116688a..a23e21ada81b 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c | |||
| @@ -12,7 +12,6 @@ | |||
| 12 | * compression (see scripts/kallsyms.c for a more complete description) | 12 | * compression (see scripts/kallsyms.c for a more complete description) |
| 13 | */ | 13 | */ |
| 14 | #include <linux/kallsyms.h> | 14 | #include <linux/kallsyms.h> |
| 15 | #include <linux/module.h> | ||
| 16 | #include <linux/init.h> | 15 | #include <linux/init.h> |
| 17 | #include <linux/seq_file.h> | 16 | #include <linux/seq_file.h> |
| 18 | #include <linux/fs.h> | 17 | #include <linux/fs.h> |
| @@ -20,15 +19,12 @@ | |||
| 20 | #include <linux/err.h> | 19 | #include <linux/err.h> |
| 21 | #include <linux/proc_fs.h> | 20 | #include <linux/proc_fs.h> |
| 22 | #include <linux/sched.h> /* for cond_resched */ | 21 | #include <linux/sched.h> /* for cond_resched */ |
| 23 | #include <linux/mm.h> | ||
| 24 | #include <linux/ctype.h> | 22 | #include <linux/ctype.h> |
| 25 | #include <linux/slab.h> | 23 | #include <linux/slab.h> |
| 26 | #include <linux/filter.h> | 24 | #include <linux/filter.h> |
| 27 | #include <linux/ftrace.h> | 25 | #include <linux/ftrace.h> |
| 28 | #include <linux/compiler.h> | 26 | #include <linux/compiler.h> |
| 29 | 27 | ||
| 30 | #include <asm/sections.h> | ||
| 31 | |||
| 32 | /* | 28 | /* |
| 33 | * These will be re-linked against their real values | 29 | * These will be re-linked against their real values |
| 34 | * during the second link stage. | 30 | * during the second link stage. |
| @@ -52,37 +48,6 @@ extern const u16 kallsyms_token_index[] __weak; | |||
| 52 | 48 | ||
| 53 | extern const unsigned long kallsyms_markers[] __weak; | 49 | extern const unsigned long kallsyms_markers[] __weak; |
| 54 | 50 | ||
| 55 | static inline int is_kernel_inittext(unsigned long addr) | ||
| 56 | { | ||
| 57 | if (addr >= (unsigned long)_sinittext | ||
| 58 | && addr <= (unsigned long)_einittext) | ||
| 59 | return 1; | ||
| 60 | return 0; | ||
| 61 | } | ||
| 62 | |||
| 63 | static inline int is_kernel_text(unsigned long addr) | ||
| 64 | { | ||
| 65 | if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) || | ||
| 66 | arch_is_kernel_text(addr)) | ||
| 67 | return 1; | ||
| 68 | return in_gate_area_no_mm(addr); | ||
| 69 | } | ||
| 70 | |||
| 71 | static inline int is_kernel(unsigned long addr) | ||
| 72 | { | ||
| 73 | if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end) | ||
| 74 | return 1; | ||
| 75 | return in_gate_area_no_mm(addr); | ||
| 76 | } | ||
| 77 | |||
| 78 | static int is_ksym_addr(unsigned long addr) | ||
| 79 | { | ||
| 80 | if (IS_ENABLED(CONFIG_KALLSYMS_ALL)) | ||
| 81 | return is_kernel(addr); | ||
| 82 | |||
| 83 | return is_kernel_text(addr) || is_kernel_inittext(addr); | ||
| 84 | } | ||
| 85 | |||
| 86 | /* | 51 | /* |
| 87 | * Expand a compressed symbol data into the resulting uncompressed string, | 52 | * Expand a compressed symbol data into the resulting uncompressed string, |
| 88 | * if uncompressed string is too long (>= maxlen), it will be truncated, | 53 | * if uncompressed string is too long (>= maxlen), it will be truncated, |
| @@ -464,17 +429,6 @@ int sprint_backtrace(char *buffer, unsigned long address) | |||
| 464 | return __sprint_symbol(buffer, address, -1, 1); | 429 | return __sprint_symbol(buffer, address, -1, 1); |
| 465 | } | 430 | } |
| 466 | 431 | ||
| 467 | /* Look up a kernel symbol and print it to the kernel messages. */ | ||
| 468 | void __print_symbol(const char *fmt, unsigned long address) | ||
| 469 | { | ||
| 470 | char buffer[KSYM_SYMBOL_LEN]; | ||
| 471 | |||
| 472 | sprint_symbol(buffer, address); | ||
| 473 | |||
| 474 | printk(fmt, buffer); | ||
| 475 | } | ||
| 476 | EXPORT_SYMBOL(__print_symbol); | ||
| 477 | |||
| 478 | /* To avoid using get_symbol_offset for every symbol, we carry prefix along. */ | 432 | /* To avoid using get_symbol_offset for every symbol, we carry prefix along. */ |
| 479 | struct kallsym_iter { | 433 | struct kallsym_iter { |
| 480 | loff_t pos; | 434 | loff_t pos; |
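The debug.h and kallsyms.c hunks above retire the print_symbol()/__print_symbol() helpers in favour of the %pS printk format specifier, which resolves a code address to its symbol (plus offset) inline. A minimal sketch of the conversion pattern; the call site below is hypothetical, not taken from the patch:

```c
#include <linux/kernel.h>
#include <linux/printk.h>

/* Hypothetical call site converted away from the removed helper. */
static void report_handler(void (*handler)(void))
{
	/* before: printk("handler: %p, ", handler);
	 *         print_symbol("%s\n", (unsigned long)handler);
	 */
	printk(KERN_DEBUG "handler: %p, %pS\n", handler, handler);
}
```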
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index de9e45dca70f..3a4656fb7047 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c | |||
| @@ -366,11 +366,6 @@ static int __klp_enable_patch(struct klp_patch *patch) | |||
| 366 | /* | 366 | /* |
| 367 | * A reference is taken on the patch module to prevent it from being | 367 | * A reference is taken on the patch module to prevent it from being |
| 368 | * unloaded. | 368 | * unloaded. |
| 369 | * | ||
| 370 | * Note: For immediate (no consistency model) patches we don't allow | ||
| 371 | * patch modules to unload since there is no safe/sane method to | ||
| 372 | * determine if a thread is still running in the patched code contained | ||
| 373 | * in the patch module once the ftrace registration is successful. | ||
| 374 | */ | 369 | */ |
| 375 | if (!try_module_get(patch->mod)) | 370 | if (!try_module_get(patch->mod)) |
| 376 | return -ENODEV; | 371 | return -ENODEV; |
| @@ -454,6 +449,8 @@ EXPORT_SYMBOL_GPL(klp_enable_patch); | |||
| 454 | * /sys/kernel/livepatch/<patch> | 449 | * /sys/kernel/livepatch/<patch> |
| 455 | * /sys/kernel/livepatch/<patch>/enabled | 450 | * /sys/kernel/livepatch/<patch>/enabled |
| 456 | * /sys/kernel/livepatch/<patch>/transition | 451 | * /sys/kernel/livepatch/<patch>/transition |
| 452 | * /sys/kernel/livepatch/<patch>/signal | ||
| 453 | * /sys/kernel/livepatch/<patch>/force | ||
| 457 | * /sys/kernel/livepatch/<patch>/<object> | 454 | * /sys/kernel/livepatch/<patch>/<object> |
| 458 | * /sys/kernel/livepatch/<patch>/<object>/<function,sympos> | 455 | * /sys/kernel/livepatch/<patch>/<object>/<function,sympos> |
| 459 | */ | 456 | */ |
| @@ -528,11 +525,73 @@ static ssize_t transition_show(struct kobject *kobj, | |||
| 528 | patch == klp_transition_patch); | 525 | patch == klp_transition_patch); |
| 529 | } | 526 | } |
| 530 | 527 | ||
| 528 | static ssize_t signal_store(struct kobject *kobj, struct kobj_attribute *attr, | ||
| 529 | const char *buf, size_t count) | ||
| 530 | { | ||
| 531 | struct klp_patch *patch; | ||
| 532 | int ret; | ||
| 533 | bool val; | ||
| 534 | |||
| 535 | ret = kstrtobool(buf, &val); | ||
| 536 | if (ret) | ||
| 537 | return ret; | ||
| 538 | |||
| 539 | if (!val) | ||
| 540 | return count; | ||
| 541 | |||
| 542 | mutex_lock(&klp_mutex); | ||
| 543 | |||
| 544 | patch = container_of(kobj, struct klp_patch, kobj); | ||
| 545 | if (patch != klp_transition_patch) { | ||
| 546 | mutex_unlock(&klp_mutex); | ||
| 547 | return -EINVAL; | ||
| 548 | } | ||
| 549 | |||
| 550 | klp_send_signals(); | ||
| 551 | |||
| 552 | mutex_unlock(&klp_mutex); | ||
| 553 | |||
| 554 | return count; | ||
| 555 | } | ||
| 556 | |||
| 557 | static ssize_t force_store(struct kobject *kobj, struct kobj_attribute *attr, | ||
| 558 | const char *buf, size_t count) | ||
| 559 | { | ||
| 560 | struct klp_patch *patch; | ||
| 561 | int ret; | ||
| 562 | bool val; | ||
| 563 | |||
| 564 | ret = kstrtobool(buf, &val); | ||
| 565 | if (ret) | ||
| 566 | return ret; | ||
| 567 | |||
| 568 | if (!val) | ||
| 569 | return count; | ||
| 570 | |||
| 571 | mutex_lock(&klp_mutex); | ||
| 572 | |||
| 573 | patch = container_of(kobj, struct klp_patch, kobj); | ||
| 574 | if (patch != klp_transition_patch) { | ||
| 575 | mutex_unlock(&klp_mutex); | ||
| 576 | return -EINVAL; | ||
| 577 | } | ||
| 578 | |||
| 579 | klp_force_transition(); | ||
| 580 | |||
| 581 | mutex_unlock(&klp_mutex); | ||
| 582 | |||
| 583 | return count; | ||
| 584 | } | ||
| 585 | |||
| 531 | static struct kobj_attribute enabled_kobj_attr = __ATTR_RW(enabled); | 586 | static struct kobj_attribute enabled_kobj_attr = __ATTR_RW(enabled); |
| 532 | static struct kobj_attribute transition_kobj_attr = __ATTR_RO(transition); | 587 | static struct kobj_attribute transition_kobj_attr = __ATTR_RO(transition); |
| 588 | static struct kobj_attribute signal_kobj_attr = __ATTR_WO(signal); | ||
| 589 | static struct kobj_attribute force_kobj_attr = __ATTR_WO(force); | ||
| 533 | static struct attribute *klp_patch_attrs[] = { | 590 | static struct attribute *klp_patch_attrs[] = { |
| 534 | &enabled_kobj_attr.attr, | 591 | &enabled_kobj_attr.attr, |
| 535 | &transition_kobj_attr.attr, | 592 | &transition_kobj_attr.attr, |
| 593 | &signal_kobj_attr.attr, | ||
| 594 | &force_kobj_attr.attr, | ||
| 536 | NULL | 595 | NULL |
| 537 | }; | 596 | }; |
| 538 | 597 | ||
| @@ -830,12 +889,7 @@ int klp_register_patch(struct klp_patch *patch) | |||
| 830 | if (!klp_initialized()) | 889 | if (!klp_initialized()) |
| 831 | return -ENODEV; | 890 | return -ENODEV; |
| 832 | 891 | ||
| 833 | /* | 892 | if (!klp_have_reliable_stack()) { |
| 834 | * Architectures without reliable stack traces have to set | ||
| 835 | * patch->immediate because there's currently no way to patch kthreads | ||
| 836 | * with the consistency model. | ||
| 837 | */ | ||
| 838 | if (!klp_have_reliable_stack() && !patch->immediate) { | ||
| 839 | pr_err("This architecture doesn't have support for the livepatch consistency model.\n"); | 893 | pr_err("This architecture doesn't have support for the livepatch consistency model.\n"); |
| 840 | return -ENOSYS; | 894 | return -ENOSYS; |
| 841 | } | 895 | } |
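The new per-patch sysfs attributes give the administrator two knobs for a stuck transition: signal pokes the remaining tasks, force drops their TIF_PATCH_PENDING outright. A hedged userspace sketch of how they might be driven; the patch name "my_patch" and the error handling are illustrative:

```c
/* Write '1' to the new attributes of a patch that is currently in
 * transition, following the sysfs layout documented in the hunk above.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static int poke(const char *attr)
{
	char path[128];
	int fd;

	snprintf(path, sizeof(path),
		 "/sys/kernel/livepatch/my_patch/%s", attr);
	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;
	if (write(fd, "1", 1) != 1) {
		close(fd);
		return -1;
	}
	return close(fd);
}

int main(void)
{
	if (poke("signal"))	/* triggers klp_send_signals() */
		perror("signal");
	/* Only if tasks still refuse to migrate; see the NOTE on forcing. */
	if (poke("force"))	/* triggers klp_force_transition() */
		perror("force");
	return 0;
}
```

Forcing is the last resort: once klp_forced is set, the patch module's reference is never dropped again (see the transition.c hunk below), so the module can no longer be unloaded.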
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index 56add6327736..7c6631e693bc 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c | |||
| @@ -33,6 +33,8 @@ struct klp_patch *klp_transition_patch; | |||
| 33 | 33 | ||
| 34 | static int klp_target_state = KLP_UNDEFINED; | 34 | static int klp_target_state = KLP_UNDEFINED; |
| 35 | 35 | ||
| 36 | static bool klp_forced = false; | ||
| 37 | |||
| 36 | /* | 38 | /* |
| 37 | * This work can be performed periodically to finish patching or unpatching any | 39 | * This work can be performed periodically to finish patching or unpatching any |
| 38 | * "straggler" tasks which failed to transition in the first attempt. | 40 | * "straggler" tasks which failed to transition in the first attempt. |
| @@ -80,7 +82,6 @@ static void klp_complete_transition(void) | |||
| 80 | struct klp_func *func; | 82 | struct klp_func *func; |
| 81 | struct task_struct *g, *task; | 83 | struct task_struct *g, *task; |
| 82 | unsigned int cpu; | 84 | unsigned int cpu; |
| 83 | bool immediate_func = false; | ||
| 84 | 85 | ||
| 85 | pr_debug("'%s': completing %s transition\n", | 86 | pr_debug("'%s': completing %s transition\n", |
| 86 | klp_transition_patch->mod->name, | 87 | klp_transition_patch->mod->name, |
| @@ -102,16 +103,9 @@ static void klp_complete_transition(void) | |||
| 102 | klp_synchronize_transition(); | 103 | klp_synchronize_transition(); |
| 103 | } | 104 | } |
| 104 | 105 | ||
| 105 | if (klp_transition_patch->immediate) | 106 | klp_for_each_object(klp_transition_patch, obj) |
| 106 | goto done; | 107 | klp_for_each_func(obj, func) |
| 107 | |||
| 108 | klp_for_each_object(klp_transition_patch, obj) { | ||
| 109 | klp_for_each_func(obj, func) { | ||
| 110 | func->transition = false; | 108 | func->transition = false; |
| 111 | if (func->immediate) | ||
| 112 | immediate_func = true; | ||
| 113 | } | ||
| 114 | } | ||
| 115 | 109 | ||
| 116 | /* Prevent klp_ftrace_handler() from seeing KLP_UNDEFINED state */ | 110 | /* Prevent klp_ftrace_handler() from seeing KLP_UNDEFINED state */ |
| 117 | if (klp_target_state == KLP_PATCHED) | 111 | if (klp_target_state == KLP_PATCHED) |
| @@ -130,7 +124,6 @@ static void klp_complete_transition(void) | |||
| 130 | task->patch_state = KLP_UNDEFINED; | 124 | task->patch_state = KLP_UNDEFINED; |
| 131 | } | 125 | } |
| 132 | 126 | ||
| 133 | done: | ||
| 134 | klp_for_each_object(klp_transition_patch, obj) { | 127 | klp_for_each_object(klp_transition_patch, obj) { |
| 135 | if (!klp_is_object_loaded(obj)) | 128 | if (!klp_is_object_loaded(obj)) |
| 136 | continue; | 129 | continue; |
| @@ -144,13 +137,11 @@ done: | |||
| 144 | klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); | 137 | klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); |
| 145 | 138 | ||
| 146 | /* | 139 | /* |
| 147 | * See complementary comment in __klp_enable_patch() for why we | 140 | * klp_forced set implies unbounded increase of module's ref count if |
| 148 | * keep the module reference for immediate patches. | 141 | * the module is disabled/enabled in a loop. |
| 149 | */ | 142 | */ |
| 150 | if (!klp_transition_patch->immediate && !immediate_func && | 143 | if (!klp_forced && klp_target_state == KLP_UNPATCHED) |
| 151 | klp_target_state == KLP_UNPATCHED) { | ||
| 152 | module_put(klp_transition_patch->mod); | 144 | module_put(klp_transition_patch->mod); |
| 153 | } | ||
| 154 | 145 | ||
| 155 | klp_target_state = KLP_UNDEFINED; | 146 | klp_target_state = KLP_UNDEFINED; |
| 156 | klp_transition_patch = NULL; | 147 | klp_transition_patch = NULL; |
| @@ -218,9 +209,6 @@ static int klp_check_stack_func(struct klp_func *func, | |||
| 218 | struct klp_ops *ops; | 209 | struct klp_ops *ops; |
| 219 | int i; | 210 | int i; |
| 220 | 211 | ||
| 221 | if (func->immediate) | ||
| 222 | return 0; | ||
| 223 | |||
| 224 | for (i = 0; i < trace->nr_entries; i++) { | 212 | for (i = 0; i < trace->nr_entries; i++) { |
| 225 | address = trace->entries[i]; | 213 | address = trace->entries[i]; |
| 226 | 214 | ||
| @@ -383,13 +371,6 @@ void klp_try_complete_transition(void) | |||
| 383 | WARN_ON_ONCE(klp_target_state == KLP_UNDEFINED); | 371 | WARN_ON_ONCE(klp_target_state == KLP_UNDEFINED); |
| 384 | 372 | ||
| 385 | /* | 373 | /* |
| 386 | * If the patch can be applied or reverted immediately, skip the | ||
| 387 | * per-task transitions. | ||
| 388 | */ | ||
| 389 | if (klp_transition_patch->immediate) | ||
| 390 | goto success; | ||
| 391 | |||
| 392 | /* | ||
| 393 | * Try to switch the tasks to the target patch state by walking their | 374 | * Try to switch the tasks to the target patch state by walking their |
| 394 | * stacks and looking for any to-be-patched or to-be-unpatched | 375 | * stacks and looking for any to-be-patched or to-be-unpatched |
| 395 | * functions. If such functions are found on a stack, or if the stack | 376 | * functions. If such functions are found on a stack, or if the stack |
| @@ -432,7 +413,6 @@ void klp_try_complete_transition(void) | |||
| 432 | return; | 413 | return; |
| 433 | } | 414 | } |
| 434 | 415 | ||
| 435 | success: | ||
| 436 | /* we're done, now cleanup the data structures */ | 416 | /* we're done, now cleanup the data structures */ |
| 437 | klp_complete_transition(); | 417 | klp_complete_transition(); |
| 438 | } | 418 | } |
| @@ -453,13 +433,6 @@ void klp_start_transition(void) | |||
| 453 | klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); | 433 | klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); |
| 454 | 434 | ||
| 455 | /* | 435 | /* |
| 456 | * If the patch can be applied or reverted immediately, skip the | ||
| 457 | * per-task transitions. | ||
| 458 | */ | ||
| 459 | if (klp_transition_patch->immediate) | ||
| 460 | return; | ||
| 461 | |||
| 462 | /* | ||
| 463 | * Mark all normal tasks as needing a patch state update. They'll | 436 | * Mark all normal tasks as needing a patch state update. They'll |
| 464 | * switch either in klp_try_complete_transition() or as they exit the | 437 | * switch either in klp_try_complete_transition() or as they exit the |
| 465 | * kernel. | 438 | * kernel. |
| @@ -509,13 +482,6 @@ void klp_init_transition(struct klp_patch *patch, int state) | |||
| 509 | klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); | 482 | klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); |
| 510 | 483 | ||
| 511 | /* | 484 | /* |
| 512 | * If the patch can be applied or reverted immediately, skip the | ||
| 513 | * per-task transitions. | ||
| 514 | */ | ||
| 515 | if (patch->immediate) | ||
| 516 | return; | ||
| 517 | |||
| 518 | /* | ||
| 519 | * Initialize all tasks to the initial patch state to prepare them for | 485 | * Initialize all tasks to the initial patch state to prepare them for |
| 520 | * switching to the target state. | 486 | * switching to the target state. |
| 521 | */ | 487 | */ |
| @@ -608,3 +574,71 @@ void klp_copy_process(struct task_struct *child) | |||
| 608 | 574 | ||
| 609 | /* TIF_PATCH_PENDING gets copied in setup_thread_stack() */ | 575 | /* TIF_PATCH_PENDING gets copied in setup_thread_stack() */ |
| 610 | } | 576 | } |
| 577 | |||
| 578 | /* | ||
| 579 | * Sends a fake signal to all non-kthread tasks with TIF_PATCH_PENDING set. | ||
| 580 | * Kthreads with TIF_PATCH_PENDING set are woken up. Only admin can request this | ||
| 581 | * action currently. | ||
| 582 | */ | ||
| 583 | void klp_send_signals(void) | ||
| 584 | { | ||
| 585 | struct task_struct *g, *task; | ||
| 586 | |||
| 587 | pr_notice("signaling remaining tasks\n"); | ||
| 588 | |||
| 589 | read_lock(&tasklist_lock); | ||
| 590 | for_each_process_thread(g, task) { | ||
| 591 | if (!klp_patch_pending(task)) | ||
| 592 | continue; | ||
| 593 | |||
| 594 | /* | ||
| 595 | * There is a small race here. We could see TIF_PATCH_PENDING | ||
| 596 | * set and decide to wake up a kthread or send a fake signal. | ||
| 597 | * Meanwhile the task could migrate itself and the action | ||
| 598 | * would be meaningless. It is not serious though. | ||
| 599 | */ | ||
| 600 | if (task->flags & PF_KTHREAD) { | ||
| 601 | /* | ||
| 602 | * Wake up a kthread which sleeps interruptibly and | ||
| 603 | * still has not been migrated. | ||
| 604 | */ | ||
| 605 | wake_up_state(task, TASK_INTERRUPTIBLE); | ||
| 606 | } else { | ||
| 607 | /* | ||
| 608 | * Send fake signal to all non-kthread tasks which are | ||
| 609 | * still not migrated. | ||
| 610 | */ | ||
| 611 | spin_lock_irq(&task->sighand->siglock); | ||
| 612 | signal_wake_up(task, 0); | ||
| 613 | spin_unlock_irq(&task->sighand->siglock); | ||
| 614 | } | ||
| 615 | } | ||
| 616 | read_unlock(&tasklist_lock); | ||
| 617 | } | ||
| 618 | |||
| 619 | /* | ||
| 620 | * Drop TIF_PATCH_PENDING of all tasks on admin's request. This forces an | ||
| 621 | * existing transition to finish. | ||
| 622 | * | ||
| 623 | * NOTE: klp_update_patch_state(task) requires the task to be inactive or | ||
| 624 | * 'current'. This is not the case here and the consistency model could be | ||
| 625 | * broken. Administrator, who is the only one to execute the | ||
| 626 | * klp_force_transition(), has to be aware of this. | ||
| 627 | */ | ||
| 628 | void klp_force_transition(void) | ||
| 629 | { | ||
| 630 | struct task_struct *g, *task; | ||
| 631 | unsigned int cpu; | ||
| 632 | |||
| 633 | pr_warn("forcing remaining tasks to the patched state\n"); | ||
| 634 | |||
| 635 | read_lock(&tasklist_lock); | ||
| 636 | for_each_process_thread(g, task) | ||
| 637 | klp_update_patch_state(task); | ||
| 638 | read_unlock(&tasklist_lock); | ||
| 639 | |||
| 640 | for_each_possible_cpu(cpu) | ||
| 641 | klp_update_patch_state(idle_task(cpu)); | ||
| 642 | |||
| 643 | klp_forced = true; | ||
| 644 | } | ||
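With the immediate flag gone from both the per-patch and per-function paths above, every live patch now goes through the consistency-model transition, and a patch description simply omits the flag. A hedged sketch modelled on the in-tree livepatch sample; the patched symbol and replacement body are illustrative, and registration/enable calls are omitted:

```c
#include <linux/module.h>
#include <linux/livepatch.h>
#include <linux/seq_file.h>

static int livepatch_cmdline_proc_show(struct seq_file *m, void *v)
{
	seq_puts(m, "patched cmdline output\n");
	return 0;
}

static struct klp_func funcs[] = {
	{
		.old_name = "cmdline_proc_show",
		.new_func = livepatch_cmdline_proc_show,
	}, { }
};

static struct klp_object objs[] = {
	{
		/* NULL name means the function lives in vmlinux */
		.funcs = funcs,
	}, { }
};

static struct klp_patch patch = {
	.mod = THIS_MODULE,
	.objs = objs,
	/* no .immediate field any more; the consistency model always applies */
};
```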
diff --git a/kernel/livepatch/transition.h b/kernel/livepatch/transition.h index 0f6e27c481f9..f9d0bc016067 100644 --- a/kernel/livepatch/transition.h +++ b/kernel/livepatch/transition.h | |||
| @@ -11,5 +11,7 @@ void klp_cancel_transition(void); | |||
| 11 | void klp_start_transition(void); | 11 | void klp_start_transition(void); |
| 12 | void klp_try_complete_transition(void); | 12 | void klp_try_complete_transition(void); |
| 13 | void klp_reverse_transition(void); | 13 | void klp_reverse_transition(void); |
| 14 | void klp_send_signals(void); | ||
| 15 | void klp_force_transition(void); | ||
| 14 | 16 | ||
| 15 | #endif /* _LIVEPATCH_TRANSITION_H */ | 17 | #endif /* _LIVEPATCH_TRANSITION_H */ |
diff --git a/kernel/memremap.c b/kernel/memremap.c index 403ab9cdb949..4849be5f9b3c 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c | |||
| @@ -188,13 +188,6 @@ static RADIX_TREE(pgmap_radix, GFP_KERNEL); | |||
| 188 | #define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1) | 188 | #define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1) |
| 189 | #define SECTION_SIZE (1UL << PA_SECTION_SHIFT) | 189 | #define SECTION_SIZE (1UL << PA_SECTION_SHIFT) |
| 190 | 190 | ||
| 191 | struct page_map { | ||
| 192 | struct resource res; | ||
| 193 | struct percpu_ref *ref; | ||
| 194 | struct dev_pagemap pgmap; | ||
| 195 | struct vmem_altmap altmap; | ||
| 196 | }; | ||
| 197 | |||
| 198 | static unsigned long order_at(struct resource *res, unsigned long pgoff) | 191 | static unsigned long order_at(struct resource *res, unsigned long pgoff) |
| 199 | { | 192 | { |
| 200 | unsigned long phys_pgoff = PHYS_PFN(res->start) + pgoff; | 193 | unsigned long phys_pgoff = PHYS_PFN(res->start) + pgoff; |
| @@ -248,34 +241,36 @@ int device_private_entry_fault(struct vm_area_struct *vma, | |||
| 248 | EXPORT_SYMBOL(device_private_entry_fault); | 241 | EXPORT_SYMBOL(device_private_entry_fault); |
| 249 | #endif /* CONFIG_DEVICE_PRIVATE */ | 242 | #endif /* CONFIG_DEVICE_PRIVATE */ |
| 250 | 243 | ||
| 251 | static void pgmap_radix_release(struct resource *res) | 244 | static void pgmap_radix_release(struct resource *res, unsigned long end_pgoff) |
| 252 | { | 245 | { |
| 253 | unsigned long pgoff, order; | 246 | unsigned long pgoff, order; |
| 254 | 247 | ||
| 255 | mutex_lock(&pgmap_lock); | 248 | mutex_lock(&pgmap_lock); |
| 256 | foreach_order_pgoff(res, order, pgoff) | 249 | foreach_order_pgoff(res, order, pgoff) { |
| 250 | if (pgoff >= end_pgoff) | ||
| 251 | break; | ||
| 257 | radix_tree_delete(&pgmap_radix, PHYS_PFN(res->start) + pgoff); | 252 | radix_tree_delete(&pgmap_radix, PHYS_PFN(res->start) + pgoff); |
| 253 | } | ||
| 258 | mutex_unlock(&pgmap_lock); | 254 | mutex_unlock(&pgmap_lock); |
| 259 | 255 | ||
| 260 | synchronize_rcu(); | 256 | synchronize_rcu(); |
| 261 | } | 257 | } |
| 262 | 258 | ||
| 263 | static unsigned long pfn_first(struct page_map *page_map) | 259 | static unsigned long pfn_first(struct dev_pagemap *pgmap) |
| 264 | { | 260 | { |
| 265 | struct dev_pagemap *pgmap = &page_map->pgmap; | 261 | const struct resource *res = &pgmap->res; |
| 266 | const struct resource *res = &page_map->res; | 262 | struct vmem_altmap *altmap = &pgmap->altmap; |
| 267 | struct vmem_altmap *altmap = pgmap->altmap; | ||
| 268 | unsigned long pfn; | 263 | unsigned long pfn; |
| 269 | 264 | ||
| 270 | pfn = res->start >> PAGE_SHIFT; | 265 | pfn = res->start >> PAGE_SHIFT; |
| 271 | if (altmap) | 266 | if (pgmap->altmap_valid) |
| 272 | pfn += vmem_altmap_offset(altmap); | 267 | pfn += vmem_altmap_offset(altmap); |
| 273 | return pfn; | 268 | return pfn; |
| 274 | } | 269 | } |
| 275 | 270 | ||
| 276 | static unsigned long pfn_end(struct page_map *page_map) | 271 | static unsigned long pfn_end(struct dev_pagemap *pgmap) |
| 277 | { | 272 | { |
| 278 | const struct resource *res = &page_map->res; | 273 | const struct resource *res = &pgmap->res; |
| 279 | 274 | ||
| 280 | return (res->start + resource_size(res)) >> PAGE_SHIFT; | 275 | return (res->start + resource_size(res)) >> PAGE_SHIFT; |
| 281 | } | 276 | } |
| @@ -283,15 +278,15 @@ static unsigned long pfn_end(struct page_map *page_map) | |||
| 283 | #define for_each_device_pfn(pfn, map) \ | 278 | #define for_each_device_pfn(pfn, map) \ |
| 284 | for (pfn = pfn_first(map); pfn < pfn_end(map); pfn++) | 279 | for (pfn = pfn_first(map); pfn < pfn_end(map); pfn++) |
| 285 | 280 | ||
| 286 | static void devm_memremap_pages_release(struct device *dev, void *data) | 281 | static void devm_memremap_pages_release(void *data) |
| 287 | { | 282 | { |
| 288 | struct page_map *page_map = data; | 283 | struct dev_pagemap *pgmap = data; |
| 289 | struct resource *res = &page_map->res; | 284 | struct device *dev = pgmap->dev; |
| 285 | struct resource *res = &pgmap->res; | ||
| 290 | resource_size_t align_start, align_size; | 286 | resource_size_t align_start, align_size; |
| 291 | struct dev_pagemap *pgmap = &page_map->pgmap; | ||
| 292 | unsigned long pfn; | 287 | unsigned long pfn; |
| 293 | 288 | ||
| 294 | for_each_device_pfn(pfn, page_map) | 289 | for_each_device_pfn(pfn, pgmap) |
| 295 | put_page(pfn_to_page(pfn)); | 290 | put_page(pfn_to_page(pfn)); |
| 296 | 291 | ||
| 297 | if (percpu_ref_tryget_live(pgmap->ref)) { | 292 | if (percpu_ref_tryget_live(pgmap->ref)) { |
| @@ -301,56 +296,51 @@ static void devm_memremap_pages_release(struct device *dev, void *data) | |||
| 301 | 296 | ||
| 302 | /* pages are dead and unused, undo the arch mapping */ | 297 | /* pages are dead and unused, undo the arch mapping */ |
| 303 | align_start = res->start & ~(SECTION_SIZE - 1); | 298 | align_start = res->start & ~(SECTION_SIZE - 1); |
| 304 | align_size = ALIGN(resource_size(res), SECTION_SIZE); | 299 | align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) |
| 300 | - align_start; | ||
| 305 | 301 | ||
| 306 | mem_hotplug_begin(); | 302 | mem_hotplug_begin(); |
| 307 | arch_remove_memory(align_start, align_size); | 303 | arch_remove_memory(align_start, align_size, pgmap->altmap_valid ? |
| 304 | &pgmap->altmap : NULL); | ||
| 308 | mem_hotplug_done(); | 305 | mem_hotplug_done(); |
| 309 | 306 | ||
| 310 | untrack_pfn(NULL, PHYS_PFN(align_start), align_size); | 307 | untrack_pfn(NULL, PHYS_PFN(align_start), align_size); |
| 311 | pgmap_radix_release(res); | 308 | pgmap_radix_release(res, -1); |
| 312 | dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc, | 309 | dev_WARN_ONCE(dev, pgmap->altmap.alloc, |
| 313 | "%s: failed to free all reserved pages\n", __func__); | 310 | "%s: failed to free all reserved pages\n", __func__); |
| 314 | } | ||
| 315 | |||
| 316 | /* assumes rcu_read_lock() held at entry */ | ||
| 317 | struct dev_pagemap *find_dev_pagemap(resource_size_t phys) | ||
| 318 | { | ||
| 319 | struct page_map *page_map; | ||
| 320 | |||
| 321 | WARN_ON_ONCE(!rcu_read_lock_held()); | ||
| 322 | |||
| 323 | page_map = radix_tree_lookup(&pgmap_radix, PHYS_PFN(phys)); | ||
| 324 | return page_map ? &page_map->pgmap : NULL; | ||
| 325 | } | 311 | } |
| 326 | 312 | ||
| 327 | /** | 313 | /** |
| 328 | * devm_memremap_pages - remap and provide memmap backing for the given resource | 314 | * devm_memremap_pages - remap and provide memmap backing for the given resource |
| 329 | * @dev: hosting device for @res | 315 | * @dev: hosting device for @res |
| 330 | * @res: "host memory" address range | 316 | * @pgmap: pointer to a struct dev_pgmap |
| 331 | * @ref: a live per-cpu reference count | ||
| 332 | * @altmap: optional descriptor for allocating the memmap from @res | ||
| 333 | * | 317 | * |
| 334 | * Notes: | 318 | * Notes: |
| 335 | * 1/ @ref must be 'live' on entry and 'dead' before devm_memunmap_pages() time | 319 | * 1/ At a minimum the res, ref and type members of @pgmap must be initialized |
| 336 | * (or devm release event). The expected order of events is that @ref has | 320 | * by the caller before passing it to this function |
| 321 | * | ||
| 322 | * 2/ The altmap field may optionally be initialized, in which case altmap_valid | ||
| 323 | * must be set to true | ||
| 324 | * | ||
| 325 | * 3/ pgmap.ref must be 'live' on entry and 'dead' before devm_memunmap_pages() | ||
| 326 | * time (or devm release event). The expected order of events is that ref has | ||
| 337 | * been through percpu_ref_kill() before devm_memremap_pages_release(). The | 327 | * been through percpu_ref_kill() before devm_memremap_pages_release(). The |
| 338 | * wait for the completion of all references being dropped and | 328 | * wait for the completion of all references being dropped and |
| 339 | * percpu_ref_exit() must occur after devm_memremap_pages_release(). | 329 | * percpu_ref_exit() must occur after devm_memremap_pages_release(). |
| 340 | * | 330 | * |
| 341 | * 2/ @res is expected to be a host memory range that could feasibly be | 331 | * 4/ res is expected to be a host memory range that could feasibly be |
| 342 | * treated as a "System RAM" range, i.e. not a device mmio range, but | 332 | * treated as a "System RAM" range, i.e. not a device mmio range, but |
| 343 | * this is not enforced. | 333 | * this is not enforced. |
| 344 | */ | 334 | */ |
| 345 | void *devm_memremap_pages(struct device *dev, struct resource *res, | 335 | void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) |
| 346 | struct percpu_ref *ref, struct vmem_altmap *altmap) | ||
| 347 | { | 336 | { |
| 348 | resource_size_t align_start, align_size, align_end; | 337 | resource_size_t align_start, align_size, align_end; |
| 338 | struct vmem_altmap *altmap = pgmap->altmap_valid ? | ||
| 339 | &pgmap->altmap : NULL; | ||
| 349 | unsigned long pfn, pgoff, order; | 340 | unsigned long pfn, pgoff, order; |
| 350 | pgprot_t pgprot = PAGE_KERNEL; | 341 | pgprot_t pgprot = PAGE_KERNEL; |
| 351 | struct dev_pagemap *pgmap; | ||
| 352 | struct page_map *page_map; | ||
| 353 | int error, nid, is_ram, i = 0; | 342 | int error, nid, is_ram, i = 0; |
| 343 | struct resource *res = &pgmap->res; | ||
| 354 | 344 | ||
| 355 | align_start = res->start & ~(SECTION_SIZE - 1); | 345 | align_start = res->start & ~(SECTION_SIZE - 1); |
| 356 | align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) | 346 | align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) |
| @@ -367,47 +357,18 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, | |||
| 367 | if (is_ram == REGION_INTERSECTS) | 357 | if (is_ram == REGION_INTERSECTS) |
| 368 | return __va(res->start); | 358 | return __va(res->start); |
| 369 | 359 | ||
| 370 | if (!ref) | 360 | if (!pgmap->ref) |
| 371 | return ERR_PTR(-EINVAL); | 361 | return ERR_PTR(-EINVAL); |
| 372 | 362 | ||
| 373 | page_map = devres_alloc_node(devm_memremap_pages_release, | ||
| 374 | sizeof(*page_map), GFP_KERNEL, dev_to_node(dev)); | ||
| 375 | if (!page_map) | ||
| 376 | return ERR_PTR(-ENOMEM); | ||
| 377 | pgmap = &page_map->pgmap; | ||
| 378 | |||
| 379 | memcpy(&page_map->res, res, sizeof(*res)); | ||
| 380 | |||
| 381 | pgmap->dev = dev; | 363 | pgmap->dev = dev; |
| 382 | if (altmap) { | ||
| 383 | memcpy(&page_map->altmap, altmap, sizeof(*altmap)); | ||
| 384 | pgmap->altmap = &page_map->altmap; | ||
| 385 | } | ||
| 386 | pgmap->ref = ref; | ||
| 387 | pgmap->res = &page_map->res; | ||
| 388 | pgmap->type = MEMORY_DEVICE_HOST; | ||
| 389 | pgmap->page_fault = NULL; | ||
| 390 | pgmap->page_free = NULL; | ||
| 391 | pgmap->data = NULL; | ||
| 392 | 364 | ||
| 393 | mutex_lock(&pgmap_lock); | 365 | mutex_lock(&pgmap_lock); |
| 394 | error = 0; | 366 | error = 0; |
| 395 | align_end = align_start + align_size - 1; | 367 | align_end = align_start + align_size - 1; |
| 396 | 368 | ||
| 397 | foreach_order_pgoff(res, order, pgoff) { | 369 | foreach_order_pgoff(res, order, pgoff) { |
| 398 | struct dev_pagemap *dup; | ||
| 399 | |||
| 400 | rcu_read_lock(); | ||
| 401 | dup = find_dev_pagemap(res->start + PFN_PHYS(pgoff)); | ||
| 402 | rcu_read_unlock(); | ||
| 403 | if (dup) { | ||
| 404 | dev_err(dev, "%s: %pr collides with mapping for %s\n", | ||
| 405 | __func__, res, dev_name(dup->dev)); | ||
| 406 | error = -EBUSY; | ||
| 407 | break; | ||
| 408 | } | ||
| 409 | error = __radix_tree_insert(&pgmap_radix, | 370 | error = __radix_tree_insert(&pgmap_radix, |
| 410 | PHYS_PFN(res->start) + pgoff, order, page_map); | 371 | PHYS_PFN(res->start) + pgoff, order, pgmap); |
| 411 | if (error) { | 372 | if (error) { |
| 412 | dev_err(dev, "%s: failed: %d\n", __func__, error); | 373 | dev_err(dev, "%s: failed: %d\n", __func__, error); |
| 413 | break; | 374 | break; |
| @@ -427,16 +388,16 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, | |||
| 427 | goto err_pfn_remap; | 388 | goto err_pfn_remap; |
| 428 | 389 | ||
| 429 | mem_hotplug_begin(); | 390 | mem_hotplug_begin(); |
| 430 | error = arch_add_memory(nid, align_start, align_size, false); | 391 | error = arch_add_memory(nid, align_start, align_size, altmap, false); |
| 431 | if (!error) | 392 | if (!error) |
| 432 | move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], | 393 | move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], |
| 433 | align_start >> PAGE_SHIFT, | 394 | align_start >> PAGE_SHIFT, |
| 434 | align_size >> PAGE_SHIFT); | 395 | align_size >> PAGE_SHIFT, altmap); |
| 435 | mem_hotplug_done(); | 396 | mem_hotplug_done(); |
| 436 | if (error) | 397 | if (error) |
| 437 | goto err_add_memory; | 398 | goto err_add_memory; |
| 438 | 399 | ||
| 439 | for_each_device_pfn(pfn, page_map) { | 400 | for_each_device_pfn(pfn, pgmap) { |
| 440 | struct page *page = pfn_to_page(pfn); | 401 | struct page *page = pfn_to_page(pfn); |
| 441 | 402 | ||
| 442 | /* | 403 | /* |
| @@ -447,19 +408,21 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, | |||
| 447 | */ | 408 | */ |
| 448 | list_del(&page->lru); | 409 | list_del(&page->lru); |
| 449 | page->pgmap = pgmap; | 410 | page->pgmap = pgmap; |
| 450 | percpu_ref_get(ref); | 411 | percpu_ref_get(pgmap->ref); |
| 451 | if (!(++i % 1024)) | 412 | if (!(++i % 1024)) |
| 452 | cond_resched(); | 413 | cond_resched(); |
| 453 | } | 414 | } |
| 454 | devres_add(dev, page_map); | 415 | |
| 416 | devm_add_action(dev, devm_memremap_pages_release, pgmap); | ||
| 417 | |||
| 455 | return __va(res->start); | 418 | return __va(res->start); |
| 456 | 419 | ||
| 457 | err_add_memory: | 420 | err_add_memory: |
| 458 | untrack_pfn(NULL, PHYS_PFN(align_start), align_size); | 421 | untrack_pfn(NULL, PHYS_PFN(align_start), align_size); |
| 459 | err_pfn_remap: | 422 | err_pfn_remap: |
| 460 | err_radix: | 423 | err_radix: |
| 461 | pgmap_radix_release(res); | 424 | pgmap_radix_release(res, pgoff); |
| 462 | devres_free(page_map); | 425 | devres_free(pgmap); |
| 463 | return ERR_PTR(error); | 426 | return ERR_PTR(error); |
| 464 | } | 427 | } |
| 465 | EXPORT_SYMBOL(devm_memremap_pages); | 428 | EXPORT_SYMBOL(devm_memremap_pages); |
| @@ -475,34 +438,39 @@ void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns) | |||
| 475 | altmap->alloc -= nr_pfns; | 438 | altmap->alloc -= nr_pfns; |
| 476 | } | 439 | } |
| 477 | 440 | ||
| 478 | struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) | 441 | /** |
| 442 | * get_dev_pagemap() - take a new live reference on the dev_pagemap for @pfn | ||
| 443 | * @pfn: page frame number to look up a dev_pagemap for | ||
| 444 | * @pgmap: optional known pgmap that already has a reference | ||
| 445 | * | ||
| 446 | * If @pgmap is non-NULL and covers @pfn it will be returned as-is. If @pgmap | ||
| 447 | * is non-NULL but does not cover @pfn the reference to it will be released. | ||
| 448 | */ | ||
| 449 | struct dev_pagemap *get_dev_pagemap(unsigned long pfn, | ||
| 450 | struct dev_pagemap *pgmap) | ||
| 479 | { | 451 | { |
| 480 | /* | 452 | resource_size_t phys = PFN_PHYS(pfn); |
| 481 | * 'memmap_start' is the virtual address for the first "struct | ||
| 482 | * page" in this range of the vmemmap array. In the case of | ||
| 483 | * CONFIG_SPARSEMEM_VMEMMAP a page_to_pfn conversion is simple | ||
| 484 | * pointer arithmetic, so we can perform this to_vmem_altmap() | ||
| 485 | * conversion without concern for the initialization state of | ||
| 486 | * the struct page fields. | ||
| 487 | */ | ||
| 488 | struct page *page = (struct page *) memmap_start; | ||
| 489 | struct dev_pagemap *pgmap; | ||
| 490 | 453 | ||
| 491 | /* | 454 | /* |
| 492 | * Unconditionally retrieve a dev_pagemap associated with the | 455 | * In the cached case we're already holding a live reference. |
| 493 | * given physical address, this is only for use in the | ||
| 494 | * arch_{add|remove}_memory() for setting up and tearing down | ||
| 495 | * the memmap. | ||
| 496 | */ | 456 | */ |
| 457 | if (pgmap) { | ||
| 458 | if (phys >= pgmap->res.start && phys <= pgmap->res.end) | ||
| 459 | return pgmap; | ||
| 460 | put_dev_pagemap(pgmap); | ||
| 461 | } | ||
| 462 | |||
| 463 | /* fall back to slow path lookup */ | ||
| 497 | rcu_read_lock(); | 464 | rcu_read_lock(); |
| 498 | pgmap = find_dev_pagemap(__pfn_to_phys(page_to_pfn(page))); | 465 | pgmap = radix_tree_lookup(&pgmap_radix, PHYS_PFN(phys)); |
| 466 | if (pgmap && !percpu_ref_tryget_live(pgmap->ref)) | ||
| 467 | pgmap = NULL; | ||
| 499 | rcu_read_unlock(); | 468 | rcu_read_unlock(); |
| 500 | 469 | ||
| 501 | return pgmap ? pgmap->altmap : NULL; | 470 | return pgmap; |
| 502 | } | 471 | } |
| 503 | #endif /* CONFIG_ZONE_DEVICE */ | 472 | #endif /* CONFIG_ZONE_DEVICE */ |
| 504 | 473 | ||
| 505 | |||
| 506 | #if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC) | 474 | #if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC) |
| 507 | void put_zone_device_private_or_public_page(struct page *page) | 475 | void put_zone_device_private_or_public_page(struct page *page) |
| 508 | { | 476 | { |
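The reworked devm_memremap_pages() takes a caller-owned struct dev_pagemap instead of allocating its own page_map wrapper, so the driver fills in res, ref and (optionally) altmap plus altmap_valid up front. A hedged sketch of the new calling convention; the surrounding driver helper and its names are hypothetical:

```c
#include <linux/device.h>
#include <linux/err.h>
#include <linux/ioport.h>
#include <linux/memremap.h>
#include <linux/percpu-refcount.h>

/* Hypothetical driver helper: map device memory with the new API.
 * @ref must already be a live percpu_ref, per note 3/ above.
 */
static void *demo_map_pages(struct device *dev, struct resource *res,
			    struct percpu_ref *ref)
{
	struct dev_pagemap *pgmap;

	pgmap = devm_kzalloc(dev, sizeof(*pgmap), GFP_KERNEL);
	if (!pgmap)
		return ERR_PTR(-ENOMEM);

	pgmap->res = *res;		/* "host memory" range, note 4/ */
	pgmap->ref = ref;
	pgmap->altmap_valid = false;	/* no altmap in this sketch */
	/* pgmap->type is left at its zeroed default here; real callers
	 * set the MEMORY_DEVICE_* type required by note 1/ above.
	 */

	return devm_memremap_pages(dev, pgmap);
}
```

Lookups then go through the new get_dev_pagemap(pfn, pgmap) shown above, which returns a pgmap with a live reference taken and releases the reference on a stale cached one.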
diff --git a/kernel/module.c b/kernel/module.c index 09e48eee4d55..ccdf24c4949e 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
| @@ -3129,7 +3129,11 @@ static int find_module_sections(struct module *mod, struct load_info *info) | |||
| 3129 | sizeof(*mod->ftrace_callsites), | 3129 | sizeof(*mod->ftrace_callsites), |
| 3130 | &mod->num_ftrace_callsites); | 3130 | &mod->num_ftrace_callsites); |
| 3131 | #endif | 3131 | #endif |
| 3132 | 3132 | #ifdef CONFIG_FUNCTION_ERROR_INJECTION | |
| 3133 | mod->ei_funcs = section_objs(info, "_error_injection_whitelist", | ||
| 3134 | sizeof(*mod->ei_funcs), | ||
| 3135 | &mod->num_ei_funcs); | ||
| 3136 | #endif | ||
| 3133 | mod->extable = section_objs(info, "__ex_table", | 3137 | mod->extable = section_objs(info, "__ex_table", |
| 3134 | sizeof(*mod->extable), &mod->num_exentries); | 3138 | sizeof(*mod->extable), &mod->num_exentries); |
| 3135 | 3139 | ||
| @@ -3949,6 +3953,12 @@ static const char *get_ksymbol(struct module *mod, | |||
| 3949 | return symname(kallsyms, best); | 3953 | return symname(kallsyms, best); |
| 3950 | } | 3954 | } |
| 3951 | 3955 | ||
| 3956 | void * __weak dereference_module_function_descriptor(struct module *mod, | ||
| 3957 | void *ptr) | ||
| 3958 | { | ||
| 3959 | return ptr; | ||
| 3960 | } | ||
| 3961 | |||
| 3952 | /* For kallsyms to ask for address resolution. NULL means not found. Careful | 3962 | /* For kallsyms to ask for address resolution. NULL means not found. Careful |
| 3953 | * not to lock to avoid deadlock on oopses, simply disable preemption. */ | 3963 | * not to lock to avoid deadlock on oopses, simply disable preemption. */ |
| 3954 | const char *module_address_lookup(unsigned long addr, | 3964 | const char *module_address_lookup(unsigned long addr, |
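The module loader now collects a per-module "_error_injection_whitelist" section into mod->ei_funcs. Functions land in that section via the error-injection annotation introduced by the same series; a hedged sketch, assuming the ALLOW_ERROR_INJECTION() helper and its <linux/error-injection.h> header from that series, with a hypothetical module function:

```c
#include <linux/errno.h>
#include <linux/error-injection.h>
#include <linux/module.h>

/* Hypothetical module function opted in to error injection: with
 * CONFIG_FUNCTION_ERROR_INJECTION, tooling may force it to return an
 * injected -errno instead of running its body.
 */
static int demo_prepare(int arg)
{
	return arg ? 0 : -EINVAL;
}
ALLOW_ERROR_INJECTION(demo_prepare, ERRNO);
```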
diff --git a/kernel/padata.c b/kernel/padata.c index 57c0074d50cc..d568cc56405f 100644 --- a/kernel/padata.c +++ b/kernel/padata.c | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0 | ||
| 1 | /* | 2 | /* |
| 2 | * padata.c - generic interface to process data streams in parallel | 3 | * padata.c - generic interface to process data streams in parallel |
| 3 | * | 4 | * |
diff --git a/kernel/power/power.h b/kernel/power/power.h index f29cd178df90..9e58bdc8a562 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h | |||
| @@ -104,9 +104,6 @@ extern int in_suspend; | |||
| 104 | extern dev_t swsusp_resume_device; | 104 | extern dev_t swsusp_resume_device; |
| 105 | extern sector_t swsusp_resume_block; | 105 | extern sector_t swsusp_resume_block; |
| 106 | 106 | ||
| 107 | extern asmlinkage int swsusp_arch_suspend(void); | ||
| 108 | extern asmlinkage int swsusp_arch_resume(void); | ||
| 109 | |||
| 110 | extern int create_basic_memory_bitmaps(void); | 107 | extern int create_basic_memory_bitmaps(void); |
| 111 | extern void free_basic_memory_bitmaps(void); | 108 | extern void free_basic_memory_bitmaps(void); |
| 112 | extern int hibernate_preallocate_memory(void); | 109 | extern int hibernate_preallocate_memory(void); |
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index b9006617710f..db4b9b8929eb 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c | |||
| @@ -131,13 +131,10 @@ static int __init control_devkmsg(char *str) | |||
| 131 | /* | 131 | /* |
| 132 | * Set sysctl string accordingly: | 132 | * Set sysctl string accordingly: |
| 133 | */ | 133 | */ |
| 134 | if (devkmsg_log == DEVKMSG_LOG_MASK_ON) { | 134 | if (devkmsg_log == DEVKMSG_LOG_MASK_ON) |
| 135 | memset(devkmsg_log_str, 0, DEVKMSG_STR_MAX_SIZE); | 135 | strcpy(devkmsg_log_str, "on"); |
| 136 | strncpy(devkmsg_log_str, "on", 2); | 136 | else if (devkmsg_log == DEVKMSG_LOG_MASK_OFF) |
| 137 | } else if (devkmsg_log == DEVKMSG_LOG_MASK_OFF) { | 137 | strcpy(devkmsg_log_str, "off"); |
| 138 | memset(devkmsg_log_str, 0, DEVKMSG_STR_MAX_SIZE); | ||
| 139 | strncpy(devkmsg_log_str, "off", 3); | ||
| 140 | } | ||
| 141 | /* else "ratelimit" which is set by default. */ | 138 | /* else "ratelimit" which is set by default. */ |
| 142 | 139 | ||
| 143 | /* | 140 | /* |
| @@ -277,6 +274,13 @@ EXPORT_SYMBOL(console_set_on_cmdline); | |||
| 277 | /* Flag: console code may call schedule() */ | 274 | /* Flag: console code may call schedule() */ |
| 278 | static int console_may_schedule; | 275 | static int console_may_schedule; |
| 279 | 276 | ||
| 277 | enum con_msg_format_flags { | ||
| 278 | MSG_FORMAT_DEFAULT = 0, | ||
| 279 | MSG_FORMAT_SYSLOG = (1 << 0), | ||
| 280 | }; | ||
| 281 | |||
| 282 | static int console_msg_format = MSG_FORMAT_DEFAULT; | ||
| 283 | |||
| 280 | /* | 284 | /* |
| 281 | * The printk log buffer consists of a chain of concatenated variable | 285 | * The printk log buffer consists of a chain of concatenated variable |
| 282 | * length records. Every record starts with a record header, containing | 286 | * length records. Every record starts with a record header, containing |
| @@ -920,10 +924,10 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) | |||
| 920 | return ret; | 924 | return ret; |
| 921 | } | 925 | } |
| 922 | 926 | ||
| 923 | static unsigned int devkmsg_poll(struct file *file, poll_table *wait) | 927 | static __poll_t devkmsg_poll(struct file *file, poll_table *wait) |
| 924 | { | 928 | { |
| 925 | struct devkmsg_user *user = file->private_data; | 929 | struct devkmsg_user *user = file->private_data; |
| 926 | int ret = 0; | 930 | __poll_t ret = 0; |
| 927 | 931 | ||
| 928 | if (!user) | 932 | if (!user) |
| 929 | return POLLERR|POLLNVAL; | 933 | return POLLERR|POLLNVAL; |
| @@ -1544,6 +1548,146 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) | |||
| 1544 | } | 1548 | } |
| 1545 | 1549 | ||
| 1546 | /* | 1550 | /* |
| 1551 | * Special console_lock variants that help to reduce the risk of soft-lockups. | ||
| 1552 | * They allow to pass console_lock to another printk() call using a busy wait. | ||
| 1553 | */ | ||
| 1554 | |||
| 1555 | #ifdef CONFIG_LOCKDEP | ||
| 1556 | static struct lockdep_map console_owner_dep_map = { | ||
| 1557 | .name = "console_owner" | ||
| 1558 | }; | ||
| 1559 | #endif | ||
| 1560 | |||
| 1561 | static DEFINE_RAW_SPINLOCK(console_owner_lock); | ||
| 1562 | static struct task_struct *console_owner; | ||
| 1563 | static bool console_waiter; | ||
| 1564 | |||
| 1565 | /** | ||
| 1566 | * console_lock_spinning_enable - mark beginning of code where another | ||
| 1567 | * thread might safely busy wait | ||
| 1568 | * | ||
| 1569 | * This basically converts console_lock into a spinlock. This marks | ||
| 1570 | * the section where the console_lock owner can not sleep, because | ||
| 1571 | * there may be a waiter spinning (like a spinlock). Also it must be | ||
| 1572 | * ready to hand over the lock at the end of the section. | ||
| 1573 | */ | ||
| 1574 | static void console_lock_spinning_enable(void) | ||
| 1575 | { | ||
| 1576 | raw_spin_lock(&console_owner_lock); | ||
| 1577 | console_owner = current; | ||
| 1578 | raw_spin_unlock(&console_owner_lock); | ||
| 1579 | |||
| 1580 | /* The waiter may spin on us after setting console_owner */ | ||
| 1581 | spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_); | ||
| 1582 | } | ||
| 1583 | |||
| 1584 | /** | ||
| 1585 | * console_lock_spinning_disable_and_check - mark end of code where another | ||
| 1586 | * thread was able to busy wait and check if there is a waiter | ||
| 1587 | * | ||
| 1588 | * This is called at the end of the section where spinning is allowed. | ||
| 1589 | * It has two functions. First, it is a signal that it is no longer | ||
| 1590 | * safe to start busy waiting for the lock. Second, it checks if | ||
| 1591 | * there is a busy waiter and passes the lock rights to her. | ||
| 1592 | * | ||
| 1593 | * Important: Callers lose the lock if there was a busy waiter. | ||
| 1594 | * They must not touch items synchronized by console_lock | ||
| 1595 | * in this case. | ||
| 1596 | * | ||
| 1597 | * Return: 1 if the lock rights were passed, 0 otherwise. | ||
| 1598 | */ | ||
| 1599 | static int console_lock_spinning_disable_and_check(void) | ||
| 1600 | { | ||
| 1601 | int waiter; | ||
| 1602 | |||
| 1603 | raw_spin_lock(&console_owner_lock); | ||
| 1604 | waiter = READ_ONCE(console_waiter); | ||
| 1605 | console_owner = NULL; | ||
| 1606 | raw_spin_unlock(&console_owner_lock); | ||
| 1607 | |||
| 1608 | if (!waiter) { | ||
| 1609 | spin_release(&console_owner_dep_map, 1, _THIS_IP_); | ||
| 1610 | return 0; | ||
| 1611 | } | ||
| 1612 | |||
| 1613 | /* The waiter is now free to continue */ | ||
| 1614 | WRITE_ONCE(console_waiter, false); | ||
| 1615 | |||
| 1616 | spin_release(&console_owner_dep_map, 1, _THIS_IP_); | ||
| 1617 | |||
| 1618 | /* | ||
| 1619 | * Hand off console_lock to waiter. The waiter will perform | ||
| 1620 | * the up(). After this, the waiter is the console_lock owner. | ||
| 1621 | */ | ||
| 1622 | mutex_release(&console_lock_dep_map, 1, _THIS_IP_); | ||
| 1623 | return 1; | ||
| 1624 | } | ||
| 1625 | |||
| 1626 | /** | ||
| 1627 | * console_trylock_spinning - try to get console_lock by busy waiting | ||
| 1628 | * | ||
| 1629 | * This allows to busy wait for the console_lock when the current | ||
| 1630 | * owner is running in specially marked sections. It means that | ||
| 1631 | * the current owner is running and cannot reschedule until it | ||
| 1632 | * is ready to lose the lock. | ||
| 1633 | * | ||
| 1634 | * Return: 1 if we got the lock, 0 otherwise | ||
| 1635 | */ | ||
| 1636 | static int console_trylock_spinning(void) | ||
| 1637 | { | ||
| 1638 | struct task_struct *owner = NULL; | ||
| 1639 | bool waiter; | ||
| 1640 | bool spin = false; | ||
| 1641 | unsigned long flags; | ||
| 1642 | |||
| 1643 | if (console_trylock()) | ||
| 1644 | return 1; | ||
| 1645 | |||
| 1646 | printk_safe_enter_irqsave(flags); | ||
| 1647 | |||
| 1648 | raw_spin_lock(&console_owner_lock); | ||
| 1649 | owner = READ_ONCE(console_owner); | ||
| 1650 | waiter = READ_ONCE(console_waiter); | ||
| 1651 | if (!waiter && owner && owner != current) { | ||
| 1652 | WRITE_ONCE(console_waiter, true); | ||
| 1653 | spin = true; | ||
| 1654 | } | ||
| 1655 | raw_spin_unlock(&console_owner_lock); | ||
| 1656 | |||
| 1657 | /* | ||
| 1658 | * If there is an active printk() writing to the | ||
| 1659 | * consoles, instead of having it write our data too, | ||
| 1660 | * see if we can offload that load from the active | ||
| 1661 | * printer, and do some printing ourselves. | ||
| 1662 | * Go into a spin only if there isn't already a waiter | ||
| 1663 | * spinning, and there is an active printer, and | ||
| 1664 | * that active printer isn't us (recursive printk?). | ||
| 1665 | */ | ||
| 1666 | if (!spin) { | ||
| 1667 | printk_safe_exit_irqrestore(flags); | ||
| 1668 | return 0; | ||
| 1669 | } | ||
| 1670 | |||
| 1671 | /* We spin waiting for the owner to release us */ | ||
| 1672 | spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_); | ||
| 1673 | /* Owner will clear console_waiter on hand off */ | ||
| 1674 | while (READ_ONCE(console_waiter)) | ||
| 1675 | cpu_relax(); | ||
| 1676 | spin_release(&console_owner_dep_map, 1, _THIS_IP_); | ||
| 1677 | |||
| 1678 | printk_safe_exit_irqrestore(flags); | ||
| 1679 | /* | ||
| 1680 | * The owner passed the console lock to us. | ||
| 1681 | * Since we did not spin on console lock, annotate | ||
| 1682 | * this as a trylock. Otherwise lockdep will | ||
| 1683 | * complain. | ||
| 1684 | */ | ||
| 1685 | mutex_acquire(&console_lock_dep_map, 0, 1, _THIS_IP_); | ||
| 1686 | |||
| 1687 | return 1; | ||
| 1688 | } | ||
| 1689 | |||
| 1690 | /* | ||
| 1547 | * Call the console drivers, asking them to write out | 1691 | * Call the console drivers, asking them to write out |
| 1548 | * log_buf[start] to log_buf[end - 1]. | 1692 | * log_buf[start] to log_buf[end - 1]. |
| 1549 | * The console_lock must be held. | 1693 | * The console_lock must be held. |
| @@ -1749,12 +1893,19 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
| 1749 | /* If called from the scheduler, we can not call up(). */ | 1893 | /* If called from the scheduler, we can not call up(). */ |
| 1750 | if (!in_sched) { | 1894 | if (!in_sched) { |
| 1751 | /* | 1895 | /* |
| 1896 | * Disable preemption to avoid being preempted while holding | ||
| 1897 | * console_sem which would prevent anyone from printing to | ||
| 1898 | * console | ||
| 1899 | */ | ||
| 1900 | preempt_disable(); | ||
| 1901 | /* | ||
| 1752 | * Try to acquire and then immediately release the console | 1902 | * Try to acquire and then immediately release the console |
| 1753 | * semaphore. The release will print out buffers and wake up | 1903 | * semaphore. The release will print out buffers and wake up |
| 1754 | * /dev/kmsg and syslog() users. | 1904 | * /dev/kmsg and syslog() users. |
| 1755 | */ | 1905 | */ |
| 1756 | if (console_trylock()) | 1906 | if (console_trylock_spinning()) |
| 1757 | console_unlock(); | 1907 | console_unlock(); |
| 1908 | preempt_enable(); | ||
| 1758 | } | 1909 | } |
| 1759 | 1910 | ||
| 1760 | return printed_len; | 1911 | return printed_len; |
| @@ -1855,6 +2006,8 @@ static ssize_t msg_print_ext_header(char *buf, size_t size, | |||
| 1855 | static ssize_t msg_print_ext_body(char *buf, size_t size, | 2006 | static ssize_t msg_print_ext_body(char *buf, size_t size, |
| 1856 | char *dict, size_t dict_len, | 2007 | char *dict, size_t dict_len, |
| 1857 | char *text, size_t text_len) { return 0; } | 2008 | char *text, size_t text_len) { return 0; } |
| 2009 | static void console_lock_spinning_enable(void) { } | ||
| 2010 | static int console_lock_spinning_disable_and_check(void) { return 0; } | ||
| 1858 | static void call_console_drivers(const char *ext_text, size_t ext_len, | 2011 | static void call_console_drivers(const char *ext_text, size_t ext_len, |
| 1859 | const char *text, size_t len) {} | 2012 | const char *text, size_t len) {} |
| 1860 | static size_t msg_print_text(const struct printk_log *msg, | 2013 | static size_t msg_print_text(const struct printk_log *msg, |
| @@ -1913,6 +2066,17 @@ static int __add_preferred_console(char *name, int idx, char *options, | |||
| 1913 | c->index = idx; | 2066 | c->index = idx; |
| 1914 | return 0; | 2067 | return 0; |
| 1915 | } | 2068 | } |
| 2069 | |||
| 2070 | static int __init console_msg_format_setup(char *str) | ||
| 2071 | { | ||
| 2072 | if (!strcmp(str, "syslog")) | ||
| 2073 | console_msg_format = MSG_FORMAT_SYSLOG; | ||
| 2074 | if (!strcmp(str, "default")) | ||
| 2075 | console_msg_format = MSG_FORMAT_DEFAULT; | ||
| 2076 | return 1; | ||
| 2077 | } | ||
| 2078 | __setup("console_msg_format=", console_msg_format_setup); | ||
| 2079 | |||
| 1916 | /* | 2080 | /* |
| 1917 | * Set up a console. Called via do_early_param() in init/main.c | 2081 | * Set up a console. Called via do_early_param() in init/main.c |
| 1918 | * for each "console=" parameter in the boot command line. | 2082 | * for each "console=" parameter in the boot command line. |
| @@ -2069,20 +2233,7 @@ int console_trylock(void) | |||
| 2069 | return 0; | 2233 | return 0; |
| 2070 | } | 2234 | } |
| 2071 | console_locked = 1; | 2235 | console_locked = 1; |
| 2072 | /* | 2236 | console_may_schedule = 0; |
| 2073 | * When PREEMPT_COUNT disabled we can't reliably detect if it's | ||
| 2074 | * safe to schedule (e.g. calling printk while holding a spin_lock), | ||
| 2075 | * because preempt_disable()/preempt_enable() are just barriers there | ||
| 2076 | * and preempt_count() is always 0. | ||
| 2077 | * | ||
| 2078 | * RCU read sections have a separate preemption counter when | ||
| 2079 | * PREEMPT_RCU enabled thus we must take extra care and check | ||
| 2080 | * rcu_preempt_depth(), otherwise RCU read sections modify | ||
| 2081 | * preempt_count(). | ||
| 2082 | */ | ||
| 2083 | console_may_schedule = !oops_in_progress && | ||
| 2084 | preemptible() && | ||
| 2085 | !rcu_preempt_depth(); | ||
| 2086 | return 1; | 2237 | return 1; |
| 2087 | } | 2238 | } |
| 2088 | EXPORT_SYMBOL(console_trylock); | 2239 | EXPORT_SYMBOL(console_trylock); |
| @@ -2215,7 +2366,10 @@ skip: | |||
| 2215 | goto skip; | 2366 | goto skip; |
| 2216 | } | 2367 | } |
| 2217 | 2368 | ||
| 2218 | len += msg_print_text(msg, false, text + len, sizeof(text) - len); | 2369 | len += msg_print_text(msg, |
| 2370 | console_msg_format & MSG_FORMAT_SYSLOG, | ||
| 2371 | text + len, | ||
| 2372 | sizeof(text) - len); | ||
| 2219 | if (nr_ext_console_drivers) { | 2373 | if (nr_ext_console_drivers) { |
| 2220 | ext_len = msg_print_ext_header(ext_text, | 2374 | ext_len = msg_print_ext_header(ext_text, |
| 2221 | sizeof(ext_text), | 2375 | sizeof(ext_text), |
| @@ -2229,14 +2383,29 @@ skip: | |||
| 2229 | console_seq++; | 2383 | console_seq++; |
| 2230 | raw_spin_unlock(&logbuf_lock); | 2384 | raw_spin_unlock(&logbuf_lock); |
| 2231 | 2385 | ||
| 2386 | /* | ||
| 2387 | * While actively printing out messages, if another printk() | ||
| 2388 | * were to occur on another CPU, it may wait for this one to | ||
| 2389 | * finish. This task can not be preempted if there is a | ||
| 2390 | * waiter waiting to take over. | ||
| 2391 | */ | ||
| 2392 | console_lock_spinning_enable(); | ||
| 2393 | |||
| 2232 | stop_critical_timings(); /* don't trace print latency */ | 2394 | stop_critical_timings(); /* don't trace print latency */ |
| 2233 | call_console_drivers(ext_text, ext_len, text, len); | 2395 | call_console_drivers(ext_text, ext_len, text, len); |
| 2234 | start_critical_timings(); | 2396 | start_critical_timings(); |
| 2397 | |||
| 2398 | if (console_lock_spinning_disable_and_check()) { | ||
| 2399 | printk_safe_exit_irqrestore(flags); | ||
| 2400 | return; | ||
| 2401 | } | ||
| 2402 | |||
| 2235 | printk_safe_exit_irqrestore(flags); | 2403 | printk_safe_exit_irqrestore(flags); |
| 2236 | 2404 | ||
| 2237 | if (do_cond_resched) | 2405 | if (do_cond_resched) |
| 2238 | cond_resched(); | 2406 | cond_resched(); |
| 2239 | } | 2407 | } |
| 2408 | |||
| 2240 | console_locked = 0; | 2409 | console_locked = 0; |
| 2241 | 2410 | ||
| 2242 | /* Release the exclusive_console once it is used */ | 2411 | /* Release the exclusive_console once it is used */ |
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 84b1367935e4..5e1d713c8e61 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
| @@ -659,7 +659,7 @@ static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info) | |||
| 659 | if (lock_task_sighand(child, &flags)) { | 659 | if (lock_task_sighand(child, &flags)) { |
| 660 | error = -EINVAL; | 660 | error = -EINVAL; |
| 661 | if (likely(child->last_siginfo != NULL)) { | 661 | if (likely(child->last_siginfo != NULL)) { |
| 662 | *info = *child->last_siginfo; | 662 | copy_siginfo(info, child->last_siginfo); |
| 663 | error = 0; | 663 | error = 0; |
| 664 | } | 664 | } |
| 665 | unlock_task_sighand(child, &flags); | 665 | unlock_task_sighand(child, &flags); |
| @@ -675,7 +675,7 @@ static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info) | |||
| 675 | if (lock_task_sighand(child, &flags)) { | 675 | if (lock_task_sighand(child, &flags)) { |
| 676 | error = -EINVAL; | 676 | error = -EINVAL; |
| 677 | if (likely(child->last_siginfo != NULL)) { | 677 | if (likely(child->last_siginfo != NULL)) { |
| 678 | *child->last_siginfo = *info; | 678 | copy_siginfo(child->last_siginfo, info); |
| 679 | error = 0; | 679 | error = 0; |
| 680 | } | 680 | } |
| 681 | unlock_task_sighand(child, &flags); | 681 | unlock_task_sighand(child, &flags); |
| @@ -1092,6 +1092,10 @@ int ptrace_request(struct task_struct *child, long request, | |||
| 1092 | ret = seccomp_get_filter(child, addr, datavp); | 1092 | ret = seccomp_get_filter(child, addr, datavp); |
| 1093 | break; | 1093 | break; |
| 1094 | 1094 | ||
| 1095 | case PTRACE_SECCOMP_GET_METADATA: | ||
| 1096 | ret = seccomp_get_metadata(child, addr, datavp); | ||
| 1097 | break; | ||
| 1098 | |||
| 1095 | default: | 1099 | default: |
| 1096 | break; | 1100 | break; |
| 1097 | } | 1101 | } |
| @@ -1226,7 +1230,6 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, | |||
| 1226 | break; | 1230 | break; |
| 1227 | 1231 | ||
| 1228 | case PTRACE_SETSIGINFO: | 1232 | case PTRACE_SETSIGINFO: |
| 1229 | memset(&siginfo, 0, sizeof siginfo); | ||
| 1230 | if (copy_siginfo_from_user32( | 1233 | if (copy_siginfo_from_user32( |
| 1231 | &siginfo, (struct compat_siginfo __user *) datap)) | 1234 | &siginfo, (struct compat_siginfo __user *) datap)) |
| 1232 | ret = -EFAULT; | 1235 | ret = -EFAULT; |
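The new PTRACE_SECCOMP_GET_METADATA request wired up above (and implemented in kernel/seccomp.c further down) lets a tracer such as a checkpoint/restore tool ask which flags a tracee's seccomp filter was installed with. A minimal user-space sketch, assuming the tracee is already ptrace-stopped; the local structure mirrors struct seccomp_metadata for the case where the installed headers predate this change, and error handling is kept minimal.

        #include <stdio.h>
        #include <stdint.h>
        #include <sys/types.h>
        #include <sys/ptrace.h>
        #include <linux/seccomp.h>

        #ifndef PTRACE_SECCOMP_GET_METADATA
        #define PTRACE_SECCOMP_GET_METADATA 0x420d
        #endif

        /* Mirrors struct seccomp_metadata from <linux/seccomp.h>. */
        struct seccomp_md {
                uint64_t filter_off;    /* which filter in the task's stack */
                uint64_t flags;         /* e.g. SECCOMP_FILTER_FLAG_LOG */
        };

        static int dump_filter_flags(pid_t tracee, unsigned long filter_off)
        {
                struct seccomp_md md = { .filter_off = filter_off };

                /* addr carries the size, data points at the buffer, matching
                 * seccomp_get_metadata(child, addr, datavp) above. */
                if (ptrace(PTRACE_SECCOMP_GET_METADATA, tracee,
                           sizeof(md), &md) < 0) {
                        perror("PTRACE_SECCOMP_GET_METADATA");
                        return -1;
                }
                printf("filter %lu: flags=%#llx\n", filter_off,
                       (unsigned long long)md.flags);
                return 0;
        }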
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index fbd56d6e575b..68fa19a5e7bd 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c | |||
| @@ -422,11 +422,13 @@ void init_rcu_head(struct rcu_head *head) | |||
| 422 | { | 422 | { |
| 423 | debug_object_init(head, &rcuhead_debug_descr); | 423 | debug_object_init(head, &rcuhead_debug_descr); |
| 424 | } | 424 | } |
| 425 | EXPORT_SYMBOL_GPL(init_rcu_head); | ||
| 425 | 426 | ||
| 426 | void destroy_rcu_head(struct rcu_head *head) | 427 | void destroy_rcu_head(struct rcu_head *head) |
| 427 | { | 428 | { |
| 428 | debug_object_free(head, &rcuhead_debug_descr); | 429 | debug_object_free(head, &rcuhead_debug_descr); |
| 429 | } | 430 | } |
| 431 | EXPORT_SYMBOL_GPL(destroy_rcu_head); | ||
| 430 | 432 | ||
| 431 | static bool rcuhead_is_static_object(void *addr) | 433 | static bool rcuhead_is_static_object(void *addr) |
| 432 | { | 434 | { |
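Exporting init_rcu_head() and destroy_rcu_head() lets GPL modules keep CONFIG_DEBUG_OBJECTS_RCU_HEAD tracking accurate for rcu_head structures whose lifetime debugobjects cannot infer on its own. A sketch of the kind of pairing these exports enable; the structure and allocation pattern are illustrative, not taken from this patch set.

        #include <linux/kernel.h>
        #include <linux/vmalloc.h>
        #include <linux/rcupdate.h>

        struct big_item {
                struct rcu_head rcu;
                char payload[64 * 1024];
        };

        static struct big_item *big_item_alloc(void)
        {
                struct big_item *p = vmalloc(sizeof(*p));

                if (p)
                        init_rcu_head(&p->rcu);   /* now callable from modules */
                return p;
        }

        static void big_item_reclaim(struct rcu_head *rcu)
        {
                struct big_item *p = container_of(rcu, struct big_item, rcu);

                destroy_rcu_head(&p->rcu);        /* pairs with init_rcu_head() */
                vfree(p);
        }

        static void big_item_free(struct big_item *p)
        {
                call_rcu(&p->rcu, big_item_reclaim);
        }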
diff --git a/kernel/relay.c b/kernel/relay.c index 39a9dfc69486..41280033a4c5 100644 --- a/kernel/relay.c +++ b/kernel/relay.c | |||
| @@ -919,9 +919,9 @@ static int relay_file_mmap(struct file *filp, struct vm_area_struct *vma) | |||
| 919 | * | 919 | * |
| 920 | * Poll implementation. | 920 | * Poll implementation. |
| 921 | */ | 921 | */ |
| 922 | static unsigned int relay_file_poll(struct file *filp, poll_table *wait) | 922 | static __poll_t relay_file_poll(struct file *filp, poll_table *wait) |
| 923 | { | 923 | { |
| 924 | unsigned int mask = 0; | 924 | __poll_t mask = 0; |
| 925 | struct rchan_buf *buf = filp->private_data; | 925 | struct rchan_buf *buf = filp->private_data; |
| 926 | 926 | ||
| 927 | if (buf->finalized) | 927 | if (buf->finalized) |
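Several poll methods in this series (relay, posix-clock, ring_buffer, the trace pipes) switch from unsigned int to the sparse-annotated __poll_t so that mixing poll bits with ordinary integers can be flagged at build time. The converted methods all share the shape below; the device structure and readiness flag are placeholders, not code from the patch.

        #include <linux/fs.h>
        #include <linux/poll.h>
        #include <linux/wait.h>

        struct demo_dev {                       /* hypothetical driver state */
                wait_queue_head_t readq;
                bool data_ready;
        };

        static __poll_t demo_poll(struct file *filp, poll_table *wait)
        {
                struct demo_dev *dev = filp->private_data;
                __poll_t mask = 0;

                poll_wait(filp, &dev->readq, wait);
                if (dev->data_ready)
                        mask |= POLLIN | POLLRDNORM;
                return mask;
        }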
diff --git a/kernel/resource.c b/kernel/resource.c index 54ba6de3757c..8c527d83ca76 100644 --- a/kernel/resource.c +++ b/kernel/resource.c | |||
| @@ -1022,6 +1022,7 @@ static void __init __reserve_region_with_split(struct resource *root, | |||
| 1022 | struct resource *conflict; | 1022 | struct resource *conflict; |
| 1023 | struct resource *res = alloc_resource(GFP_ATOMIC); | 1023 | struct resource *res = alloc_resource(GFP_ATOMIC); |
| 1024 | struct resource *next_res = NULL; | 1024 | struct resource *next_res = NULL; |
| 1025 | int type = resource_type(root); | ||
| 1025 | 1026 | ||
| 1026 | if (!res) | 1027 | if (!res) |
| 1027 | return; | 1028 | return; |
| @@ -1029,7 +1030,7 @@ static void __init __reserve_region_with_split(struct resource *root, | |||
| 1029 | res->name = name; | 1030 | res->name = name; |
| 1030 | res->start = start; | 1031 | res->start = start; |
| 1031 | res->end = end; | 1032 | res->end = end; |
| 1032 | res->flags = IORESOURCE_BUSY; | 1033 | res->flags = type | IORESOURCE_BUSY; |
| 1033 | res->desc = IORES_DESC_NONE; | 1034 | res->desc = IORES_DESC_NONE; |
| 1034 | 1035 | ||
| 1035 | while (1) { | 1036 | while (1) { |
| @@ -1064,7 +1065,7 @@ static void __init __reserve_region_with_split(struct resource *root, | |||
| 1064 | next_res->name = name; | 1065 | next_res->name = name; |
| 1065 | next_res->start = conflict->end + 1; | 1066 | next_res->start = conflict->end + 1; |
| 1066 | next_res->end = end; | 1067 | next_res->end = end; |
| 1067 | next_res->flags = IORESOURCE_BUSY; | 1068 | next_res->flags = type | IORESOURCE_BUSY; |
| 1068 | next_res->desc = IORES_DESC_NONE; | 1069 | next_res->desc = IORES_DESC_NONE; |
| 1069 | } | 1070 | } |
| 1070 | } else { | 1071 | } else { |
| @@ -1478,7 +1479,7 @@ void __devm_release_region(struct device *dev, struct resource *parent, | |||
| 1478 | EXPORT_SYMBOL(__devm_release_region); | 1479 | EXPORT_SYMBOL(__devm_release_region); |
| 1479 | 1480 | ||
| 1480 | /* | 1481 | /* |
| 1481 | * Called from init/main.c to reserve IO ports. | 1482 | * Reserve I/O ports or memory based on "reserve=" kernel parameter. |
| 1482 | */ | 1483 | */ |
| 1483 | #define MAXRESERVE 4 | 1484 | #define MAXRESERVE 4 |
| 1484 | static int __init reserve_setup(char *str) | 1485 | static int __init reserve_setup(char *str) |
| @@ -1489,26 +1490,38 @@ static int __init reserve_setup(char *str) | |||
| 1489 | for (;;) { | 1490 | for (;;) { |
| 1490 | unsigned int io_start, io_num; | 1491 | unsigned int io_start, io_num; |
| 1491 | int x = reserved; | 1492 | int x = reserved; |
| 1493 | struct resource *parent; | ||
| 1492 | 1494 | ||
| 1493 | if (get_option (&str, &io_start) != 2) | 1495 | if (get_option(&str, &io_start) != 2) |
| 1494 | break; | 1496 | break; |
| 1495 | if (get_option (&str, &io_num) == 0) | 1497 | if (get_option(&str, &io_num) == 0) |
| 1496 | break; | 1498 | break; |
| 1497 | if (x < MAXRESERVE) { | 1499 | if (x < MAXRESERVE) { |
| 1498 | struct resource *res = reserve + x; | 1500 | struct resource *res = reserve + x; |
| 1501 | |||
| 1502 | /* | ||
| 1503 | * If the region starts below 0x10000, we assume it's | ||
| 1504 | * I/O port space; otherwise assume it's memory. | ||
| 1505 | */ | ||
| 1506 | if (io_start < 0x10000) { | ||
| 1507 | res->flags = IORESOURCE_IO; | ||
| 1508 | parent = &ioport_resource; | ||
| 1509 | } else { | ||
| 1510 | res->flags = IORESOURCE_MEM; | ||
| 1511 | parent = &iomem_resource; | ||
| 1512 | } | ||
| 1499 | res->name = "reserved"; | 1513 | res->name = "reserved"; |
| 1500 | res->start = io_start; | 1514 | res->start = io_start; |
| 1501 | res->end = io_start + io_num - 1; | 1515 | res->end = io_start + io_num - 1; |
| 1502 | res->flags = IORESOURCE_BUSY; | 1516 | res->flags |= IORESOURCE_BUSY; |
| 1503 | res->desc = IORES_DESC_NONE; | 1517 | res->desc = IORES_DESC_NONE; |
| 1504 | res->child = NULL; | 1518 | res->child = NULL; |
| 1505 | if (request_resource(res->start >= 0x10000 ? &iomem_resource : &ioport_resource, res) == 0) | 1519 | if (request_resource(parent, res) == 0) |
| 1506 | reserved = x+1; | 1520 | reserved = x+1; |
| 1507 | } | 1521 | } |
| 1508 | } | 1522 | } |
| 1509 | return 1; | 1523 | return 1; |
| 1510 | } | 1524 | } |
| 1511 | |||
| 1512 | __setup("reserve=", reserve_setup); | 1525 | __setup("reserve=", reserve_setup); |
| 1513 | 1526 | ||
| 1514 | /* | 1527 | /* |
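With the reserve_setup() change, the region type now follows the address instead of always being registered as a bare busy resource: a boot parameter such as reserve=0x340,16 still claims I/O ports 0x340-0x34f under ioport_resource, while something like reserve=0xfed40000,0x5000 is now inserted under iomem_resource with IORESOURCE_MEM (in addition to IORESOURCE_BUSY) set, so it shows up with the correct type in /proc/iomem. The 0x10000 threshold used for the guess is the one visible in the hunk above; the example addresses here are purely illustrative.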
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index a43df5193538..bb4b9fe026a1 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c | |||
| @@ -1,13 +1,12 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0 | 1 | // SPDX-License-Identifier: GPL-2.0 |
| 2 | #include "sched.h" | ||
| 3 | |||
| 4 | #include <linux/proc_fs.h> | 2 | #include <linux/proc_fs.h> |
| 5 | #include <linux/seq_file.h> | 3 | #include <linux/seq_file.h> |
| 6 | #include <linux/kallsyms.h> | ||
| 7 | #include <linux/utsname.h> | 4 | #include <linux/utsname.h> |
| 8 | #include <linux/security.h> | 5 | #include <linux/security.h> |
| 9 | #include <linux/export.h> | 6 | #include <linux/export.h> |
| 10 | 7 | ||
| 8 | #include "sched.h" | ||
| 9 | |||
| 11 | unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; | 10 | unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; |
| 12 | static struct autogroup autogroup_default; | 11 | static struct autogroup autogroup_default; |
| 13 | static atomic_t autogroup_seq_nr; | 12 | static atomic_t autogroup_seq_nr; |
diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 5f0dfb2abb8d..940fa408a288 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c | |||
| @@ -515,7 +515,7 @@ void put_seccomp_filter(struct task_struct *tsk) | |||
| 515 | 515 | ||
| 516 | static void seccomp_init_siginfo(siginfo_t *info, int syscall, int reason) | 516 | static void seccomp_init_siginfo(siginfo_t *info, int syscall, int reason) |
| 517 | { | 517 | { |
| 518 | memset(info, 0, sizeof(*info)); | 518 | clear_siginfo(info); |
| 519 | info->si_signo = SIGSYS; | 519 | info->si_signo = SIGSYS; |
| 520 | info->si_code = SYS_SECCOMP; | 520 | info->si_code = SYS_SECCOMP; |
| 521 | info->si_call_addr = (void __user *)KSTK_EIP(current); | 521 | info->si_call_addr = (void __user *)KSTK_EIP(current); |
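clear_siginfo(), used here and throughout the signal changes below in place of open-coded memset() calls, is at this point in the series a thin wrapper whose value is uniformity: every siginfo is zeroed the same way before individual fields are filled in. For reference, its shape in <linux/signal.h> is essentially:

        static inline void clear_siginfo(struct siginfo *info)
        {
                memset(info, 0, sizeof(*info));
        }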
| @@ -978,49 +978,68 @@ long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter) | |||
| 978 | } | 978 | } |
| 979 | 979 | ||
| 980 | #if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE) | 980 | #if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE) |
| 981 | long seccomp_get_filter(struct task_struct *task, unsigned long filter_off, | 981 | static struct seccomp_filter *get_nth_filter(struct task_struct *task, |
| 982 | void __user *data) | 982 | unsigned long filter_off) |
| 983 | { | 983 | { |
| 984 | struct seccomp_filter *filter; | 984 | struct seccomp_filter *orig, *filter; |
| 985 | struct sock_fprog_kern *fprog; | 985 | unsigned long count; |
| 986 | long ret; | ||
| 987 | unsigned long count = 0; | ||
| 988 | |||
| 989 | if (!capable(CAP_SYS_ADMIN) || | ||
| 990 | current->seccomp.mode != SECCOMP_MODE_DISABLED) { | ||
| 991 | return -EACCES; | ||
| 992 | } | ||
| 993 | 986 | ||
| 987 | /* | ||
| 988 | * Note: this is only correct because the caller should be the (ptrace) | ||
| 989 | * tracer of the task, otherwise lock_task_sighand is needed. | ||
| 990 | */ | ||
| 994 | spin_lock_irq(&task->sighand->siglock); | 991 | spin_lock_irq(&task->sighand->siglock); |
| 992 | |||
| 995 | if (task->seccomp.mode != SECCOMP_MODE_FILTER) { | 993 | if (task->seccomp.mode != SECCOMP_MODE_FILTER) { |
| 996 | ret = -EINVAL; | 994 | spin_unlock_irq(&task->sighand->siglock); |
| 997 | goto out; | 995 | return ERR_PTR(-EINVAL); |
| 998 | } | 996 | } |
| 999 | 997 | ||
| 1000 | filter = task->seccomp.filter; | 998 | orig = task->seccomp.filter; |
| 1001 | while (filter) { | 999 | __get_seccomp_filter(orig); |
| 1002 | filter = filter->prev; | 1000 | spin_unlock_irq(&task->sighand->siglock); |
| 1001 | |||
| 1002 | count = 0; | ||
| 1003 | for (filter = orig; filter; filter = filter->prev) | ||
| 1003 | count++; | 1004 | count++; |
| 1004 | } | ||
| 1005 | 1005 | ||
| 1006 | if (filter_off >= count) { | 1006 | if (filter_off >= count) { |
| 1007 | ret = -ENOENT; | 1007 | filter = ERR_PTR(-ENOENT); |
| 1008 | goto out; | 1008 | goto out; |
| 1009 | } | 1009 | } |
| 1010 | count -= filter_off; | ||
| 1011 | 1010 | ||
| 1012 | filter = task->seccomp.filter; | 1011 | count -= filter_off; |
| 1013 | while (filter && count > 1) { | 1012 | for (filter = orig; filter && count > 1; filter = filter->prev) |
| 1014 | filter = filter->prev; | ||
| 1015 | count--; | 1013 | count--; |
| 1016 | } | ||
| 1017 | 1014 | ||
| 1018 | if (WARN_ON(count != 1 || !filter)) { | 1015 | if (WARN_ON(count != 1 || !filter)) { |
| 1019 | /* The filter tree shouldn't shrink while we're using it. */ | 1016 | filter = ERR_PTR(-ENOENT); |
| 1020 | ret = -ENOENT; | ||
| 1021 | goto out; | 1017 | goto out; |
| 1022 | } | 1018 | } |
| 1023 | 1019 | ||
| 1020 | __get_seccomp_filter(filter); | ||
| 1021 | |||
| 1022 | out: | ||
| 1023 | __put_seccomp_filter(orig); | ||
| 1024 | return filter; | ||
| 1025 | } | ||
| 1026 | |||
| 1027 | long seccomp_get_filter(struct task_struct *task, unsigned long filter_off, | ||
| 1028 | void __user *data) | ||
| 1029 | { | ||
| 1030 | struct seccomp_filter *filter; | ||
| 1031 | struct sock_fprog_kern *fprog; | ||
| 1032 | long ret; | ||
| 1033 | |||
| 1034 | if (!capable(CAP_SYS_ADMIN) || | ||
| 1035 | current->seccomp.mode != SECCOMP_MODE_DISABLED) { | ||
| 1036 | return -EACCES; | ||
| 1037 | } | ||
| 1038 | |||
| 1039 | filter = get_nth_filter(task, filter_off); | ||
| 1040 | if (IS_ERR(filter)) | ||
| 1041 | return PTR_ERR(filter); | ||
| 1042 | |||
| 1024 | fprog = filter->prog->orig_prog; | 1043 | fprog = filter->prog->orig_prog; |
| 1025 | if (!fprog) { | 1044 | if (!fprog) { |
| 1026 | /* This must be a new non-cBPF filter, since we save | 1045 | /* This must be a new non-cBPF filter, since we save |
| @@ -1035,17 +1054,44 @@ long seccomp_get_filter(struct task_struct *task, unsigned long filter_off, | |||
| 1035 | if (!data) | 1054 | if (!data) |
| 1036 | goto out; | 1055 | goto out; |
| 1037 | 1056 | ||
| 1038 | __get_seccomp_filter(filter); | ||
| 1039 | spin_unlock_irq(&task->sighand->siglock); | ||
| 1040 | |||
| 1041 | if (copy_to_user(data, fprog->filter, bpf_classic_proglen(fprog))) | 1057 | if (copy_to_user(data, fprog->filter, bpf_classic_proglen(fprog))) |
| 1042 | ret = -EFAULT; | 1058 | ret = -EFAULT; |
| 1043 | 1059 | ||
| 1060 | out: | ||
| 1044 | __put_seccomp_filter(filter); | 1061 | __put_seccomp_filter(filter); |
| 1045 | return ret; | 1062 | return ret; |
| 1063 | } | ||
| 1046 | 1064 | ||
| 1047 | out: | 1065 | long seccomp_get_metadata(struct task_struct *task, |
| 1048 | spin_unlock_irq(&task->sighand->siglock); | 1066 | unsigned long size, void __user *data) |
| 1067 | { | ||
| 1068 | long ret; | ||
| 1069 | struct seccomp_filter *filter; | ||
| 1070 | struct seccomp_metadata kmd = {}; | ||
| 1071 | |||
| 1072 | if (!capable(CAP_SYS_ADMIN) || | ||
| 1073 | current->seccomp.mode != SECCOMP_MODE_DISABLED) { | ||
| 1074 | return -EACCES; | ||
| 1075 | } | ||
| 1076 | |||
| 1077 | size = min_t(unsigned long, size, sizeof(kmd)); | ||
| 1078 | |||
| 1079 | if (copy_from_user(&kmd, data, size)) | ||
| 1080 | return -EFAULT; | ||
| 1081 | |||
| 1082 | filter = get_nth_filter(task, kmd.filter_off); | ||
| 1083 | if (IS_ERR(filter)) | ||
| 1084 | return PTR_ERR(filter); | ||
| 1085 | |||
| 1086 | memset(&kmd, 0, sizeof(kmd)); | ||
| 1087 | if (filter->log) | ||
| 1088 | kmd.flags |= SECCOMP_FILTER_FLAG_LOG; | ||
| 1089 | |||
| 1090 | ret = size; | ||
| 1091 | if (copy_to_user(data, &kmd, size)) | ||
| 1092 | ret = -EFAULT; | ||
| 1093 | |||
| 1094 | __put_seccomp_filter(filter); | ||
| 1049 | return ret; | 1095 | return ret; |
| 1050 | } | 1096 | } |
| 1051 | #endif | 1097 | #endif |
diff --git a/kernel/signal.c b/kernel/signal.c index 9558664bd9ec..c6e4c83dc090 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
| @@ -40,6 +40,7 @@ | |||
| 40 | #include <linux/cn_proc.h> | 40 | #include <linux/cn_proc.h> |
| 41 | #include <linux/compiler.h> | 41 | #include <linux/compiler.h> |
| 42 | #include <linux/posix-timers.h> | 42 | #include <linux/posix-timers.h> |
| 43 | #include <linux/livepatch.h> | ||
| 43 | 44 | ||
| 44 | #define CREATE_TRACE_POINTS | 45 | #define CREATE_TRACE_POINTS |
| 45 | #include <trace/events/signal.h> | 46 | #include <trace/events/signal.h> |
| @@ -165,7 +166,8 @@ void recalc_sigpending_and_wake(struct task_struct *t) | |||
| 165 | 166 | ||
| 166 | void recalc_sigpending(void) | 167 | void recalc_sigpending(void) |
| 167 | { | 168 | { |
| 168 | if (!recalc_sigpending_tsk(current) && !freezing(current)) | 169 | if (!recalc_sigpending_tsk(current) && !freezing(current) && |
| 170 | !klp_patch_pending(current)) | ||
| 169 | clear_thread_flag(TIF_SIGPENDING); | 171 | clear_thread_flag(TIF_SIGPENDING); |
| 170 | 172 | ||
| 171 | } | 173 | } |
| @@ -549,6 +551,7 @@ still_pending: | |||
| 549 | * a fast-pathed signal or we must have been | 551 | * a fast-pathed signal or we must have been |
| 550 | * out of queue space. So zero out the info. | 552 | * out of queue space. So zero out the info. |
| 551 | */ | 553 | */ |
| 554 | clear_siginfo(info); | ||
| 552 | info->si_signo = sig; | 555 | info->si_signo = sig; |
| 553 | info->si_errno = 0; | 556 | info->si_errno = 0; |
| 554 | info->si_code = SI_USER; | 557 | info->si_code = SI_USER; |
| @@ -642,6 +645,9 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) | |||
| 642 | spin_unlock(&tsk->sighand->siglock); | 645 | spin_unlock(&tsk->sighand->siglock); |
| 643 | posixtimer_rearm(info); | 646 | posixtimer_rearm(info); |
| 644 | spin_lock(&tsk->sighand->siglock); | 647 | spin_lock(&tsk->sighand->siglock); |
| 648 | |||
| 649 | /* Don't expose the si_sys_private value to userspace */ | ||
| 650 | info->si_sys_private = 0; | ||
| 645 | } | 651 | } |
| 646 | #endif | 652 | #endif |
| 647 | return signr; | 653 | return signr; |
| @@ -1043,6 +1049,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, | |||
| 1043 | list_add_tail(&q->list, &pending->list); | 1049 | list_add_tail(&q->list, &pending->list); |
| 1044 | switch ((unsigned long) info) { | 1050 | switch ((unsigned long) info) { |
| 1045 | case (unsigned long) SEND_SIG_NOINFO: | 1051 | case (unsigned long) SEND_SIG_NOINFO: |
| 1052 | clear_siginfo(&q->info); | ||
| 1046 | q->info.si_signo = sig; | 1053 | q->info.si_signo = sig; |
| 1047 | q->info.si_errno = 0; | 1054 | q->info.si_errno = 0; |
| 1048 | q->info.si_code = SI_USER; | 1055 | q->info.si_code = SI_USER; |
| @@ -1051,6 +1058,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, | |||
| 1051 | q->info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); | 1058 | q->info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); |
| 1052 | break; | 1059 | break; |
| 1053 | case (unsigned long) SEND_SIG_PRIV: | 1060 | case (unsigned long) SEND_SIG_PRIV: |
| 1061 | clear_siginfo(&q->info); | ||
| 1054 | q->info.si_signo = sig; | 1062 | q->info.si_signo = sig; |
| 1055 | q->info.si_errno = 0; | 1063 | q->info.si_errno = 0; |
| 1056 | q->info.si_code = SI_KERNEL; | 1064 | q->info.si_code = SI_KERNEL; |
| @@ -1485,6 +1493,129 @@ force_sigsegv(int sig, struct task_struct *p) | |||
| 1485 | return 0; | 1493 | return 0; |
| 1486 | } | 1494 | } |
| 1487 | 1495 | ||
| 1496 | int force_sig_fault(int sig, int code, void __user *addr | ||
| 1497 | ___ARCH_SI_TRAPNO(int trapno) | ||
| 1498 | ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr) | ||
| 1499 | , struct task_struct *t) | ||
| 1500 | { | ||
| 1501 | struct siginfo info; | ||
| 1502 | |||
| 1503 | clear_siginfo(&info); | ||
| 1504 | info.si_signo = sig; | ||
| 1505 | info.si_errno = 0; | ||
| 1506 | info.si_code = code; | ||
| 1507 | info.si_addr = addr; | ||
| 1508 | #ifdef __ARCH_SI_TRAPNO | ||
| 1509 | info.si_trapno = trapno; | ||
| 1510 | #endif | ||
| 1511 | #ifdef __ia64__ | ||
| 1512 | info.si_imm = imm; | ||
| 1513 | info.si_flags = flags; | ||
| 1514 | info.si_isr = isr; | ||
| 1515 | #endif | ||
| 1516 | return force_sig_info(info.si_signo, &info, t); | ||
| 1517 | } | ||
| 1518 | |||
| 1519 | int send_sig_fault(int sig, int code, void __user *addr | ||
| 1520 | ___ARCH_SI_TRAPNO(int trapno) | ||
| 1521 | ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr) | ||
| 1522 | , struct task_struct *t) | ||
| 1523 | { | ||
| 1524 | struct siginfo info; | ||
| 1525 | |||
| 1526 | clear_siginfo(&info); | ||
| 1527 | info.si_signo = sig; | ||
| 1528 | info.si_errno = 0; | ||
| 1529 | info.si_code = code; | ||
| 1530 | info.si_addr = addr; | ||
| 1531 | #ifdef __ARCH_SI_TRAPNO | ||
| 1532 | info.si_trapno = trapno; | ||
| 1533 | #endif | ||
| 1534 | #ifdef __ia64__ | ||
| 1535 | info.si_imm = imm; | ||
| 1536 | info.si_flags = flags; | ||
| 1537 | info.si_isr = isr; | ||
| 1538 | #endif | ||
| 1539 | return send_sig_info(info.si_signo, &info, t); | ||
| 1540 | } | ||
| 1541 | |||
| 1542 | #if defined(BUS_MCEERR_AO) && defined(BUS_MCEERR_AR) | ||
| 1543 | int force_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *t) | ||
| 1544 | { | ||
| 1545 | struct siginfo info; | ||
| 1546 | |||
| 1547 | WARN_ON((code != BUS_MCEERR_AO) && (code != BUS_MCEERR_AR)); | ||
| 1548 | clear_siginfo(&info); | ||
| 1549 | info.si_signo = SIGBUS; | ||
| 1550 | info.si_errno = 0; | ||
| 1551 | info.si_code = code; | ||
| 1552 | info.si_addr = addr; | ||
| 1553 | info.si_addr_lsb = lsb; | ||
| 1554 | return force_sig_info(info.si_signo, &info, t); | ||
| 1555 | } | ||
| 1556 | |||
| 1557 | int send_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *t) | ||
| 1558 | { | ||
| 1559 | struct siginfo info; | ||
| 1560 | |||
| 1561 | WARN_ON((code != BUS_MCEERR_AO) && (code != BUS_MCEERR_AR)); | ||
| 1562 | clear_siginfo(&info); | ||
| 1563 | info.si_signo = SIGBUS; | ||
| 1564 | info.si_errno = 0; | ||
| 1565 | info.si_code = code; | ||
| 1566 | info.si_addr = addr; | ||
| 1567 | info.si_addr_lsb = lsb; | ||
| 1568 | return send_sig_info(info.si_signo, &info, t); | ||
| 1569 | } | ||
| 1570 | EXPORT_SYMBOL(send_sig_mceerr); | ||
| 1571 | #endif | ||
| 1572 | |||
| 1573 | #ifdef SEGV_BNDERR | ||
| 1574 | int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper) | ||
| 1575 | { | ||
| 1576 | struct siginfo info; | ||
| 1577 | |||
| 1578 | clear_siginfo(&info); | ||
| 1579 | info.si_signo = SIGSEGV; | ||
| 1580 | info.si_errno = 0; | ||
| 1581 | info.si_code = SEGV_BNDERR; | ||
| 1582 | info.si_addr = addr; | ||
| 1583 | info.si_lower = lower; | ||
| 1584 | info.si_upper = upper; | ||
| 1585 | return force_sig_info(info.si_signo, &info, current); | ||
| 1586 | } | ||
| 1587 | #endif | ||
| 1588 | |||
| 1589 | #ifdef SEGV_PKUERR | ||
| 1590 | int force_sig_pkuerr(void __user *addr, u32 pkey) | ||
| 1591 | { | ||
| 1592 | struct siginfo info; | ||
| 1593 | |||
| 1594 | clear_siginfo(&info); | ||
| 1595 | info.si_signo = SIGSEGV; | ||
| 1596 | info.si_errno = 0; | ||
| 1597 | info.si_code = SEGV_PKUERR; | ||
| 1598 | info.si_addr = addr; | ||
| 1599 | info.si_pkey = pkey; | ||
| 1600 | return force_sig_info(info.si_signo, &info, current); | ||
| 1601 | } | ||
| 1602 | #endif | ||
| 1603 | |||
| 1604 | /* For the crazy architectures that include trap information in | ||
| 1605 | * the errno field, instead of an actual errno value. | ||
| 1606 | */ | ||
| 1607 | int force_sig_ptrace_errno_trap(int errno, void __user *addr) | ||
| 1608 | { | ||
| 1609 | struct siginfo info; | ||
| 1610 | |||
| 1611 | clear_siginfo(&info); | ||
| 1612 | info.si_signo = SIGTRAP; | ||
| 1613 | info.si_errno = errno; | ||
| 1614 | info.si_code = TRAP_HWBKPT; | ||
| 1615 | info.si_addr = addr; | ||
| 1616 | return force_sig_info(info.si_signo, &info, current); | ||
| 1617 | } | ||
| 1618 | |||
| 1488 | int kill_pgrp(struct pid *pid, int sig, int priv) | 1619 | int kill_pgrp(struct pid *pid, int sig, int priv) |
| 1489 | { | 1620 | { |
| 1490 | int ret; | 1621 | int ret; |
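force_sig_fault() and send_sig_fault() gather up the clear_siginfo() plus field-by-field setup that architecture fault handlers previously open-coded. A sketch of the kind of caller they are meant for, illustrative rather than taken from this patch; on architectures without __ARCH_SI_TRAPNO or the ia64 extras, the extra macro arguments in the prototypes above expand to nothing.

        /* e.g. somewhere in an architecture's do_page_fault() path */
        static void report_bad_access(unsigned long address)
        {
                /* one call instead of declaring, clearing and filling a siginfo */
                force_sig_fault(SIGSEGV, SEGV_MAPERR,
                                (void __user *)address, current);
        }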
| @@ -1623,6 +1754,7 @@ bool do_notify_parent(struct task_struct *tsk, int sig) | |||
| 1623 | sig = SIGCHLD; | 1754 | sig = SIGCHLD; |
| 1624 | } | 1755 | } |
| 1625 | 1756 | ||
| 1757 | clear_siginfo(&info); | ||
| 1626 | info.si_signo = sig; | 1758 | info.si_signo = sig; |
| 1627 | info.si_errno = 0; | 1759 | info.si_errno = 0; |
| 1628 | /* | 1760 | /* |
| @@ -1717,6 +1849,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, | |||
| 1717 | parent = tsk->real_parent; | 1849 | parent = tsk->real_parent; |
| 1718 | } | 1850 | } |
| 1719 | 1851 | ||
| 1852 | clear_siginfo(&info); | ||
| 1720 | info.si_signo = SIGCHLD; | 1853 | info.si_signo = SIGCHLD; |
| 1721 | info.si_errno = 0; | 1854 | info.si_errno = 0; |
| 1722 | /* | 1855 | /* |
| @@ -1929,7 +2062,7 @@ static void ptrace_do_notify(int signr, int exit_code, int why) | |||
| 1929 | { | 2062 | { |
| 1930 | siginfo_t info; | 2063 | siginfo_t info; |
| 1931 | 2064 | ||
| 1932 | memset(&info, 0, sizeof info); | 2065 | clear_siginfo(&info); |
| 1933 | info.si_signo = signr; | 2066 | info.si_signo = signr; |
| 1934 | info.si_code = exit_code; | 2067 | info.si_code = exit_code; |
| 1935 | info.si_pid = task_pid_vnr(current); | 2068 | info.si_pid = task_pid_vnr(current); |
| @@ -2136,6 +2269,7 @@ static int ptrace_signal(int signr, siginfo_t *info) | |||
| 2136 | * have updated *info via PTRACE_SETSIGINFO. | 2269 | * have updated *info via PTRACE_SETSIGINFO. |
| 2137 | */ | 2270 | */ |
| 2138 | if (signr != info->si_signo) { | 2271 | if (signr != info->si_signo) { |
| 2272 | clear_siginfo(info); | ||
| 2139 | info->si_signo = signr; | 2273 | info->si_signo = signr; |
| 2140 | info->si_errno = 0; | 2274 | info->si_errno = 0; |
| 2141 | info->si_code = SI_USER; | 2275 | info->si_code = SI_USER; |
| @@ -2688,9 +2822,7 @@ enum siginfo_layout siginfo_layout(int sig, int si_code) | |||
| 2688 | #endif | 2822 | #endif |
| 2689 | [SIGCHLD] = { NSIGCHLD, SIL_CHLD }, | 2823 | [SIGCHLD] = { NSIGCHLD, SIL_CHLD }, |
| 2690 | [SIGPOLL] = { NSIGPOLL, SIL_POLL }, | 2824 | [SIGPOLL] = { NSIGPOLL, SIL_POLL }, |
| 2691 | #ifdef __ARCH_SIGSYS | ||
| 2692 | [SIGSYS] = { NSIGSYS, SIL_SYS }, | 2825 | [SIGSYS] = { NSIGSYS, SIL_SYS }, |
| 2693 | #endif | ||
| 2694 | }; | 2826 | }; |
| 2695 | if ((sig < ARRAY_SIZE(filter)) && (si_code <= filter[sig].limit)) | 2827 | if ((sig < ARRAY_SIZE(filter)) && (si_code <= filter[sig].limit)) |
| 2696 | layout = filter[sig].layout; | 2828 | layout = filter[sig].layout; |
| @@ -2712,12 +2844,14 @@ enum siginfo_layout siginfo_layout(int sig, int si_code) | |||
| 2712 | if ((sig == SIGFPE) && (si_code == FPE_FIXME)) | 2844 | if ((sig == SIGFPE) && (si_code == FPE_FIXME)) |
| 2713 | layout = SIL_FAULT; | 2845 | layout = SIL_FAULT; |
| 2714 | #endif | 2846 | #endif |
| 2847 | #ifdef BUS_FIXME | ||
| 2848 | if ((sig == SIGBUS) && (si_code == BUS_FIXME)) | ||
| 2849 | layout = SIL_FAULT; | ||
| 2850 | #endif | ||
| 2715 | } | 2851 | } |
| 2716 | return layout; | 2852 | return layout; |
| 2717 | } | 2853 | } |
| 2718 | 2854 | ||
| 2719 | #ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER | ||
| 2720 | |||
| 2721 | int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) | 2855 | int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) |
| 2722 | { | 2856 | { |
| 2723 | int err; | 2857 | int err; |
| @@ -2756,13 +2890,21 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) | |||
| 2756 | #ifdef __ARCH_SI_TRAPNO | 2890 | #ifdef __ARCH_SI_TRAPNO |
| 2757 | err |= __put_user(from->si_trapno, &to->si_trapno); | 2891 | err |= __put_user(from->si_trapno, &to->si_trapno); |
| 2758 | #endif | 2892 | #endif |
| 2759 | #ifdef BUS_MCEERR_AO | 2893 | #ifdef __ia64__ |
| 2894 | err |= __put_user(from->si_imm, &to->si_imm); | ||
| 2895 | err |= __put_user(from->si_flags, &to->si_flags); | ||
| 2896 | err |= __put_user(from->si_isr, &to->si_isr); | ||
| 2897 | #endif | ||
| 2760 | /* | 2898 | /* |
| 2761 | * Other callers might not initialize the si_lsb field, | 2899 | * Other callers might not initialize the si_lsb field, |
| 2762 | * so check explicitly for the right codes here. | 2900 | * so check explicitly for the right codes here. |
| 2763 | */ | 2901 | */ |
| 2764 | if (from->si_signo == SIGBUS && | 2902 | #ifdef BUS_MCEERR_AR |
| 2765 | (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO)) | 2903 | if (from->si_signo == SIGBUS && from->si_code == BUS_MCEERR_AR) |
| 2904 | err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); | ||
| 2905 | #endif | ||
| 2906 | #ifdef BUS_MCEERR_AO | ||
| 2907 | if (from->si_signo == SIGBUS && from->si_code == BUS_MCEERR_AO) | ||
| 2766 | err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); | 2908 | err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); |
| 2767 | #endif | 2909 | #endif |
| 2768 | #ifdef SEGV_BNDERR | 2910 | #ifdef SEGV_BNDERR |
| @@ -2788,18 +2930,185 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) | |||
| 2788 | err |= __put_user(from->si_uid, &to->si_uid); | 2930 | err |= __put_user(from->si_uid, &to->si_uid); |
| 2789 | err |= __put_user(from->si_ptr, &to->si_ptr); | 2931 | err |= __put_user(from->si_ptr, &to->si_ptr); |
| 2790 | break; | 2932 | break; |
| 2791 | #ifdef __ARCH_SIGSYS | ||
| 2792 | case SIL_SYS: | 2933 | case SIL_SYS: |
| 2793 | err |= __put_user(from->si_call_addr, &to->si_call_addr); | 2934 | err |= __put_user(from->si_call_addr, &to->si_call_addr); |
| 2794 | err |= __put_user(from->si_syscall, &to->si_syscall); | 2935 | err |= __put_user(from->si_syscall, &to->si_syscall); |
| 2795 | err |= __put_user(from->si_arch, &to->si_arch); | 2936 | err |= __put_user(from->si_arch, &to->si_arch); |
| 2796 | break; | 2937 | break; |
| 2797 | #endif | ||
| 2798 | } | 2938 | } |
| 2799 | return err; | 2939 | return err; |
| 2800 | } | 2940 | } |
| 2801 | 2941 | ||
| 2942 | #ifdef CONFIG_COMPAT | ||
| 2943 | int copy_siginfo_to_user32(struct compat_siginfo __user *to, | ||
| 2944 | const struct siginfo *from) | ||
| 2945 | #if defined(CONFIG_X86_X32_ABI) || defined(CONFIG_IA32_EMULATION) | ||
| 2946 | { | ||
| 2947 | return __copy_siginfo_to_user32(to, from, in_x32_syscall()); | ||
| 2948 | } | ||
| 2949 | int __copy_siginfo_to_user32(struct compat_siginfo __user *to, | ||
| 2950 | const struct siginfo *from, bool x32_ABI) | ||
| 2951 | #endif | ||
| 2952 | { | ||
| 2953 | struct compat_siginfo new; | ||
| 2954 | memset(&new, 0, sizeof(new)); | ||
| 2955 | |||
| 2956 | new.si_signo = from->si_signo; | ||
| 2957 | new.si_errno = from->si_errno; | ||
| 2958 | new.si_code = from->si_code; | ||
| 2959 | switch(siginfo_layout(from->si_signo, from->si_code)) { | ||
| 2960 | case SIL_KILL: | ||
| 2961 | new.si_pid = from->si_pid; | ||
| 2962 | new.si_uid = from->si_uid; | ||
| 2963 | break; | ||
| 2964 | case SIL_TIMER: | ||
| 2965 | new.si_tid = from->si_tid; | ||
| 2966 | new.si_overrun = from->si_overrun; | ||
| 2967 | new.si_int = from->si_int; | ||
| 2968 | break; | ||
| 2969 | case SIL_POLL: | ||
| 2970 | new.si_band = from->si_band; | ||
| 2971 | new.si_fd = from->si_fd; | ||
| 2972 | break; | ||
| 2973 | case SIL_FAULT: | ||
| 2974 | new.si_addr = ptr_to_compat(from->si_addr); | ||
| 2975 | #ifdef __ARCH_SI_TRAPNO | ||
| 2976 | new.si_trapno = from->si_trapno; | ||
| 2977 | #endif | ||
| 2978 | #ifdef BUS_MCEERR_AR | ||
| 2979 | if ((from->si_signo == SIGBUS) && (from->si_code == BUS_MCEERR_AR)) | ||
| 2980 | new.si_addr_lsb = from->si_addr_lsb; | ||
| 2981 | #endif | ||
| 2982 | #ifdef BUS_MCEERR_AO | ||
| 2983 | if ((from->si_signo == SIGBUS) && (from->si_code == BUS_MCEERR_AO)) | ||
| 2984 | new.si_addr_lsb = from->si_addr_lsb; | ||
| 2985 | #endif | ||
| 2986 | #ifdef SEGV_BNDERR | ||
| 2987 | if ((from->si_signo == SIGSEGV) && | ||
| 2988 | (from->si_code == SEGV_BNDERR)) { | ||
| 2989 | new.si_lower = ptr_to_compat(from->si_lower); | ||
| 2990 | new.si_upper = ptr_to_compat(from->si_upper); | ||
| 2991 | } | ||
| 2992 | #endif | ||
| 2993 | #ifdef SEGV_PKUERR | ||
| 2994 | if ((from->si_signo == SIGSEGV) && | ||
| 2995 | (from->si_code == SEGV_PKUERR)) | ||
| 2996 | new.si_pkey = from->si_pkey; | ||
| 2997 | #endif | ||
| 2998 | |||
| 2999 | break; | ||
| 3000 | case SIL_CHLD: | ||
| 3001 | new.si_pid = from->si_pid; | ||
| 3002 | new.si_uid = from->si_uid; | ||
| 3003 | new.si_status = from->si_status; | ||
| 3004 | #ifdef CONFIG_X86_X32_ABI | ||
| 3005 | if (x32_ABI) { | ||
| 3006 | new._sifields._sigchld_x32._utime = from->si_utime; | ||
| 3007 | new._sifields._sigchld_x32._stime = from->si_stime; | ||
| 3008 | } else | ||
| 3009 | #endif | ||
| 3010 | { | ||
| 3011 | new.si_utime = from->si_utime; | ||
| 3012 | new.si_stime = from->si_stime; | ||
| 3013 | } | ||
| 3014 | break; | ||
| 3015 | case SIL_RT: | ||
| 3016 | new.si_pid = from->si_pid; | ||
| 3017 | new.si_uid = from->si_uid; | ||
| 3018 | new.si_int = from->si_int; | ||
| 3019 | break; | ||
| 3020 | case SIL_SYS: | ||
| 3021 | new.si_call_addr = ptr_to_compat(from->si_call_addr); | ||
| 3022 | new.si_syscall = from->si_syscall; | ||
| 3023 | new.si_arch = from->si_arch; | ||
| 3024 | break; | ||
| 3025 | } | ||
| 3026 | |||
| 3027 | if (copy_to_user(to, &new, sizeof(struct compat_siginfo))) | ||
| 3028 | return -EFAULT; | ||
| 3029 | |||
| 3030 | return 0; | ||
| 3031 | } | ||
| 3032 | |||
| 3033 | int copy_siginfo_from_user32(struct siginfo *to, | ||
| 3034 | const struct compat_siginfo __user *ufrom) | ||
| 3035 | { | ||
| 3036 | struct compat_siginfo from; | ||
| 3037 | |||
| 3038 | if (copy_from_user(&from, ufrom, sizeof(struct compat_siginfo))) | ||
| 3039 | return -EFAULT; | ||
| 3040 | |||
| 3041 | clear_siginfo(to); | ||
| 3042 | to->si_signo = from.si_signo; | ||
| 3043 | to->si_errno = from.si_errno; | ||
| 3044 | to->si_code = from.si_code; | ||
| 3045 | switch(siginfo_layout(from.si_signo, from.si_code)) { | ||
| 3046 | case SIL_KILL: | ||
| 3047 | to->si_pid = from.si_pid; | ||
| 3048 | to->si_uid = from.si_uid; | ||
| 3049 | break; | ||
| 3050 | case SIL_TIMER: | ||
| 3051 | to->si_tid = from.si_tid; | ||
| 3052 | to->si_overrun = from.si_overrun; | ||
| 3053 | to->si_int = from.si_int; | ||
| 3054 | break; | ||
| 3055 | case SIL_POLL: | ||
| 3056 | to->si_band = from.si_band; | ||
| 3057 | to->si_fd = from.si_fd; | ||
| 3058 | break; | ||
| 3059 | case SIL_FAULT: | ||
| 3060 | to->si_addr = compat_ptr(from.si_addr); | ||
| 3061 | #ifdef __ARCH_SI_TRAPNO | ||
| 3062 | to->si_trapno = from.si_trapno; | ||
| 3063 | #endif | ||
| 3064 | #ifdef BUS_MCEERR_AR | ||
| 3065 | if ((from.si_signo == SIGBUS) && (from.si_code == BUS_MCEERR_AR)) | ||
| 3066 | to->si_addr_lsb = from.si_addr_lsb; | ||
| 3067 | #endif | ||
| 3068 | #ifdef BUS_MCEERR_AO | ||
| 3069 | if ((from.si_signo == SIGBUS) && (from.si_code == BUS_MCEERR_AO)) | ||
| 3070 | to->si_addr_lsb = from.si_addr_lsb; | ||
| 3071 | #endif | ||
| 3072 | #ifdef SEGV_BNDERR | ||
| 3073 | if ((from.si_signo == SIGSEGV) && (from.si_code == SEGV_BNDERR)) { | ||
| 3074 | to->si_lower = compat_ptr(from.si_lower); | ||
| 3075 | to->si_upper = compat_ptr(from.si_upper); | ||
| 3076 | } | ||
| 3077 | #endif | ||
| 3078 | #ifdef SEGV_PKUERR | ||
| 3079 | if ((from.si_signo == SIGSEGV) && (from.si_code == SEGV_PKUERR)) | ||
| 3080 | to->si_pkey = from.si_pkey; | ||
| 3081 | #endif | ||
| 3082 | break; | ||
| 3083 | case SIL_CHLD: | ||
| 3084 | to->si_pid = from.si_pid; | ||
| 3085 | to->si_uid = from.si_uid; | ||
| 3086 | to->si_status = from.si_status; | ||
| 3087 | #ifdef CONFIG_X86_X32_ABI | ||
| 3088 | if (in_x32_syscall()) { | ||
| 3089 | to->si_utime = from._sifields._sigchld_x32._utime; | ||
| 3090 | to->si_stime = from._sifields._sigchld_x32._stime; | ||
| 3091 | } else | ||
| 2802 | #endif | 3092 | #endif |
| 3093 | { | ||
| 3094 | to->si_utime = from.si_utime; | ||
| 3095 | to->si_stime = from.si_stime; | ||
| 3096 | } | ||
| 3097 | break; | ||
| 3098 | case SIL_RT: | ||
| 3099 | to->si_pid = from.si_pid; | ||
| 3100 | to->si_uid = from.si_uid; | ||
| 3101 | to->si_int = from.si_int; | ||
| 3102 | break; | ||
| 3103 | case SIL_SYS: | ||
| 3104 | to->si_call_addr = compat_ptr(from.si_call_addr); | ||
| 3105 | to->si_syscall = from.si_syscall; | ||
| 3106 | to->si_arch = from.si_arch; | ||
| 3107 | break; | ||
| 3108 | } | ||
| 3109 | return 0; | ||
| 3110 | } | ||
| 3111 | #endif /* CONFIG_COMPAT */ | ||
| 2803 | 3112 | ||
| 2804 | /** | 3113 | /** |
| 2805 | * do_sigtimedwait - wait for queued signals specified in @which | 3114 | * do_sigtimedwait - wait for queued signals specified in @which |
| @@ -2937,6 +3246,7 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) | |||
| 2937 | { | 3246 | { |
| 2938 | struct siginfo info; | 3247 | struct siginfo info; |
| 2939 | 3248 | ||
| 3249 | clear_siginfo(&info); | ||
| 2940 | info.si_signo = sig; | 3250 | info.si_signo = sig; |
| 2941 | info.si_errno = 0; | 3251 | info.si_errno = 0; |
| 2942 | info.si_code = SI_USER; | 3252 | info.si_code = SI_USER; |
| @@ -2978,8 +3288,9 @@ do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info) | |||
| 2978 | 3288 | ||
| 2979 | static int do_tkill(pid_t tgid, pid_t pid, int sig) | 3289 | static int do_tkill(pid_t tgid, pid_t pid, int sig) |
| 2980 | { | 3290 | { |
| 2981 | struct siginfo info = {}; | 3291 | struct siginfo info; |
| 2982 | 3292 | ||
| 3293 | clear_siginfo(&info); | ||
| 2983 | info.si_signo = sig; | 3294 | info.si_signo = sig; |
| 2984 | info.si_errno = 0; | 3295 | info.si_errno = 0; |
| 2985 | info.si_code = SI_TKILL; | 3296 | info.si_code = SI_TKILL; |
| @@ -3060,7 +3371,7 @@ COMPAT_SYSCALL_DEFINE3(rt_sigqueueinfo, | |||
| 3060 | int, sig, | 3371 | int, sig, |
| 3061 | struct compat_siginfo __user *, uinfo) | 3372 | struct compat_siginfo __user *, uinfo) |
| 3062 | { | 3373 | { |
| 3063 | siginfo_t info = {}; | 3374 | siginfo_t info; |
| 3064 | int ret = copy_siginfo_from_user32(&info, uinfo); | 3375 | int ret = copy_siginfo_from_user32(&info, uinfo); |
| 3065 | if (unlikely(ret)) | 3376 | if (unlikely(ret)) |
| 3066 | return ret; | 3377 | return ret; |
| @@ -3104,7 +3415,7 @@ COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo, | |||
| 3104 | int, sig, | 3415 | int, sig, |
| 3105 | struct compat_siginfo __user *, uinfo) | 3416 | struct compat_siginfo __user *, uinfo) |
| 3106 | { | 3417 | { |
| 3107 | siginfo_t info = {}; | 3418 | siginfo_t info; |
| 3108 | 3419 | ||
| 3109 | if (copy_siginfo_from_user32(&info, uinfo)) | 3420 | if (copy_siginfo_from_user32(&info, uinfo)) |
| 3110 | return -EFAULT; | 3421 | return -EFAULT; |
| @@ -3677,6 +3988,7 @@ void __init signals_init(void) | |||
| 3677 | /* If this check fails, the __ARCH_SI_PREAMBLE_SIZE value is wrong! */ | 3988 | /* If this check fails, the __ARCH_SI_PREAMBLE_SIZE value is wrong! */ |
| 3678 | BUILD_BUG_ON(__ARCH_SI_PREAMBLE_SIZE | 3989 | BUILD_BUG_ON(__ARCH_SI_PREAMBLE_SIZE |
| 3679 | != offsetof(struct siginfo, _sifields._pad)); | 3990 | != offsetof(struct siginfo, _sifields._pad)); |
| 3991 | BUILD_BUG_ON(sizeof(struct siginfo) != SI_MAX_SIZE); | ||
| 3680 | 3992 | ||
| 3681 | sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC); | 3993 | sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC); |
| 3682 | } | 3994 | } |
| @@ -3684,26 +3996,25 @@ void __init signals_init(void) | |||
| 3684 | #ifdef CONFIG_KGDB_KDB | 3996 | #ifdef CONFIG_KGDB_KDB |
| 3685 | #include <linux/kdb.h> | 3997 | #include <linux/kdb.h> |
| 3686 | /* | 3998 | /* |
| 3687 | * kdb_send_sig_info - Allows kdb to send signals without exposing | 3999 | * kdb_send_sig - Allows kdb to send signals without exposing |
| 3688 | * signal internals. This function checks if the required locks are | 4000 | * signal internals. This function checks if the required locks are |
| 3689 | * available before calling the main signal code, to avoid kdb | 4001 | * available before calling the main signal code, to avoid kdb |
| 3690 | * deadlocks. | 4002 | * deadlocks. |
| 3691 | */ | 4003 | */ |
| 3692 | void | 4004 | void kdb_send_sig(struct task_struct *t, int sig) |
| 3693 | kdb_send_sig_info(struct task_struct *t, struct siginfo *info) | ||
| 3694 | { | 4005 | { |
| 3695 | static struct task_struct *kdb_prev_t; | 4006 | static struct task_struct *kdb_prev_t; |
| 3696 | int sig, new_t; | 4007 | int new_t, ret; |
| 3697 | if (!spin_trylock(&t->sighand->siglock)) { | 4008 | if (!spin_trylock(&t->sighand->siglock)) { |
| 3698 | kdb_printf("Can't do kill command now.\n" | 4009 | kdb_printf("Can't do kill command now.\n" |
| 3699 | "The sigmask lock is held somewhere else in " | 4010 | "The sigmask lock is held somewhere else in " |
| 3700 | "kernel, try again later\n"); | 4011 | "kernel, try again later\n"); |
| 3701 | return; | 4012 | return; |
| 3702 | } | 4013 | } |
| 3703 | spin_unlock(&t->sighand->siglock); | ||
| 3704 | new_t = kdb_prev_t != t; | 4014 | new_t = kdb_prev_t != t; |
| 3705 | kdb_prev_t = t; | 4015 | kdb_prev_t = t; |
| 3706 | if (t->state != TASK_RUNNING && new_t) { | 4016 | if (t->state != TASK_RUNNING && new_t) { |
| 4017 | spin_unlock(&t->sighand->siglock); | ||
| 3707 | kdb_printf("Process is not RUNNING, sending a signal from " | 4018 | kdb_printf("Process is not RUNNING, sending a signal from " |
| 3708 | "kdb risks deadlock\n" | 4019 | "kdb risks deadlock\n" |
| 3709 | "on the run queue locks. " | 4020 | "on the run queue locks. " |
| @@ -3712,8 +4023,9 @@ kdb_send_sig_info(struct task_struct *t, struct siginfo *info) | |||
| 3712 | "the deadlock.\n"); | 4023 | "the deadlock.\n"); |
| 3713 | return; | 4024 | return; |
| 3714 | } | 4025 | } |
| 3715 | sig = info->si_signo; | 4026 | ret = send_signal(sig, SEND_SIG_PRIV, t, false); |
| 3716 | if (send_sig_info(sig, info, t)) | 4027 | spin_unlock(&t->sighand->siglock); |
| 4028 | if (ret) | ||
| 3717 | kdb_printf("Fail to deliver Signal %d to process %d.\n", | 4029 | kdb_printf("Fail to deliver Signal %d to process %d.\n", |
| 3718 | sig, t->pid); | 4030 | sig, t->pid); |
| 3719 | else | 4031 | else |
diff --git a/kernel/sys.c b/kernel/sys.c index 83ffd7dccf23..f2289de20e19 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
| @@ -135,7 +135,7 @@ EXPORT_SYMBOL(overflowgid); | |||
| 135 | */ | 135 | */ |
| 136 | 136 | ||
| 137 | int fs_overflowuid = DEFAULT_FS_OVERFLOWUID; | 137 | int fs_overflowuid = DEFAULT_FS_OVERFLOWUID; |
| 138 | int fs_overflowgid = DEFAULT_FS_OVERFLOWUID; | 138 | int fs_overflowgid = DEFAULT_FS_OVERFLOWGID; |
| 139 | 139 | ||
| 140 | EXPORT_SYMBOL(fs_overflowuid); | 140 | EXPORT_SYMBOL(fs_overflowuid); |
| 141 | EXPORT_SYMBOL(fs_overflowgid); | 141 | EXPORT_SYMBOL(fs_overflowgid); |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 557d46728577..2fb4e27c636a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
| @@ -1374,13 +1374,6 @@ static struct ctl_table vm_table[] = { | |||
| 1374 | .mode = 0644, | 1374 | .mode = 0644, |
| 1375 | .proc_handler = proc_dointvec, | 1375 | .proc_handler = proc_dointvec, |
| 1376 | }, | 1376 | }, |
| 1377 | { | ||
| 1378 | .procname = "hugepages_treat_as_movable", | ||
| 1379 | .data = &hugepages_treat_as_movable, | ||
| 1380 | .maxlen = sizeof(int), | ||
| 1381 | .mode = 0644, | ||
| 1382 | .proc_handler = proc_dointvec, | ||
| 1383 | }, | ||
| 1384 | { | 1377 | { |
| 1385 | .procname = "nr_overcommit_hugepages", | 1378 | .procname = "nr_overcommit_hugepages", |
| 1386 | .data = NULL, | 1379 | .data = NULL, |
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index cc91d90abd84..94ad46d50b56 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c | |||
| @@ -68,10 +68,10 @@ static ssize_t posix_clock_read(struct file *fp, char __user *buf, | |||
| 68 | return err; | 68 | return err; |
| 69 | } | 69 | } |
| 70 | 70 | ||
| 71 | static unsigned int posix_clock_poll(struct file *fp, poll_table *wait) | 71 | static __poll_t posix_clock_poll(struct file *fp, poll_table *wait) |
| 72 | { | 72 | { |
| 73 | struct posix_clock *clk = get_posix_clock(fp); | 73 | struct posix_clock *clk = get_posix_clock(fp); |
| 74 | unsigned int result = 0; | 74 | __poll_t result = 0; |
| 75 | 75 | ||
| 76 | if (!clk) | 76 | if (!clk) |
| 77 | return POLLERR; | 77 | return POLLERR; |
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index ec999f32c840..75043046914e 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c | |||
| @@ -462,7 +462,7 @@ static struct k_itimer * alloc_posix_timer(void) | |||
| 462 | kmem_cache_free(posix_timers_cache, tmr); | 462 | kmem_cache_free(posix_timers_cache, tmr); |
| 463 | return NULL; | 463 | return NULL; |
| 464 | } | 464 | } |
| 465 | memset(&tmr->sigq->info, 0, sizeof(siginfo_t)); | 465 | clear_siginfo(&tmr->sigq->info); |
| 466 | return tmr; | 466 | return tmr; |
| 467 | } | 467 | } |
| 468 | 468 | ||
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index f54dc62b599c..0b249e2f0c3c 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig | |||
| @@ -530,6 +530,15 @@ config FUNCTION_PROFILER | |||
| 530 | 530 | ||
| 531 | If in doubt, say N. | 531 | If in doubt, say N. |
| 532 | 532 | ||
| 533 | config BPF_KPROBE_OVERRIDE | ||
| 534 | bool "Enable BPF programs to override a kprobed function" | ||
| 535 | depends on BPF_EVENTS | ||
| 536 | depends on FUNCTION_ERROR_INJECTION | ||
| 537 | default n | ||
| 538 | help | ||
| 539 | Allows BPF to override the execution of a probed function and | ||
| 540 | set a different return value. This is used for error injection. | ||
| 541 | |||
| 533 | config FTRACE_MCOUNT_RECORD | 542 | config FTRACE_MCOUNT_RECORD |
| 534 | def_bool y | 543 | def_bool y |
| 535 | depends on DYNAMIC_FTRACE | 544 | depends on DYNAMIC_FTRACE |
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 40207c2a4113..fc2838ac8b78 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c | |||
| @@ -13,6 +13,10 @@ | |||
| 13 | #include <linux/filter.h> | 13 | #include <linux/filter.h> |
| 14 | #include <linux/uaccess.h> | 14 | #include <linux/uaccess.h> |
| 15 | #include <linux/ctype.h> | 15 | #include <linux/ctype.h> |
| 16 | #include <linux/kprobes.h> | ||
| 17 | #include <linux/error-injection.h> | ||
| 18 | |||
| 19 | #include "trace_probe.h" | ||
| 16 | #include "trace.h" | 20 | #include "trace.h" |
| 17 | 21 | ||
| 18 | u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); | 22 | u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); |
| @@ -76,6 +80,23 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) | |||
| 76 | } | 80 | } |
| 77 | EXPORT_SYMBOL_GPL(trace_call_bpf); | 81 | EXPORT_SYMBOL_GPL(trace_call_bpf); |
| 78 | 82 | ||
| 83 | #ifdef CONFIG_BPF_KPROBE_OVERRIDE | ||
| 84 | BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc) | ||
| 85 | { | ||
| 86 | regs_set_return_value(regs, rc); | ||
| 87 | override_function_with_return(regs); | ||
| 88 | return 0; | ||
| 89 | } | ||
| 90 | |||
| 91 | static const struct bpf_func_proto bpf_override_return_proto = { | ||
| 92 | .func = bpf_override_return, | ||
| 93 | .gpl_only = true, | ||
| 94 | .ret_type = RET_INTEGER, | ||
| 95 | .arg1_type = ARG_PTR_TO_CTX, | ||
| 96 | .arg2_type = ARG_ANYTHING, | ||
| 97 | }; | ||
| 98 | #endif | ||
| 99 | |||
| 79 | BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr) | 100 | BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr) |
| 80 | { | 101 | { |
| 81 | int ret; | 102 | int ret; |
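bpf_override_return() gives a kprobe-attached BPF program a way to make the probed function return early with a chosen value, which is the mechanism behind the new BPF_KPROBE_OVERRIDE error-injection support. A minimal sketch of such a program as it might be written against the samples/bpf-style helper header; the probed symbol and the header name are assumptions, and the target must be on the kernel's error-injection opt-in list, as the attach-time check below enforces.

        #include <linux/ptrace.h>
        #include "bpf_helpers.h"            /* assumed helper header, as in samples/bpf */

        SEC("kprobe/open_ctree")            /* illustrative error-injectable target */
        int inject_enomem(struct pt_regs *ctx)
        {
                unsigned long rc = -12;     /* -ENOMEM */

                /* force the probed function to return rc without running its body */
                bpf_override_return(ctx, rc);
                return 0;
        }

        char _license[] SEC("license") = "GPL";   /* the helper is gpl_only */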
| @@ -224,7 +245,7 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, | |||
| 224 | */ | 245 | */ |
| 225 | #define __BPF_TP_EMIT() __BPF_ARG3_TP() | 246 | #define __BPF_TP_EMIT() __BPF_ARG3_TP() |
| 226 | #define __BPF_TP(...) \ | 247 | #define __BPF_TP(...) \ |
| 227 | __trace_printk(1 /* Fake ip will not be printed. */, \ | 248 | __trace_printk(0 /* Fake ip */, \ |
| 228 | fmt, ##__VA_ARGS__) | 249 | fmt, ##__VA_ARGS__) |
| 229 | 250 | ||
| 230 | #define __BPF_ARG1_TP(...) \ | 251 | #define __BPF_ARG1_TP(...) \ |
| @@ -556,6 +577,10 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func | |||
| 556 | return &bpf_get_stackid_proto; | 577 | return &bpf_get_stackid_proto; |
| 557 | case BPF_FUNC_perf_event_read_value: | 578 | case BPF_FUNC_perf_event_read_value: |
| 558 | return &bpf_perf_event_read_value_proto; | 579 | return &bpf_perf_event_read_value_proto; |
| 580 | #ifdef CONFIG_BPF_KPROBE_OVERRIDE | ||
| 581 | case BPF_FUNC_override_return: | ||
| 582 | return &bpf_override_return_proto; | ||
| 583 | #endif | ||
| 559 | default: | 584 | default: |
| 560 | return tracing_func_proto(func_id); | 585 | return tracing_func_proto(func_id); |
| 561 | } | 586 | } |
| @@ -773,6 +798,15 @@ int perf_event_attach_bpf_prog(struct perf_event *event, | |||
| 773 | struct bpf_prog_array *new_array; | 798 | struct bpf_prog_array *new_array; |
| 774 | int ret = -EEXIST; | 799 | int ret = -EEXIST; |
| 775 | 800 | ||
| 801 | /* | ||
| 802 | * Kprobe override only works if they are on the function entry, | ||
| 803 | * and only if they are on the opt-in list. | ||
| 804 | */ | ||
| 805 | if (prog->kprobe_override && | ||
| 806 | (!trace_kprobe_on_func_entry(event->tp_event) || | ||
| 807 | !trace_kprobe_error_injectable(event->tp_event))) | ||
| 808 | return -EINVAL; | ||
| 809 | |||
| 776 | mutex_lock(&bpf_event_mutex); | 810 | mutex_lock(&bpf_event_mutex); |
| 777 | 811 | ||
| 778 | if (event->prog) | 812 | if (event->prog) |
| @@ -825,3 +859,26 @@ void perf_event_detach_bpf_prog(struct perf_event *event) | |||
| 825 | unlock: | 859 | unlock: |
| 826 | mutex_unlock(&bpf_event_mutex); | 860 | mutex_unlock(&bpf_event_mutex); |
| 827 | } | 861 | } |
| 862 | |||
| 863 | int perf_event_query_prog_array(struct perf_event *event, void __user *info) | ||
| 864 | { | ||
| 865 | struct perf_event_query_bpf __user *uquery = info; | ||
| 866 | struct perf_event_query_bpf query = {}; | ||
| 867 | int ret; | ||
| 868 | |||
| 869 | if (!capable(CAP_SYS_ADMIN)) | ||
| 870 | return -EPERM; | ||
| 871 | if (event->attr.type != PERF_TYPE_TRACEPOINT) | ||
| 872 | return -EINVAL; | ||
| 873 | if (copy_from_user(&query, uquery, sizeof(query))) | ||
| 874 | return -EFAULT; | ||
| 875 | |||
| 876 | mutex_lock(&bpf_event_mutex); | ||
| 877 | ret = bpf_prog_array_copy_info(event->tp_event->prog_array, | ||
| 878 | uquery->ids, | ||
| 879 | query.ids_len, | ||
| 880 | &uquery->prog_cnt); | ||
| 881 | mutex_unlock(&bpf_event_mutex); | ||
| 882 | |||
| 883 | return ret; | ||
| 884 | } | ||
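perf_event_query_prog_array() backs a new perf ioctl (PERF_EVENT_IOC_QUERY_BPF in the matching uapi change) that reports which BPF program IDs are attached to a tracepoint perf event. A hedged user-space sketch; it assumes headers from this release or later so that the ioctl number and struct perf_event_query_bpf are available.

        #include <stdio.h>
        #include <stdlib.h>
        #include <sys/ioctl.h>
        #include <linux/perf_event.h>

        /* Query up to 'max' BPF program IDs attached to a tracepoint event. */
        static int query_attached_progs(int perf_fd, unsigned int max)
        {
                struct perf_event_query_bpf *query;
                unsigned int i;

                query = calloc(1, sizeof(*query) + max * sizeof(__u32));
                if (!query)
                        return -1;
                query->ids_len = max;

                if (ioctl(perf_fd, PERF_EVENT_IOC_QUERY_BPF, query)) {
                        perror("PERF_EVENT_IOC_QUERY_BPF");
                        free(query);
                        return -1;
                }

                for (i = 0; i < query->prog_cnt; i++)
                        printf("attached prog id: %u\n", query->ids[i]);
                free(query);
                return 0;
        }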
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 554b517c61a0..dabd9d167d42 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
| @@ -5015,7 +5015,6 @@ int ftrace_regex_release(struct inode *inode, struct file *file) | |||
| 5015 | 5015 | ||
| 5016 | parser = &iter->parser; | 5016 | parser = &iter->parser; |
| 5017 | if (trace_parser_loaded(parser)) { | 5017 | if (trace_parser_loaded(parser)) { |
| 5018 | parser->buffer[parser->idx] = 0; | ||
| 5019 | ftrace_match_records(iter->hash, parser->buffer, parser->idx); | 5018 | ftrace_match_records(iter->hash, parser->buffer, parser->idx); |
| 5020 | } | 5019 | } |
| 5021 | 5020 | ||
| @@ -5329,7 +5328,6 @@ ftrace_graph_release(struct inode *inode, struct file *file) | |||
| 5329 | parser = &fgd->parser; | 5328 | parser = &fgd->parser; |
| 5330 | 5329 | ||
| 5331 | if (trace_parser_loaded((parser))) { | 5330 | if (trace_parser_loaded((parser))) { |
| 5332 | parser->buffer[parser->idx] = 0; | ||
| 5333 | ret = ftrace_graph_set_hash(fgd->new_hash, | 5331 | ret = ftrace_graph_set_hash(fgd->new_hash, |
| 5334 | parser->buffer); | 5332 | parser->buffer); |
| 5335 | } | 5333 | } |
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 5af2842dea96..ca6930e0d25e 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
| @@ -630,7 +630,7 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full) | |||
| 630 | * Returns POLLIN | POLLRDNORM if data exists in the buffers, | 630 | * Returns POLLIN | POLLRDNORM if data exists in the buffers, |
| 631 | * zero otherwise. | 631 | * zero otherwise. |
| 632 | */ | 632 | */ |
| 633 | int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu, | 633 | __poll_t ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu, |
| 634 | struct file *filp, poll_table *poll_table) | 634 | struct file *filp, poll_table *poll_table) |
| 635 | { | 635 | { |
| 636 | struct ring_buffer_per_cpu *cpu_buffer; | 636 | struct ring_buffer_per_cpu *cpu_buffer; |
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4f3a8e24b426..56608538a4ad 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
| @@ -530,8 +530,6 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, | |||
| 530 | ubuf += ret; | 530 | ubuf += ret; |
| 531 | cnt -= ret; | 531 | cnt -= ret; |
| 532 | 532 | ||
| 533 | parser.buffer[parser.idx] = 0; | ||
| 534 | |||
| 535 | ret = -EINVAL; | 533 | ret = -EINVAL; |
| 536 | if (kstrtoul(parser.buffer, 0, &val)) | 534 | if (kstrtoul(parser.buffer, 0, &val)) |
| 537 | break; | 535 | break; |
| @@ -1236,18 +1234,18 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, | |||
| 1236 | cnt--; | 1234 | cnt--; |
| 1237 | } | 1235 | } |
| 1238 | 1236 | ||
| 1237 | parser->idx = 0; | ||
| 1238 | |||
| 1239 | /* only spaces were written */ | 1239 | /* only spaces were written */ |
| 1240 | if (isspace(ch)) { | 1240 | if (isspace(ch) || !ch) { |
| 1241 | *ppos += read; | 1241 | *ppos += read; |
| 1242 | ret = read; | 1242 | ret = read; |
| 1243 | goto out; | 1243 | goto out; |
| 1244 | } | 1244 | } |
| 1245 | |||
| 1246 | parser->idx = 0; | ||
| 1247 | } | 1245 | } |
| 1248 | 1246 | ||
| 1249 | /* read the non-space input */ | 1247 | /* read the non-space input */ |
| 1250 | while (cnt && !isspace(ch)) { | 1248 | while (cnt && !isspace(ch) && ch) { |
| 1251 | if (parser->idx < parser->size - 1) | 1249 | if (parser->idx < parser->size - 1) |
| 1252 | parser->buffer[parser->idx++] = ch; | 1250 | parser->buffer[parser->idx++] = ch; |
| 1253 | else { | 1251 | else { |
| @@ -1262,12 +1260,14 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, | |||
| 1262 | } | 1260 | } |
| 1263 | 1261 | ||
| 1264 | /* We either got finished input or we have to wait for another call. */ | 1262 | /* We either got finished input or we have to wait for another call. */ |
| 1265 | if (isspace(ch)) { | 1263 | if (isspace(ch) || !ch) { |
| 1266 | parser->buffer[parser->idx] = 0; | 1264 | parser->buffer[parser->idx] = 0; |
| 1267 | parser->cont = false; | 1265 | parser->cont = false; |
| 1268 | } else if (parser->idx < parser->size - 1) { | 1266 | } else if (parser->idx < parser->size - 1) { |
| 1269 | parser->cont = true; | 1267 | parser->cont = true; |
| 1270 | parser->buffer[parser->idx++] = ch; | 1268 | parser->buffer[parser->idx++] = ch; |
| 1269 | /* Make sure the parsed string always terminates with '\0'. */ | ||
| 1270 | parser->buffer[parser->idx] = 0; | ||
| 1271 | } else { | 1271 | } else { |
| 1272 | ret = -EINVAL; | 1272 | ret = -EINVAL; |
| 1273 | goto out; | 1273 | goto out; |
| @@ -5616,7 +5616,7 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) | |||
| 5616 | return 0; | 5616 | return 0; |
| 5617 | } | 5617 | } |
| 5618 | 5618 | ||
| 5619 | static unsigned int | 5619 | static __poll_t |
| 5620 | trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_table) | 5620 | trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_table) |
| 5621 | { | 5621 | { |
| 5622 | struct trace_array *tr = iter->tr; | 5622 | struct trace_array *tr = iter->tr; |
| @@ -5635,7 +5635,7 @@ trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_tabl | |||
| 5635 | filp, poll_table); | 5635 | filp, poll_table); |
| 5636 | } | 5636 | } |
| 5637 | 5637 | ||
| 5638 | static unsigned int | 5638 | static __poll_t |
| 5639 | tracing_poll_pipe(struct file *filp, poll_table *poll_table) | 5639 | tracing_poll_pipe(struct file *filp, poll_table *poll_table) |
| 5640 | { | 5640 | { |
| 5641 | struct trace_iterator *iter = filp->private_data; | 5641 | struct trace_iterator *iter = filp->private_data; |
| @@ -6589,7 +6589,7 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp) | |||
| 6589 | return ret; | 6589 | return ret; |
| 6590 | } | 6590 | } |
| 6591 | 6591 | ||
| 6592 | static unsigned int | 6592 | static __poll_t |
| 6593 | tracing_buffers_poll(struct file *filp, poll_table *poll_table) | 6593 | tracing_buffers_poll(struct file *filp, poll_table *poll_table) |
| 6594 | { | 6594 | { |
| 6595 | struct ftrace_buffer_info *info = filp->private_data; | 6595 | struct ftrace_buffer_info *info = filp->private_data; |
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 1b87157edbff..05c7172c6667 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
| @@ -885,8 +885,6 @@ ftrace_event_write(struct file *file, const char __user *ubuf, | |||
| 885 | if (*parser.buffer == '!') | 885 | if (*parser.buffer == '!') |
| 886 | set = 0; | 886 | set = 0; |
| 887 | 887 | ||
| 888 | parser.buffer[parser.idx] = 0; | ||
| 889 | |||
| 890 | ret = ftrace_set_clr_event(tr, parser.buffer + !set, set); | 888 | ret = ftrace_set_clr_event(tr, parser.buffer + !set, set); |
| 891 | if (ret) | 889 | if (ret) |
| 892 | goto out_put; | 890 | goto out_put; |
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 492700c5fb4d..1fad24acd444 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c | |||
| @@ -21,6 +21,7 @@ | |||
| 21 | #include <linux/module.h> | 21 | #include <linux/module.h> |
| 22 | #include <linux/uaccess.h> | 22 | #include <linux/uaccess.h> |
| 23 | #include <linux/rculist.h> | 23 | #include <linux/rculist.h> |
| 24 | #include <linux/error-injection.h> | ||
| 24 | 25 | ||
| 25 | #include "trace_probe.h" | 26 | #include "trace_probe.h" |
| 26 | 27 | ||
| @@ -42,7 +43,6 @@ struct trace_kprobe { | |||
| 42 | (offsetof(struct trace_kprobe, tp.args) + \ | 43 | (offsetof(struct trace_kprobe, tp.args) + \ |
| 43 | (sizeof(struct probe_arg) * (n))) | 44 | (sizeof(struct probe_arg) * (n))) |
| 44 | 45 | ||
| 45 | |||
| 46 | static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk) | 46 | static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk) |
| 47 | { | 47 | { |
| 48 | return tk->rp.handler != NULL; | 48 | return tk->rp.handler != NULL; |
| @@ -87,6 +87,30 @@ static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk) | |||
| 87 | return nhit; | 87 | return nhit; |
| 88 | } | 88 | } |
| 89 | 89 | ||
| 90 | bool trace_kprobe_on_func_entry(struct trace_event_call *call) | ||
| 91 | { | ||
| 92 | struct trace_kprobe *tk = (struct trace_kprobe *)call->data; | ||
| 93 | |||
| 94 | return kprobe_on_func_entry(tk->rp.kp.addr, | ||
| 95 | tk->rp.kp.addr ? NULL : tk->rp.kp.symbol_name, | ||
| 96 | tk->rp.kp.addr ? 0 : tk->rp.kp.offset); | ||
| 97 | } | ||
| 98 | |||
| 99 | bool trace_kprobe_error_injectable(struct trace_event_call *call) | ||
| 100 | { | ||
| 101 | struct trace_kprobe *tk = (struct trace_kprobe *)call->data; | ||
| 102 | unsigned long addr; | ||
| 103 | |||
| 104 | if (tk->symbol) { | ||
| 105 | addr = (unsigned long) | ||
| 106 | kallsyms_lookup_name(trace_kprobe_symbol(tk)); | ||
| 107 | addr += tk->rp.kp.offset; | ||
| 108 | } else { | ||
| 109 | addr = (unsigned long)tk->rp.kp.addr; | ||
| 110 | } | ||
| 111 | return within_error_injection_list(addr); | ||
| 112 | } | ||
| 113 | |||
| 90 | static int register_kprobe_event(struct trace_kprobe *tk); | 114 | static int register_kprobe_event(struct trace_kprobe *tk); |
| 91 | static int unregister_kprobe_event(struct trace_kprobe *tk); | 115 | static int unregister_kprobe_event(struct trace_kprobe *tk); |
| 92 | 116 | ||
| @@ -1170,7 +1194,7 @@ static int kretprobe_event_define_fields(struct trace_event_call *event_call) | |||
| 1170 | #ifdef CONFIG_PERF_EVENTS | 1194 | #ifdef CONFIG_PERF_EVENTS |
| 1171 | 1195 | ||
| 1172 | /* Kprobe profile handler */ | 1196 | /* Kprobe profile handler */ |
| 1173 | static void | 1197 | static int |
| 1174 | kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) | 1198 | kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) |
| 1175 | { | 1199 | { |
| 1176 | struct trace_event_call *call = &tk->tp.call; | 1200 | struct trace_event_call *call = &tk->tp.call; |
| @@ -1179,12 +1203,31 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) | |||
| 1179 | int size, __size, dsize; | 1203 | int size, __size, dsize; |
| 1180 | int rctx; | 1204 | int rctx; |
| 1181 | 1205 | ||
| 1182 | if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs)) | 1206 | if (bpf_prog_array_valid(call)) { |
| 1183 | return; | 1207 | unsigned long orig_ip = instruction_pointer(regs); |
| 1208 | int ret; | ||
| 1209 | |||
| 1210 | ret = trace_call_bpf(call, regs); | ||
| 1211 | |||
| 1212 | /* | ||
| 1213 | * We need to check and see if we modified the pc of the | ||
| 1214 | * pt_regs, and if so clear the kprobe and return 1 so that we | ||
| 1215 | * don't do the single stepping. | ||
| 1216 | * The ftrace kprobe handler leaves it up to us to re-enable | ||
| 1217 | * preemption here before returning if we've modified the ip. | ||
| 1218 | */ | ||
| 1219 | if (orig_ip != instruction_pointer(regs)) { | ||
| 1220 | reset_current_kprobe(); | ||
| 1221 | preempt_enable_no_resched(); | ||
| 1222 | return 1; | ||
| 1223 | } | ||
| 1224 | if (!ret) | ||
| 1225 | return 0; | ||
| 1226 | } | ||
| 1184 | 1227 | ||
| 1185 | head = this_cpu_ptr(call->perf_events); | 1228 | head = this_cpu_ptr(call->perf_events); |
| 1186 | if (hlist_empty(head)) | 1229 | if (hlist_empty(head)) |
| 1187 | return; | 1230 | return 0; |
| 1188 | 1231 | ||
| 1189 | dsize = __get_data_size(&tk->tp, regs); | 1232 | dsize = __get_data_size(&tk->tp, regs); |
| 1190 | __size = sizeof(*entry) + tk->tp.size + dsize; | 1233 | __size = sizeof(*entry) + tk->tp.size + dsize; |
| @@ -1193,13 +1236,14 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) | |||
| 1193 | 1236 | ||
| 1194 | entry = perf_trace_buf_alloc(size, NULL, &rctx); | 1237 | entry = perf_trace_buf_alloc(size, NULL, &rctx); |
| 1195 | if (!entry) | 1238 | if (!entry) |
| 1196 | return; | 1239 | return 0; |
| 1197 | 1240 | ||
| 1198 | entry->ip = (unsigned long)tk->rp.kp.addr; | 1241 | entry->ip = (unsigned long)tk->rp.kp.addr; |
| 1199 | memset(&entry[1], 0, dsize); | 1242 | memset(&entry[1], 0, dsize); |
| 1200 | store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); | 1243 | store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); |
| 1201 | perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, | 1244 | perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, |
| 1202 | head, NULL); | 1245 | head, NULL); |
| 1246 | return 0; | ||
| 1203 | } | 1247 | } |
| 1204 | NOKPROBE_SYMBOL(kprobe_perf_func); | 1248 | NOKPROBE_SYMBOL(kprobe_perf_func); |
| 1205 | 1249 | ||
| @@ -1275,6 +1319,7 @@ static int kprobe_register(struct trace_event_call *event, | |||
| 1275 | static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) | 1319 | static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) |
| 1276 | { | 1320 | { |
| 1277 | struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp); | 1321 | struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp); |
| 1322 | int ret = 0; | ||
| 1278 | 1323 | ||
| 1279 | raw_cpu_inc(*tk->nhit); | 1324 | raw_cpu_inc(*tk->nhit); |
| 1280 | 1325 | ||
| @@ -1282,9 +1327,9 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) | |||
| 1282 | kprobe_trace_func(tk, regs); | 1327 | kprobe_trace_func(tk, regs); |
| 1283 | #ifdef CONFIG_PERF_EVENTS | 1328 | #ifdef CONFIG_PERF_EVENTS |
| 1284 | if (tk->tp.flags & TP_FLAG_PROFILE) | 1329 | if (tk->tp.flags & TP_FLAG_PROFILE) |
| 1285 | kprobe_perf_func(tk, regs); | 1330 | ret = kprobe_perf_func(tk, regs); |
| 1286 | #endif | 1331 | #endif |
| 1287 | return 0; /* We don't tweek kernel, so just return 0 */ | 1332 | return ret; |
| 1288 | } | 1333 | } |
| 1289 | NOKPROBE_SYMBOL(kprobe_dispatcher); | 1334 | NOKPROBE_SYMBOL(kprobe_dispatcher); |
| 1290 | 1335 | ||
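The trace_kprobe.c changes above give kprobe_perf_func() an int return value that kprobe_dispatcher() now propagates: when an attached BPF program rewrites regs->ip (function-return override), the handler resets the current kprobe, re-enables preemption, and returns 1 so the kprobes core skips single-stepping the displaced instruction. A user-space analogue of that contract (fake_regs, run_prog and probe_handler are illustrative names only; the real code additionally calls reset_current_kprobe() and preempt_enable_no_resched(), which the sketch omits):

```c
#include <stdbool.h>
#include <stdio.h>

struct fake_regs { unsigned long ip; };

/* Stand-in for the BPF program: redirects execution and filters the event. */
static bool run_prog(struct fake_regs *regs)
{
	regs->ip = 0xdeadbeef;	/* "override return" style redirect */
	return false;		/* do not emit a perf sample */
}

static int probe_handler(struct fake_regs *regs)
{
	unsigned long orig_ip = regs->ip;
	bool emit = run_prog(regs);

	if (orig_ip != regs->ip)
		return 1;	/* ip rewritten: caller must skip single-step */
	if (!emit)
		return 0;	/* filtered: nothing to record */
	/* ... otherwise build and submit the perf sample here ... */
	return 0;
}

int main(void)
{
	struct fake_regs regs = { .ip = 0x1000 };

	printf("handler returned %d, ip=%#lx\n",
	       probe_handler(&regs), regs.ip);
	return 0;
}
```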
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index fb66e3eaa192..e101c5bb9eda 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h | |||
| @@ -252,6 +252,8 @@ struct symbol_cache; | |||
| 252 | unsigned long update_symbol_cache(struct symbol_cache *sc); | 252 | unsigned long update_symbol_cache(struct symbol_cache *sc); |
| 253 | void free_symbol_cache(struct symbol_cache *sc); | 253 | void free_symbol_cache(struct symbol_cache *sc); |
| 254 | struct symbol_cache *alloc_symbol_cache(const char *sym, long offset); | 254 | struct symbol_cache *alloc_symbol_cache(const char *sym, long offset); |
| 255 | bool trace_kprobe_on_func_entry(struct trace_event_call *call); | ||
| 256 | bool trace_kprobe_error_injectable(struct trace_event_call *call); | ||
| 255 | #else | 257 | #else |
| 256 | /* uprobes do not support symbol fetch methods */ | 258 | /* uprobes do not support symbol fetch methods */ |
| 257 | #define fetch_symbol_u8 NULL | 259 | #define fetch_symbol_u8 NULL |
| @@ -277,6 +279,16 @@ alloc_symbol_cache(const char *sym, long offset) | |||
| 277 | { | 279 | { |
| 278 | return NULL; | 280 | return NULL; |
| 279 | } | 281 | } |
| 282 | |||
| 283 | static inline bool trace_kprobe_on_func_entry(struct trace_event_call *call) | ||
| 284 | { | ||
| 285 | return false; | ||
| 286 | } | ||
| 287 | |||
| 288 | static inline bool trace_kprobe_error_injectable(struct trace_event_call *call) | ||
| 289 | { | ||
| 290 | return false; | ||
| 291 | } | ||
| 280 | #endif /* CONFIG_KPROBE_EVENTS */ | 292 | #endif /* CONFIG_KPROBE_EVENTS */ |
| 281 | 293 | ||
| 282 | struct probe_arg { | 294 | struct probe_arg { |
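The trace_probe.h hunk exports the two new predicates and provides stubs returning false when CONFIG_KPROBE_EVENTS is disabled. A fragment-only sketch (kernel context, not standalone) of how a caller in the BPF/perf attach path might combine them; the helper name kprobe_override_allowed() is hypothetical:

```c
#include "trace_probe.h"

static bool kprobe_override_allowed(struct trace_event_call *call)
{
	/* Return-value override is only safe on a function-entry kprobe
	 * whose target is on the error-injection whitelist. */
	return trace_kprobe_on_func_entry(call) &&
	       trace_kprobe_error_injectable(call);
}
```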
diff --git a/kernel/trace/trace_selftest_dynamic.c b/kernel/trace/trace_selftest_dynamic.c index 8cda06a10d66..c364cf777e1a 100644 --- a/kernel/trace/trace_selftest_dynamic.c +++ b/kernel/trace/trace_selftest_dynamic.c | |||
| @@ -1,13 +1,14 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0 | 1 | // SPDX-License-Identifier: GPL-2.0 |
| 2 | #include <linux/compiler.h> | ||
| 2 | #include "trace.h" | 3 | #include "trace.h" |
| 3 | 4 | ||
| 4 | int DYN_FTRACE_TEST_NAME(void) | 5 | noinline __noclone int DYN_FTRACE_TEST_NAME(void) |
| 5 | { | 6 | { |
| 6 | /* used to call mcount */ | 7 | /* used to call mcount */ |
| 7 | return 0; | 8 | return 0; |
| 8 | } | 9 | } |
| 9 | 10 | ||
| 10 | int DYN_FTRACE_TEST_NAME2(void) | 11 | noinline __noclone int DYN_FTRACE_TEST_NAME2(void) |
| 11 | { | 12 | { |
| 12 | /* used to call mcount */ | 13 | /* used to call mcount */ |
| 13 | return 0; | 14 | return 0; |
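trace_selftest_dynamic.c marks its test functions noinline __noclone so GCC can neither inline them nor emit optimized clones (e.g. .constprop variants); the dynamic ftrace selftest filters on their exact symbol names and needs real out-of-line call sites. A standalone illustration of the attribute spellings (GCC; demo_traceable_func is a made-up name):

```c
/* The function keeps its own out-of-line, un-cloned symbol, which is what
 * a name-based ftrace filter needs to match. */
#define noinline   __attribute__((__noinline__))
#define __noclone  __attribute__((__noclone__))

noinline __noclone int demo_traceable_func(void)
{
	return 0;
}

int main(void)
{
	return demo_traceable_func();
}
```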
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 40592e7b3568..268029ae1be6 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c | |||
| @@ -608,7 +608,7 @@ static int probes_seq_show(struct seq_file *m, void *v) | |||
| 608 | 608 | ||
| 609 | /* Don't print "0x (null)" when offset is 0 */ | 609 | /* Don't print "0x (null)" when offset is 0 */ |
| 610 | if (tu->offset) { | 610 | if (tu->offset) { |
| 611 | seq_printf(m, "0x%p", (void *)tu->offset); | 611 | seq_printf(m, "0x%px", (void *)tu->offset); |
| 612 | } else { | 612 | } else { |
| 613 | switch (sizeof(void *)) { | 613 | switch (sizeof(void *)) { |
| 614 | case 4: | 614 | case 4: |
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 8c34981d90ad..017044c26233 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
| @@ -3807,6 +3807,7 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, | |||
| 3807 | 3807 | ||
| 3808 | return ret; | 3808 | return ret; |
| 3809 | } | 3809 | } |
| 3810 | EXPORT_SYMBOL_GPL(apply_workqueue_attrs); | ||
| 3810 | 3811 | ||
| 3811 | /** | 3812 | /** |
| 3812 | * wq_update_unbound_numa - update NUMA affinity of a wq for CPU hot[un]plug | 3813 | * wq_update_unbound_numa - update NUMA affinity of a wq for CPU hot[un]plug |
| @@ -3940,6 +3941,37 @@ static int wq_clamp_max_active(int max_active, unsigned int flags, | |||
| 3940 | return clamp_val(max_active, 1, lim); | 3941 | return clamp_val(max_active, 1, lim); |
| 3941 | } | 3942 | } |
| 3942 | 3943 | ||
| 3944 | /* | ||
| 3945 | * Workqueues which may be used during memory reclaim should have a rescuer | ||
| 3946 | * to guarantee forward progress. | ||
| 3947 | */ | ||
| 3948 | static int init_rescuer(struct workqueue_struct *wq) | ||
| 3949 | { | ||
| 3950 | struct worker *rescuer; | ||
| 3951 | int ret; | ||
| 3952 | |||
| 3953 | if (!(wq->flags & WQ_MEM_RECLAIM)) | ||
| 3954 | return 0; | ||
| 3955 | |||
| 3956 | rescuer = alloc_worker(NUMA_NO_NODE); | ||
| 3957 | if (!rescuer) | ||
| 3958 | return -ENOMEM; | ||
| 3959 | |||
| 3960 | rescuer->rescue_wq = wq; | ||
| 3961 | rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", wq->name); | ||
| 3962 | ret = PTR_ERR_OR_ZERO(rescuer->task); | ||
| 3963 | if (ret) { | ||
| 3964 | kfree(rescuer); | ||
| 3965 | return ret; | ||
| 3966 | } | ||
| 3967 | |||
| 3968 | wq->rescuer = rescuer; | ||
| 3969 | kthread_bind_mask(rescuer->task, cpu_possible_mask); | ||
| 3970 | wake_up_process(rescuer->task); | ||
| 3971 | |||
| 3972 | return 0; | ||
| 3973 | } | ||
| 3974 | |||
| 3943 | struct workqueue_struct *__alloc_workqueue_key(const char *fmt, | 3975 | struct workqueue_struct *__alloc_workqueue_key(const char *fmt, |
| 3944 | unsigned int flags, | 3976 | unsigned int flags, |
| 3945 | int max_active, | 3977 | int max_active, |
| @@ -4002,29 +4034,8 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, | |||
| 4002 | if (alloc_and_link_pwqs(wq) < 0) | 4034 | if (alloc_and_link_pwqs(wq) < 0) |
| 4003 | goto err_free_wq; | 4035 | goto err_free_wq; |
| 4004 | 4036 | ||
| 4005 | /* | 4037 | if (wq_online && init_rescuer(wq) < 0) |
| 4006 | * Workqueues which may be used during memory reclaim should | 4038 | goto err_destroy; |
| 4007 | * have a rescuer to guarantee forward progress. | ||
| 4008 | */ | ||
| 4009 | if (flags & WQ_MEM_RECLAIM) { | ||
| 4010 | struct worker *rescuer; | ||
| 4011 | |||
| 4012 | rescuer = alloc_worker(NUMA_NO_NODE); | ||
| 4013 | if (!rescuer) | ||
| 4014 | goto err_destroy; | ||
| 4015 | |||
| 4016 | rescuer->rescue_wq = wq; | ||
| 4017 | rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", | ||
| 4018 | wq->name); | ||
| 4019 | if (IS_ERR(rescuer->task)) { | ||
| 4020 | kfree(rescuer); | ||
| 4021 | goto err_destroy; | ||
| 4022 | } | ||
| 4023 | |||
| 4024 | wq->rescuer = rescuer; | ||
| 4025 | kthread_bind_mask(rescuer->task, cpu_possible_mask); | ||
| 4026 | wake_up_process(rescuer->task); | ||
| 4027 | } | ||
| 4028 | 4039 | ||
| 4029 | if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq)) | 4040 | if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq)) |
| 4030 | goto err_destroy; | 4041 | goto err_destroy; |
| @@ -5642,6 +5653,8 @@ int __init workqueue_init(void) | |||
| 5642 | * archs such as power and arm64. As per-cpu pools created | 5653 | * archs such as power and arm64. As per-cpu pools created |
| 5643 | * previously could be missing node hint and unbound pools NUMA | 5654 | * previously could be missing node hint and unbound pools NUMA |
| 5644 | * affinity, fix them up. | 5655 | * affinity, fix them up. |
| 5656 | * | ||
| 5657 | * Also, while iterating workqueues, create rescuers if requested. | ||
| 5645 | */ | 5658 | */ |
| 5646 | wq_numa_init(); | 5659 | wq_numa_init(); |
| 5647 | 5660 | ||
| @@ -5653,8 +5666,12 @@ int __init workqueue_init(void) | |||
| 5653 | } | 5666 | } |
| 5654 | } | 5667 | } |
| 5655 | 5668 | ||
| 5656 | list_for_each_entry(wq, &workqueues, list) | 5669 | list_for_each_entry(wq, &workqueues, list) { |
| 5657 | wq_update_unbound_numa(wq, smp_processor_id(), true); | 5670 | wq_update_unbound_numa(wq, smp_processor_id(), true); |
| 5671 | WARN(init_rescuer(wq), | ||
| 5672 | "workqueue: failed to create early rescuer for %s", | ||
| 5673 | wq->name); | ||
| 5674 | } | ||
| 5658 | 5675 | ||
| 5659 | mutex_unlock(&wq_pool_mutex); | 5676 | mutex_unlock(&wq_pool_mutex); |
| 5660 | 5677 | ||
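The workqueue.c change factors rescuer creation out of __alloc_workqueue_key() into init_rescuer(), which workqueue_init() also calls so that WQ_MEM_RECLAIM workqueues created before kthreads can run still get a rescuer later. The refactored helper also replaces the open-coded IS_ERR() check with PTR_ERR_OR_ZERO(). A standalone sketch of that idiom (MAX_ERRNO and the helpers re-declared locally for illustration only):

```c
#include <stdio.h>

#define MAX_ERRNO	4095
#define IS_ERR_VALUE(x)	((unsigned long)(x) >= (unsigned long)-MAX_ERRNO)

/* An error-encoded pointer collapses to its errno, a valid pointer to 0,
 * so the caller's error path is a single "if (ret)" test. */
static inline long PTR_ERR_OR_ZERO(const void *ptr)
{
	if (IS_ERR_VALUE((unsigned long)ptr))
		return (long)ptr;
	return 0;
}

int main(void)
{
	char buf;
	const void *ok  = &buf;
	const void *bad = (void *)(long)-12;	/* like ERR_PTR(-ENOMEM) */

	printf("ok  -> %ld\n", PTR_ERR_OR_ZERO(ok));	/* 0 */
	printf("bad -> %ld\n", PTR_ERR_OR_ZERO(bad));	/* -12 */
	return 0;
}
```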
