author    Ingo Molnar <mingo@kernel.org>  2018-02-06 15:12:31 -0500
committer Ingo Molnar <mingo@kernel.org>  2018-02-06 15:12:31 -0500
commit    82845079160817cc6ac64e5321bbd935e0a47b3a (patch)
tree      0886d1d52428e9db14536cae4b37db896e7c360a /kernel
parent    32e839dda3ba576943365f0f5817ce5c843137dc (diff)
parent    68c5735eaa5e680e701c9a2d1e3c7880bdf5ab66 (diff)
Merge branch 'linus' into sched/urgent, to resolve conflicts
Conflicts:
	arch/arm64/kernel/entry.S
	arch/x86/Kconfig
	include/linux/sched/mm.h
	kernel/fork.c

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 1
-rw-r--r--  kernel/bpf/Makefile | 2
-rw-r--r--  kernel/bpf/arraymap.c | 49
-rw-r--r--  kernel/bpf/cgroup.c | 15
-rw-r--r--  kernel/bpf/core.c | 409
-rw-r--r--  kernel/bpf/cpumap.c | 31
-rw-r--r--  kernel/bpf/devmap.c | 8
-rw-r--r--  kernel/bpf/disasm.c | 63
-rw-r--r--  kernel/bpf/disasm.h | 29
-rw-r--r--  kernel/bpf/hashtab.c | 103
-rw-r--r--  kernel/bpf/inode.c | 50
-rw-r--r--  kernel/bpf/lpm_trie.c | 98
-rw-r--r--  kernel/bpf/offload.c | 430
-rw-r--r--  kernel/bpf/sockmap.c | 16
-rw-r--r--  kernel/bpf/stackmap.c | 34
-rw-r--r--  kernel/bpf/syscall.c | 214
-rw-r--r--  kernel/bpf/verifier.c | 1451
-rw-r--r--  kernel/cgroup/cgroup.c | 6
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 10
-rw-r--r--  kernel/debug/kdb/kdb_private.h | 2
-rw-r--r--  kernel/events/core.c | 15
-rw-r--r--  kernel/fail_function.c | 349
-rw-r--r--  kernel/fork.c | 484
-rw-r--r--  kernel/irq/autoprobe.c | 2
-rw-r--r--  kernel/irq/chip.c | 6
-rw-r--r--  kernel/irq/debug.h | 14
-rw-r--r--  kernel/irq/internals.h | 2
-rw-r--r--  kernel/kallsyms.c | 46
-rw-r--r--  kernel/livepatch/core.c | 76
-rw-r--r--  kernel/livepatch/transition.c | 116
-rw-r--r--  kernel/livepatch/transition.h | 2
-rw-r--r--  kernel/memremap.c | 174
-rw-r--r--  kernel/module.c | 12
-rw-r--r--  kernel/padata.c | 1
-rw-r--r--  kernel/power/power.h | 3
-rw-r--r--  kernel/printk/printk.c | 219
-rw-r--r--  kernel/ptrace.c | 9
-rw-r--r--  kernel/rcu/update.c | 2
-rw-r--r--  kernel/relay.c | 4
-rw-r--r--  kernel/resource.c | 29
-rw-r--r--  kernel/sched/autogroup.c | 5
-rw-r--r--  kernel/seccomp.c | 108
-rw-r--r--  kernel/signal.c | 354
-rw-r--r--  kernel/sys.c | 2
-rw-r--r--  kernel/sysctl.c | 7
-rw-r--r--  kernel/time/posix-clock.c | 4
-rw-r--r--  kernel/time/posix-timers.c | 2
-rw-r--r--  kernel/trace/Kconfig | 9
-rw-r--r--  kernel/trace/bpf_trace.c | 59
-rw-r--r--  kernel/trace/ftrace.c | 2
-rw-r--r--  kernel/trace/ring_buffer.c | 2
-rw-r--r--  kernel/trace/trace.c | 20
-rw-r--r--  kernel/trace/trace_events.c | 2
-rw-r--r--  kernel/trace/trace_kprobe.c | 61
-rw-r--r--  kernel/trace/trace_probe.h | 12
-rw-r--r--  kernel/trace/trace_selftest_dynamic.c | 5
-rw-r--r--  kernel/trace/trace_uprobe.c | 2
-rw-r--r--  kernel/workqueue.c | 65
58 files changed, 4082 insertions, 1225 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 172d151d429c..f85ae5dfa474 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -81,6 +81,7 @@ obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
 obj-$(CONFIG_GCOV_KERNEL) += gcov/
 obj-$(CONFIG_KCOV) += kcov.o
 obj-$(CONFIG_KPROBES) += kprobes.o
+obj-$(CONFIG_FAIL_FUNCTION) += fail_function.o
 obj-$(CONFIG_KGDB) += debug/
 obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
 obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index e691da0b3bab..a713fd23ec88 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -9,9 +9,11 @@ obj-$(CONFIG_BPF_SYSCALL) += devmap.o
 obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
 obj-$(CONFIG_BPF_SYSCALL) += offload.o
 ifeq ($(CONFIG_STREAM_PARSER),y)
+ifeq ($(CONFIG_INET),y)
 obj-$(CONFIG_BPF_SYSCALL) += sockmap.o
 endif
 endif
+endif
 ifeq ($(CONFIG_PERF_EVENTS),y)
 obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
 endif
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index ab94d304a634..b1f66480135b 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -49,27 +49,35 @@ static int bpf_array_alloc_percpu(struct bpf_array *array)
 }
 
 /* Called from syscall */
-static struct bpf_map *array_map_alloc(union bpf_attr *attr)
+static int array_map_alloc_check(union bpf_attr *attr)
 {
 	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
 	int numa_node = bpf_map_attr_numa_node(attr);
-	u32 elem_size, index_mask, max_entries;
-	bool unpriv = !capable(CAP_SYS_ADMIN);
-	struct bpf_array *array;
-	u64 array_size, mask64;
 
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
 	    attr->value_size == 0 ||
 	    attr->map_flags & ~ARRAY_CREATE_FLAG_MASK ||
 	    (percpu && numa_node != NUMA_NO_NODE))
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 
 	if (attr->value_size > KMALLOC_MAX_SIZE)
 		/* if value_size is bigger, the user space won't be able to
 		 * access the elements.
 		 */
-		return ERR_PTR(-E2BIG);
+		return -E2BIG;
+
+	return 0;
+}
+
+static struct bpf_map *array_map_alloc(union bpf_attr *attr)
+{
+	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
+	int numa_node = bpf_map_attr_numa_node(attr);
+	u32 elem_size, index_mask, max_entries;
+	bool unpriv = !capable(CAP_SYS_ADMIN);
+	struct bpf_array *array;
+	u64 array_size, mask64;
 
 	elem_size = round_up(attr->value_size, 8);
 
@@ -112,12 +120,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 	array->map.unpriv_array = unpriv;
 
 	/* copy mandatory map attributes */
-	array->map.map_type = attr->map_type;
-	array->map.key_size = attr->key_size;
-	array->map.value_size = attr->value_size;
-	array->map.max_entries = attr->max_entries;
-	array->map.map_flags = attr->map_flags;
-	array->map.numa_node = numa_node;
+	bpf_map_init_from_attr(&array->map, attr);
 	array->elem_size = elem_size;
 
 	if (!percpu)
@@ -327,6 +330,7 @@ static void array_map_free(struct bpf_map *map)
 }
 
 const struct bpf_map_ops array_map_ops = {
+	.map_alloc_check = array_map_alloc_check,
 	.map_alloc = array_map_alloc,
 	.map_free = array_map_free,
 	.map_get_next_key = array_map_get_next_key,
@@ -337,6 +341,7 @@ const struct bpf_map_ops array_map_ops = {
 };
 
 const struct bpf_map_ops percpu_array_map_ops = {
+	.map_alloc_check = array_map_alloc_check,
 	.map_alloc = array_map_alloc,
 	.map_free = array_map_free,
 	.map_get_next_key = array_map_get_next_key,
@@ -345,12 +350,12 @@ const struct bpf_map_ops percpu_array_map_ops = {
 	.map_delete_elem = array_map_delete_elem,
 };
 
-static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr)
+static int fd_array_map_alloc_check(union bpf_attr *attr)
 {
 	/* only file descriptors can be stored in this type of map */
 	if (attr->value_size != sizeof(u32))
-		return ERR_PTR(-EINVAL);
-	return array_map_alloc(attr);
+		return -EINVAL;
+	return array_map_alloc_check(attr);
 }
 
 static void fd_array_map_free(struct bpf_map *map)
@@ -474,7 +479,8 @@ void bpf_fd_array_map_clear(struct bpf_map *map)
 }
 
 const struct bpf_map_ops prog_array_map_ops = {
-	.map_alloc = fd_array_map_alloc,
+	.map_alloc_check = fd_array_map_alloc_check,
+	.map_alloc = array_map_alloc,
 	.map_free = fd_array_map_free,
 	.map_get_next_key = array_map_get_next_key,
 	.map_lookup_elem = fd_array_map_lookup_elem,
@@ -561,7 +567,8 @@ static void perf_event_fd_array_release(struct bpf_map *map,
 }
 
 const struct bpf_map_ops perf_event_array_map_ops = {
-	.map_alloc = fd_array_map_alloc,
+	.map_alloc_check = fd_array_map_alloc_check,
+	.map_alloc = array_map_alloc,
 	.map_free = fd_array_map_free,
 	.map_get_next_key = array_map_get_next_key,
 	.map_lookup_elem = fd_array_map_lookup_elem,
@@ -592,7 +599,8 @@ static void cgroup_fd_array_free(struct bpf_map *map)
 }
 
 const struct bpf_map_ops cgroup_array_map_ops = {
-	.map_alloc = fd_array_map_alloc,
+	.map_alloc_check = fd_array_map_alloc_check,
+	.map_alloc = array_map_alloc,
 	.map_free = cgroup_fd_array_free,
 	.map_get_next_key = array_map_get_next_key,
 	.map_lookup_elem = fd_array_map_lookup_elem,
@@ -610,7 +618,7 @@ static struct bpf_map *array_of_map_alloc(union bpf_attr *attr)
 	if (IS_ERR(inner_map_meta))
 		return inner_map_meta;
 
-	map = fd_array_map_alloc(attr);
+	map = array_map_alloc(attr);
 	if (IS_ERR(map)) {
 		bpf_map_meta_free(inner_map_meta);
 		return map;
@@ -673,6 +681,7 @@ static u32 array_of_map_gen_lookup(struct bpf_map *map,
 }
 
 const struct bpf_map_ops array_of_maps_map_ops = {
+	.map_alloc_check = fd_array_map_alloc_check,
 	.map_alloc = array_of_map_alloc,
 	.map_free = array_of_map_free,
 	.map_get_next_key = array_map_get_next_key,
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index b789ab78d28f..c1c0b60d3f2f 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -568,6 +568,8 @@ static bool cgroup_dev_is_valid_access(int off, int size,
 				       enum bpf_access_type type,
 				       struct bpf_insn_access_aux *info)
 {
+	const int size_default = sizeof(__u32);
+
 	if (type == BPF_WRITE)
 		return false;
 
@@ -576,8 +578,17 @@ static bool cgroup_dev_is_valid_access(int off, int size,
 	/* The verifier guarantees that size > 0. */
 	if (off % size != 0)
 		return false;
-	if (size != sizeof(__u32))
-		return false;
+
+	switch (off) {
+	case bpf_ctx_range(struct bpf_cgroup_dev_ctx, access_type):
+		bpf_ctx_record_field_size(info, size_default);
+		if (!bpf_ctx_narrow_access_ok(off, size, size_default))
+			return false;
+		break;
+	default:
+		if (size != size_default)
+			return false;
+	}
 
 	return true;
 }
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 7949e8b8f94e..5f35f93dcab2 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -94,6 +94,7 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
 	fp->pages = size / PAGE_SIZE;
 	fp->aux = aux;
 	fp->aux->prog = fp;
+	fp->jit_requested = ebpf_jit_enabled();
 
 	INIT_LIST_HEAD_RCU(&fp->aux->ksym_lnode);
 
@@ -217,30 +218,40 @@ int bpf_prog_calc_tag(struct bpf_prog *fp)
 	return 0;
 }
 
-static bool bpf_is_jmp_and_has_target(const struct bpf_insn *insn)
-{
-	return BPF_CLASS(insn->code) == BPF_JMP &&
-	       /* Call and Exit are both special jumps with no
-		* target inside the BPF instruction image.
-		*/
-	       BPF_OP(insn->code) != BPF_CALL &&
-	       BPF_OP(insn->code) != BPF_EXIT;
-}
-
 static void bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta)
 {
 	struct bpf_insn *insn = prog->insnsi;
 	u32 i, insn_cnt = prog->len;
+	bool pseudo_call;
+	u8 code;
+	int off;
 
 	for (i = 0; i < insn_cnt; i++, insn++) {
-		if (!bpf_is_jmp_and_has_target(insn))
+		code = insn->code;
+		if (BPF_CLASS(code) != BPF_JMP)
 			continue;
+		if (BPF_OP(code) == BPF_EXIT)
+			continue;
+		if (BPF_OP(code) == BPF_CALL) {
+			if (insn->src_reg == BPF_PSEUDO_CALL)
+				pseudo_call = true;
+			else
+				continue;
+		} else {
+			pseudo_call = false;
+		}
+		off = pseudo_call ? insn->imm : insn->off;
 
 		/* Adjust offset of jmps if we cross boundaries. */
-		if (i < pos && i + insn->off + 1 > pos)
-			insn->off += delta;
-		else if (i > pos + delta && i + insn->off + 1 <= pos + delta)
-			insn->off -= delta;
+		if (i < pos && i + off + 1 > pos)
+			off += delta;
+		else if (i > pos + delta && i + off + 1 <= pos + delta)
+			off -= delta;
+
+		if (pseudo_call)
+			insn->imm = off;
+		else
+			insn->off = off;
 	}
 }
 
@@ -289,6 +300,11 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
 }
 
 #ifdef CONFIG_BPF_JIT
+/* All BPF JIT sysctl knobs here. */
+int bpf_jit_enable   __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON);
+int bpf_jit_harden   __read_mostly;
+int bpf_jit_kallsyms __read_mostly;
+
 static __always_inline void
 bpf_get_prog_addr_region(const struct bpf_prog *prog,
 			 unsigned long *symbol_start,
@@ -370,8 +386,6 @@ static DEFINE_SPINLOCK(bpf_lock);
 static LIST_HEAD(bpf_kallsyms);
 static struct latch_tree_root bpf_tree __cacheline_aligned;
 
-int bpf_jit_kallsyms __read_mostly;
-
 static void bpf_prog_ksym_node_add(struct bpf_prog_aux *aux)
 {
 	WARN_ON_ONCE(!list_empty(&aux->ksym_lnode));
@@ -552,8 +566,6 @@ void __weak bpf_jit_free(struct bpf_prog *fp)
 	bpf_prog_unlock_free(fp);
 }
 
-int bpf_jit_harden __read_mostly;
-
 static int bpf_jit_blind_insn(const struct bpf_insn *from,
 			      const struct bpf_insn *aux,
 			      struct bpf_insn *to_buff)
@@ -711,7 +723,7 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog)
 	struct bpf_insn *insn;
 	int i, rewritten;
 
-	if (!bpf_jit_blinding_enabled())
+	if (!bpf_jit_blinding_enabled(prog) || prog->blinded)
 		return prog;
 
 	clone = bpf_prog_clone_create(prog, GFP_USER);
@@ -753,13 +765,16 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog)
 		i += insn_delta;
 	}
 
+	clone->blinded = 1;
 	return clone;
 }
 #endif /* CONFIG_BPF_JIT */
 
 /* Base function for offset calculation. Needs to go into .text section,
  * therefore keeping it non-static as well; will also be used by JITs
- * anyway later on, so do not let the compiler omit it.
+ * anyway later on, so do not let the compiler omit it. This also needs
+ * to go into kallsyms for correlation from e.g. bpftool, so naming
+ * must not change.
  */
 noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
 {
@@ -767,6 +782,137 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
 }
 EXPORT_SYMBOL_GPL(__bpf_call_base);
 
+/* All UAPI available opcodes. */
+#define BPF_INSN_MAP(INSN_2, INSN_3)	\
+	/* 32 bit ALU operations. */	\
+	/* Register based. */		\
+	INSN_3(ALU, ADD, X),		\
+	INSN_3(ALU, SUB, X),		\
+	INSN_3(ALU, AND, X),		\
+	INSN_3(ALU, OR, X),		\
+	INSN_3(ALU, LSH, X),		\
+	INSN_3(ALU, RSH, X),		\
+	INSN_3(ALU, XOR, X),		\
+	INSN_3(ALU, MUL, X),		\
+	INSN_3(ALU, MOV, X),		\
+	INSN_3(ALU, DIV, X),		\
+	INSN_3(ALU, MOD, X),		\
+	INSN_2(ALU, NEG),		\
+	INSN_3(ALU, END, TO_BE),	\
+	INSN_3(ALU, END, TO_LE),	\
+	/* Immediate based. */		\
+	INSN_3(ALU, ADD, K),		\
+	INSN_3(ALU, SUB, K),		\
+	INSN_3(ALU, AND, K),		\
+	INSN_3(ALU, OR, K),		\
+	INSN_3(ALU, LSH, K),		\
+	INSN_3(ALU, RSH, K),		\
+	INSN_3(ALU, XOR, K),		\
+	INSN_3(ALU, MUL, K),		\
+	INSN_3(ALU, MOV, K),		\
+	INSN_3(ALU, DIV, K),		\
+	INSN_3(ALU, MOD, K),		\
+	/* 64 bit ALU operations. */	\
+	/* Register based. */		\
+	INSN_3(ALU64, ADD, X),		\
+	INSN_3(ALU64, SUB, X),		\
+	INSN_3(ALU64, AND, X),		\
+	INSN_3(ALU64, OR, X),		\
+	INSN_3(ALU64, LSH, X),		\
+	INSN_3(ALU64, RSH, X),		\
+	INSN_3(ALU64, XOR, X),		\
+	INSN_3(ALU64, MUL, X),		\
+	INSN_3(ALU64, MOV, X),		\
+	INSN_3(ALU64, ARSH, X),		\
+	INSN_3(ALU64, DIV, X),		\
+	INSN_3(ALU64, MOD, X),		\
+	INSN_2(ALU64, NEG),		\
+	/* Immediate based. */		\
+	INSN_3(ALU64, ADD, K),		\
+	INSN_3(ALU64, SUB, K),		\
+	INSN_3(ALU64, AND, K),		\
+	INSN_3(ALU64, OR, K),		\
+	INSN_3(ALU64, LSH, K),		\
+	INSN_3(ALU64, RSH, K),		\
+	INSN_3(ALU64, XOR, K),		\
+	INSN_3(ALU64, MUL, K),		\
+	INSN_3(ALU64, MOV, K),		\
+	INSN_3(ALU64, ARSH, K),		\
+	INSN_3(ALU64, DIV, K),		\
+	INSN_3(ALU64, MOD, K),		\
+	/* Call instruction. */		\
+	INSN_2(JMP, CALL),		\
+	/* Exit instruction. */		\
+	INSN_2(JMP, EXIT),		\
+	/* Jump instructions. */	\
+	/* Register based. */		\
+	INSN_3(JMP, JEQ, X),		\
+	INSN_3(JMP, JNE, X),		\
+	INSN_3(JMP, JGT, X),		\
+	INSN_3(JMP, JLT, X),		\
+	INSN_3(JMP, JGE, X),		\
+	INSN_3(JMP, JLE, X),		\
+	INSN_3(JMP, JSGT, X),		\
+	INSN_3(JMP, JSLT, X),		\
+	INSN_3(JMP, JSGE, X),		\
+	INSN_3(JMP, JSLE, X),		\
+	INSN_3(JMP, JSET, X),		\
+	/* Immediate based. */		\
+	INSN_3(JMP, JEQ, K),		\
+	INSN_3(JMP, JNE, K),		\
+	INSN_3(JMP, JGT, K),		\
+	INSN_3(JMP, JLT, K),		\
+	INSN_3(JMP, JGE, K),		\
+	INSN_3(JMP, JLE, K),		\
+	INSN_3(JMP, JSGT, K),		\
+	INSN_3(JMP, JSLT, K),		\
+	INSN_3(JMP, JSGE, K),		\
+	INSN_3(JMP, JSLE, K),		\
+	INSN_3(JMP, JSET, K),		\
+	INSN_2(JMP, JA),		\
+	/* Store instructions. */	\
+	/* Register based. */		\
+	INSN_3(STX, MEM, B),		\
+	INSN_3(STX, MEM, H),		\
+	INSN_3(STX, MEM, W),		\
+	INSN_3(STX, MEM, DW),		\
+	INSN_3(STX, XADD, W),		\
+	INSN_3(STX, XADD, DW),		\
+	/* Immediate based. */		\
+	INSN_3(ST, MEM, B),		\
+	INSN_3(ST, MEM, H),		\
+	INSN_3(ST, MEM, W),		\
+	INSN_3(ST, MEM, DW),		\
+	/* Load instructions. */	\
+	/* Register based. */		\
+	INSN_3(LDX, MEM, B),		\
+	INSN_3(LDX, MEM, H),		\
+	INSN_3(LDX, MEM, W),		\
+	INSN_3(LDX, MEM, DW),		\
+	/* Immediate based. */		\
+	INSN_3(LD, IMM, DW),		\
+	/* Misc (old cBPF carry-over). */	\
+	INSN_3(LD, ABS, B),		\
+	INSN_3(LD, ABS, H),		\
+	INSN_3(LD, ABS, W),		\
+	INSN_3(LD, IND, B),		\
+	INSN_3(LD, IND, H),		\
+	INSN_3(LD, IND, W)
+
+bool bpf_opcode_in_insntable(u8 code)
+{
+#define BPF_INSN_2_TBL(x, y)    [BPF_##x | BPF_##y] = true
+#define BPF_INSN_3_TBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = true
+	static const bool public_insntable[256] = {
+		[0 ... 255] = false,
+		/* Now overwrite non-defaults ... */
+		BPF_INSN_MAP(BPF_INSN_2_TBL, BPF_INSN_3_TBL),
+	};
+#undef BPF_INSN_3_TBL
+#undef BPF_INSN_2_TBL
+	return public_insntable[code];
+}
+
 #ifndef CONFIG_BPF_JIT_ALWAYS_ON
 /**
  *	__bpf_prog_run - run eBPF program on a given context
@@ -775,118 +921,21 @@ EXPORT_SYMBOL_GPL(__bpf_call_base);
  *
  * Decode and execute eBPF instructions.
  */
-static unsigned int ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn,
-				    u64 *stack)
+static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack)
 {
 	u64 tmp;
+#define BPF_INSN_2_LBL(x, y)    [BPF_##x | BPF_##y] = &&x##_##y
+#define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z
 	static const void *jumptable[256] = {
 		[0 ... 255] = &&default_label,
 		/* Now overwrite non-defaults ... */
-		/* 32 bit ALU operations */
-		[BPF_ALU | BPF_ADD | BPF_X] = &&ALU_ADD_X,
-		[BPF_ALU | BPF_ADD | BPF_K] = &&ALU_ADD_K,
-		[BPF_ALU | BPF_SUB | BPF_X] = &&ALU_SUB_X,
-		[BPF_ALU | BPF_SUB | BPF_K] = &&ALU_SUB_K,
-		[BPF_ALU | BPF_AND | BPF_X] = &&ALU_AND_X,
-		[BPF_ALU | BPF_AND | BPF_K] = &&ALU_AND_K,
-		[BPF_ALU | BPF_OR | BPF_X] = &&ALU_OR_X,
-		[BPF_ALU | BPF_OR | BPF_K] = &&ALU_OR_K,
-		[BPF_ALU | BPF_LSH | BPF_X] = &&ALU_LSH_X,
-		[BPF_ALU | BPF_LSH | BPF_K] = &&ALU_LSH_K,
-		[BPF_ALU | BPF_RSH | BPF_X] = &&ALU_RSH_X,
-		[BPF_ALU | BPF_RSH | BPF_K] = &&ALU_RSH_K,
-		[BPF_ALU | BPF_XOR | BPF_X] = &&ALU_XOR_X,
-		[BPF_ALU | BPF_XOR | BPF_K] = &&ALU_XOR_K,
-		[BPF_ALU | BPF_MUL | BPF_X] = &&ALU_MUL_X,
-		[BPF_ALU | BPF_MUL | BPF_K] = &&ALU_MUL_K,
-		[BPF_ALU | BPF_MOV | BPF_X] = &&ALU_MOV_X,
-		[BPF_ALU | BPF_MOV | BPF_K] = &&ALU_MOV_K,
-		[BPF_ALU | BPF_DIV | BPF_X] = &&ALU_DIV_X,
-		[BPF_ALU | BPF_DIV | BPF_K] = &&ALU_DIV_K,
-		[BPF_ALU | BPF_MOD | BPF_X] = &&ALU_MOD_X,
-		[BPF_ALU | BPF_MOD | BPF_K] = &&ALU_MOD_K,
-		[BPF_ALU | BPF_NEG] = &&ALU_NEG,
-		[BPF_ALU | BPF_END | BPF_TO_BE] = &&ALU_END_TO_BE,
-		[BPF_ALU | BPF_END | BPF_TO_LE] = &&ALU_END_TO_LE,
-		/* 64 bit ALU operations */
-		[BPF_ALU64 | BPF_ADD | BPF_X] = &&ALU64_ADD_X,
-		[BPF_ALU64 | BPF_ADD | BPF_K] = &&ALU64_ADD_K,
-		[BPF_ALU64 | BPF_SUB | BPF_X] = &&ALU64_SUB_X,
-		[BPF_ALU64 | BPF_SUB | BPF_K] = &&ALU64_SUB_K,
-		[BPF_ALU64 | BPF_AND | BPF_X] = &&ALU64_AND_X,
-		[BPF_ALU64 | BPF_AND | BPF_K] = &&ALU64_AND_K,
-		[BPF_ALU64 | BPF_OR | BPF_X] = &&ALU64_OR_X,
-		[BPF_ALU64 | BPF_OR | BPF_K] = &&ALU64_OR_K,
-		[BPF_ALU64 | BPF_LSH | BPF_X] = &&ALU64_LSH_X,
-		[BPF_ALU64 | BPF_LSH | BPF_K] = &&ALU64_LSH_K,
-		[BPF_ALU64 | BPF_RSH | BPF_X] = &&ALU64_RSH_X,
-		[BPF_ALU64 | BPF_RSH | BPF_K] = &&ALU64_RSH_K,
-		[BPF_ALU64 | BPF_XOR | BPF_X] = &&ALU64_XOR_X,
-		[BPF_ALU64 | BPF_XOR | BPF_K] = &&ALU64_XOR_K,
-		[BPF_ALU64 | BPF_MUL | BPF_X] = &&ALU64_MUL_X,
-		[BPF_ALU64 | BPF_MUL | BPF_K] = &&ALU64_MUL_K,
-		[BPF_ALU64 | BPF_MOV | BPF_X] = &&ALU64_MOV_X,
-		[BPF_ALU64 | BPF_MOV | BPF_K] = &&ALU64_MOV_K,
-		[BPF_ALU64 | BPF_ARSH | BPF_X] = &&ALU64_ARSH_X,
-		[BPF_ALU64 | BPF_ARSH | BPF_K] = &&ALU64_ARSH_K,
-		[BPF_ALU64 | BPF_DIV | BPF_X] = &&ALU64_DIV_X,
-		[BPF_ALU64 | BPF_DIV | BPF_K] = &&ALU64_DIV_K,
-		[BPF_ALU64 | BPF_MOD | BPF_X] = &&ALU64_MOD_X,
-		[BPF_ALU64 | BPF_MOD | BPF_K] = &&ALU64_MOD_K,
-		[BPF_ALU64 | BPF_NEG] = &&ALU64_NEG,
-		/* Call instruction */
-		[BPF_JMP | BPF_CALL] = &&JMP_CALL,
+		BPF_INSN_MAP(BPF_INSN_2_LBL, BPF_INSN_3_LBL),
+		/* Non-UAPI available opcodes. */
+		[BPF_JMP | BPF_CALL_ARGS] = &&JMP_CALL_ARGS,
 		[BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL,
-		/* Jumps */
-		[BPF_JMP | BPF_JA] = &&JMP_JA,
-		[BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X,
-		[BPF_JMP | BPF_JEQ | BPF_K] = &&JMP_JEQ_K,
-		[BPF_JMP | BPF_JNE | BPF_X] = &&JMP_JNE_X,
-		[BPF_JMP | BPF_JNE | BPF_K] = &&JMP_JNE_K,
-		[BPF_JMP | BPF_JGT | BPF_X] = &&JMP_JGT_X,
-		[BPF_JMP | BPF_JGT | BPF_K] = &&JMP_JGT_K,
-		[BPF_JMP | BPF_JLT | BPF_X] = &&JMP_JLT_X,
-		[BPF_JMP | BPF_JLT | BPF_K] = &&JMP_JLT_K,
-		[BPF_JMP | BPF_JGE | BPF_X] = &&JMP_JGE_X,
-		[BPF_JMP | BPF_JGE | BPF_K] = &&JMP_JGE_K,
-		[BPF_JMP | BPF_JLE | BPF_X] = &&JMP_JLE_X,
-		[BPF_JMP | BPF_JLE | BPF_K] = &&JMP_JLE_K,
-		[BPF_JMP | BPF_JSGT | BPF_X] = &&JMP_JSGT_X,
-		[BPF_JMP | BPF_JSGT | BPF_K] = &&JMP_JSGT_K,
-		[BPF_JMP | BPF_JSLT | BPF_X] = &&JMP_JSLT_X,
-		[BPF_JMP | BPF_JSLT | BPF_K] = &&JMP_JSLT_K,
-		[BPF_JMP | BPF_JSGE | BPF_X] = &&JMP_JSGE_X,
-		[BPF_JMP | BPF_JSGE | BPF_K] = &&JMP_JSGE_K,
-		[BPF_JMP | BPF_JSLE | BPF_X] = &&JMP_JSLE_X,
-		[BPF_JMP | BPF_JSLE | BPF_K] = &&JMP_JSLE_K,
-		[BPF_JMP | BPF_JSET | BPF_X] = &&JMP_JSET_X,
-		[BPF_JMP | BPF_JSET | BPF_K] = &&JMP_JSET_K,
-		/* Program return */
-		[BPF_JMP | BPF_EXIT] = &&JMP_EXIT,
-		/* Store instructions */
-		[BPF_STX | BPF_MEM | BPF_B] = &&STX_MEM_B,
-		[BPF_STX | BPF_MEM | BPF_H] = &&STX_MEM_H,
-		[BPF_STX | BPF_MEM | BPF_W] = &&STX_MEM_W,
-		[BPF_STX | BPF_MEM | BPF_DW] = &&STX_MEM_DW,
-		[BPF_STX | BPF_XADD | BPF_W] = &&STX_XADD_W,
-		[BPF_STX | BPF_XADD | BPF_DW] = &&STX_XADD_DW,
-		[BPF_ST | BPF_MEM | BPF_B] = &&ST_MEM_B,
-		[BPF_ST | BPF_MEM | BPF_H] = &&ST_MEM_H,
-		[BPF_ST | BPF_MEM | BPF_W] = &&ST_MEM_W,
-		[BPF_ST | BPF_MEM | BPF_DW] = &&ST_MEM_DW,
-		/* Load instructions */
-		[BPF_LDX | BPF_MEM | BPF_B] = &&LDX_MEM_B,
-		[BPF_LDX | BPF_MEM | BPF_H] = &&LDX_MEM_H,
-		[BPF_LDX | BPF_MEM | BPF_W] = &&LDX_MEM_W,
-		[BPF_LDX | BPF_MEM | BPF_DW] = &&LDX_MEM_DW,
-		[BPF_LD | BPF_ABS | BPF_W] = &&LD_ABS_W,
-		[BPF_LD | BPF_ABS | BPF_H] = &&LD_ABS_H,
-		[BPF_LD | BPF_ABS | BPF_B] = &&LD_ABS_B,
-		[BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W,
-		[BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H,
-		[BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B,
-		[BPF_LD | BPF_IMM | BPF_DW] = &&LD_IMM_DW,
 	};
+#undef BPF_INSN_3_LBL
+#undef BPF_INSN_2_LBL
 	u32 tail_call_cnt = 0;
 	void *ptr;
 	int off;
@@ -950,14 +999,10 @@ select_insn:
 		(*(s64 *) &DST) >>= IMM;
 		CONT;
 	ALU64_MOD_X:
-		if (unlikely(SRC == 0))
-			return 0;
 		div64_u64_rem(DST, SRC, &tmp);
 		DST = tmp;
 		CONT;
 	ALU_MOD_X:
-		if (unlikely((u32)SRC == 0))
-			return 0;
 		tmp = (u32) DST;
 		DST = do_div(tmp, (u32) SRC);
 		CONT;
@@ -970,13 +1015,9 @@ select_insn:
 		DST = do_div(tmp, (u32) IMM);
 		CONT;
 	ALU64_DIV_X:
-		if (unlikely(SRC == 0))
-			return 0;
 		DST = div64_u64(DST, SRC);
 		CONT;
 	ALU_DIV_X:
-		if (unlikely((u32)SRC == 0))
-			return 0;
 		tmp = (u32) DST;
 		do_div(tmp, (u32) SRC);
 		DST = (u32) tmp;
@@ -1026,6 +1067,13 @@ select_insn:
 					       BPF_R4, BPF_R5);
 		CONT;
 
+	JMP_CALL_ARGS:
+		BPF_R0 = (__bpf_call_base_args + insn->imm)(BPF_R1, BPF_R2,
+							    BPF_R3, BPF_R4,
+							    BPF_R5,
+							    insn + insn->off + 1);
+		CONT;
+
 	JMP_TAIL_CALL: {
 		struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2;
 		struct bpf_array *array = container_of(map, struct bpf_array, map);
@@ -1280,8 +1328,14 @@ load_byte:
 		goto load_byte;
 
 	default_label:
-		/* If we ever reach this, we have a bug somewhere. */
-		WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code);
+		/* If we ever reach this, we have a bug somewhere. Die hard here
+		 * instead of just returning 0; we could be somewhere in a subprog,
+		 * so execution could continue otherwise which we do /not/ want.
+		 *
+		 * Note, verifier whitelists all opcodes in bpf_opcode_in_insntable().
+		 */
+		pr_warn("BPF interpreter: unknown opcode %02x\n", insn->code);
+		BUG_ON(1);
 		return 0;
 }
 STACK_FRAME_NON_STANDARD(___bpf_prog_run); /* jump table */
@@ -1298,6 +1352,23 @@ static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn
 	return ___bpf_prog_run(regs, insn, stack); \
 }
 
+#define PROG_NAME_ARGS(stack_size) __bpf_prog_run_args##stack_size
+#define DEFINE_BPF_PROG_RUN_ARGS(stack_size) \
+static u64 PROG_NAME_ARGS(stack_size)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, \
+				      const struct bpf_insn *insn) \
+{ \
+	u64 stack[stack_size / sizeof(u64)]; \
+	u64 regs[MAX_BPF_REG]; \
+\
+	FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \
+	BPF_R1 = r1; \
+	BPF_R2 = r2; \
+	BPF_R3 = r3; \
+	BPF_R4 = r4; \
+	BPF_R5 = r5; \
+	return ___bpf_prog_run(regs, insn, stack); \
+}
+
 #define EVAL1(FN, X) FN(X)
 #define EVAL2(FN, X, Y...) FN(X) EVAL1(FN, Y)
 #define EVAL3(FN, X, Y...) FN(X) EVAL2(FN, Y)
@@ -1309,6 +1380,10 @@ EVAL6(DEFINE_BPF_PROG_RUN, 32, 64, 96, 128, 160, 192);
 EVAL6(DEFINE_BPF_PROG_RUN, 224, 256, 288, 320, 352, 384);
 EVAL4(DEFINE_BPF_PROG_RUN, 416, 448, 480, 512);
 
+EVAL6(DEFINE_BPF_PROG_RUN_ARGS, 32, 64, 96, 128, 160, 192);
+EVAL6(DEFINE_BPF_PROG_RUN_ARGS, 224, 256, 288, 320, 352, 384);
+EVAL4(DEFINE_BPF_PROG_RUN_ARGS, 416, 448, 480, 512);
+
 #define PROG_NAME_LIST(stack_size) PROG_NAME(stack_size),
 
 static unsigned int (*interpreters[])(const void *ctx,
@@ -1317,11 +1392,33 @@ EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192)
 EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384)
 EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)
 };
+#undef PROG_NAME_LIST
+#define PROG_NAME_LIST(stack_size) PROG_NAME_ARGS(stack_size),
+static u64 (*interpreters_args[])(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5,
+				  const struct bpf_insn *insn) = {
+EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192)
+EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384)
+EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)
+};
+#undef PROG_NAME_LIST
+
+void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth)
+{
+	stack_depth = max_t(u32, stack_depth, 1);
+	insn->off = (s16) insn->imm;
+	insn->imm = interpreters_args[(round_up(stack_depth, 32) / 32) - 1] -
+		    __bpf_call_base_args;
+	insn->code = BPF_JMP | BPF_CALL_ARGS;
+}
 
 #else
-static unsigned int __bpf_prog_ret0(const void *ctx,
-				    const struct bpf_insn *insn)
+static unsigned int __bpf_prog_ret0_warn(const void *ctx,
+					 const struct bpf_insn *insn)
 {
+	/* If this handler ever gets executed, then BPF_JIT_ALWAYS_ON
+	 * is not working properly, so warn about it!
+	 */
+	WARN_ON_ONCE(1);
 	return 0;
 }
 #endif
@@ -1329,6 +1426,9 @@ static unsigned int __bpf_prog_ret0(const void *ctx,
 bool bpf_prog_array_compatible(struct bpf_array *array,
 			       const struct bpf_prog *fp)
 {
+	if (fp->kprobe_override)
+		return false;
+
 	if (!array->owner_prog_type) {
 		/* There's no owner yet where we could check for
 		 * compatibility.
@@ -1378,7 +1478,7 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
 
 	fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1];
 #else
-	fp->bpf_func = __bpf_prog_ret0;
+	fp->bpf_func = __bpf_prog_ret0_warn;
 #endif
 
 	/* eBPF JITs can rewrite the program in case constant
@@ -1481,6 +1581,8 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs,
 	rcu_read_lock();
 	prog = rcu_dereference(progs)->progs;
 	for (; *prog; prog++) {
+		if (*prog == &dummy_bpf_prog.prog)
+			continue;
 		id = (*prog)->aux->id;
 		if (copy_to_user(prog_ids + i, &id, sizeof(id))) {
 			rcu_read_unlock();
@@ -1564,14 +1666,41 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
 	return 0;
 }
 
+int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array,
+			     __u32 __user *prog_ids, u32 request_cnt,
+			     __u32 __user *prog_cnt)
+{
+	u32 cnt = 0;
+
+	if (array)
+		cnt = bpf_prog_array_length(array);
+
+	if (copy_to_user(prog_cnt, &cnt, sizeof(cnt)))
+		return -EFAULT;
+
+	/* return early if user requested only program count or nothing to copy */
+	if (!request_cnt || !cnt)
+		return 0;
+
+	return bpf_prog_array_copy_to_user(array, prog_ids, request_cnt);
+}
+
 static void bpf_prog_free_deferred(struct work_struct *work)
 {
 	struct bpf_prog_aux *aux;
+	int i;
 
 	aux = container_of(work, struct bpf_prog_aux, work);
 	if (bpf_prog_is_dev_bound(aux))
 		bpf_prog_offload_destroy(aux->prog);
-	bpf_jit_free(aux->prog);
+	for (i = 0; i < aux->func_cnt; i++)
+		bpf_jit_free(aux->func[i]);
+	if (aux->func_cnt) {
+		kfree(aux->func);
+		bpf_prog_unlock_free(aux->prog);
+	} else {
+		bpf_jit_free(aux->prog);
+	}
 }
 
 /* Free internal BPF program */
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index ce5b669003b2..fbfdada6caee 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -94,13 +94,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
 	if (!cmap)
 		return ERR_PTR(-ENOMEM);
 
-	/* mandatory map attributes */
-	cmap->map.map_type = attr->map_type;
-	cmap->map.key_size = attr->key_size;
-	cmap->map.value_size = attr->value_size;
-	cmap->map.max_entries = attr->max_entries;
-	cmap->map.map_flags = attr->map_flags;
-	cmap->map.numa_node = bpf_map_attr_numa_node(attr);
+	bpf_map_init_from_attr(&cmap->map, attr);
 
 	/* Pre-limit array size based on NR_CPUS, not final CPU check */
 	if (cmap->map.max_entries > NR_CPUS) {
@@ -143,7 +137,7 @@ free_cmap:
 	return ERR_PTR(err);
 }
 
-void __cpu_map_queue_destructor(void *ptr)
+static void __cpu_map_queue_destructor(void *ptr)
 {
 	/* The tear-down procedure should have made sure that queue is
 	 * empty. See __cpu_map_entry_replace() and work-queue
@@ -222,8 +216,8 @@ static struct xdp_pkt *convert_to_xdp_pkt(struct xdp_buff *xdp)
 	return xdp_pkt;
 }
 
-struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
-				  struct xdp_pkt *xdp_pkt)
+static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
+					 struct xdp_pkt *xdp_pkt)
 {
 	unsigned int frame_size;
 	void *pkt_data_start;
@@ -337,7 +331,8 @@ static int cpu_map_kthread_run(void *data)
 	return 0;
 }
 
-struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, int map_id)
+static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu,
+						       int map_id)
 {
 	gfp_t gfp = GFP_ATOMIC|__GFP_NOWARN;
 	struct bpf_cpu_map_entry *rcpu;
@@ -395,7 +390,7 @@ free_rcu:
 	return NULL;
 }
 
-void __cpu_map_entry_free(struct rcu_head *rcu)
+static void __cpu_map_entry_free(struct rcu_head *rcu)
 {
 	struct bpf_cpu_map_entry *rcpu;
 	int cpu;
@@ -438,8 +433,8 @@ void __cpu_map_entry_free(struct rcu_head *rcu)
  * cpu_map_kthread_stop, which waits for an RCU graze period before
  * stopping kthread, emptying the queue.
  */
-void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
-			     u32 key_cpu, struct bpf_cpu_map_entry *rcpu)
+static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
+				    u32 key_cpu, struct bpf_cpu_map_entry *rcpu)
 {
 	struct bpf_cpu_map_entry *old_rcpu;
 
@@ -451,7 +446,7 @@ void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
 	}
 }
 
-int cpu_map_delete_elem(struct bpf_map *map, void *key)
+static int cpu_map_delete_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
 	u32 key_cpu = *(u32 *)key;
@@ -464,8 +459,8 @@ int cpu_map_delete_elem(struct bpf_map *map, void *key)
 	return 0;
 }
 
-int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
-			u64 map_flags)
+static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
+			       u64 map_flags)
 {
 	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
 	struct bpf_cpu_map_entry *rcpu;
@@ -502,7 +497,7 @@ int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
 	return 0;
 }
 
-void cpu_map_free(struct bpf_map *map)
+static void cpu_map_free(struct bpf_map *map)
 {
 	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
 	int cpu;
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index ebdef54bf7df..565f9ece9115 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -93,13 +93,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 	if (!dtab)
 		return ERR_PTR(-ENOMEM);
 
-	/* mandatory map attributes */
-	dtab->map.map_type = attr->map_type;
-	dtab->map.key_size = attr->key_size;
-	dtab->map.value_size = attr->value_size;
-	dtab->map.max_entries = attr->max_entries;
-	dtab->map.map_flags = attr->map_flags;
-	dtab->map.numa_node = bpf_map_attr_numa_node(attr);
+	bpf_map_init_from_attr(&dtab->map, attr);
 
 	/* make sure page count doesn't overflow */
 	cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
index e682850c9715..8740406df2cd 100644
--- a/kernel/bpf/disasm.c
+++ b/kernel/bpf/disasm.c
@@ -21,10 +21,39 @@ static const char * const func_id_str[] = {
 };
 #undef __BPF_FUNC_STR_FN
 
-const char *func_id_name(int id)
+static const char *__func_get_name(const struct bpf_insn_cbs *cbs,
+				   const struct bpf_insn *insn,
+				   char *buff, size_t len)
 {
 	BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID);
 
+	if (insn->src_reg != BPF_PSEUDO_CALL &&
+	    insn->imm >= 0 && insn->imm < __BPF_FUNC_MAX_ID &&
+	    func_id_str[insn->imm])
+		return func_id_str[insn->imm];
+
+	if (cbs && cbs->cb_call)
+		return cbs->cb_call(cbs->private_data, insn);
+
+	if (insn->src_reg == BPF_PSEUDO_CALL)
+		snprintf(buff, len, "%+d", insn->imm);
+
+	return buff;
+}
+
+static const char *__func_imm_name(const struct bpf_insn_cbs *cbs,
+				   const struct bpf_insn *insn,
+				   u64 full_imm, char *buff, size_t len)
+{
+	if (cbs && cbs->cb_imm)
+		return cbs->cb_imm(cbs->private_data, insn, full_imm);
+
+	snprintf(buff, len, "0x%llx", (unsigned long long)full_imm);
+	return buff;
+}
+
+const char *func_id_name(int id)
+{
 	if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id])
 		return func_id_str[id];
 	else
@@ -83,7 +112,7 @@ static const char *const bpf_jmp_string[16] = {
 	[BPF_EXIT >> 4] = "exit",
 };
 
-static void print_bpf_end_insn(bpf_insn_print_cb verbose,
+static void print_bpf_end_insn(bpf_insn_print_t verbose,
 			       struct bpf_verifier_env *env,
 			       const struct bpf_insn *insn)
 {
@@ -92,9 +121,12 @@ static void print_bpf_end_insn(bpf_insn_print_cb verbose,
 		insn->imm, insn->dst_reg);
 }
 
-void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env,
-		    const struct bpf_insn *insn, bool allow_ptr_leaks)
+void print_bpf_insn(const struct bpf_insn_cbs *cbs,
+		    struct bpf_verifier_env *env,
+		    const struct bpf_insn *insn,
+		    bool allow_ptr_leaks)
 {
+	const bpf_insn_print_t verbose = cbs->cb_print;
 	u8 class = BPF_CLASS(insn->code);
 
 	if (class == BPF_ALU || class == BPF_ALU64) {
@@ -175,12 +207,15 @@ void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env,
 		 */
 		u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
 		bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD;
+		char tmp[64];
 
 		if (map_ptr && !allow_ptr_leaks)
 			imm = 0;
 
-		verbose(env, "(%02x) r%d = 0x%llx\n", insn->code,
-			insn->dst_reg, (unsigned long long)imm);
+		verbose(env, "(%02x) r%d = %s\n",
+			insn->code, insn->dst_reg,
+			__func_imm_name(cbs, insn, imm,
+					tmp, sizeof(tmp)));
 	} else {
 		verbose(env, "BUG_ld_%02x\n", insn->code);
 		return;
@@ -189,8 +224,20 @@ void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env,
 		u8 opcode = BPF_OP(insn->code);
 
 		if (opcode == BPF_CALL) {
-			verbose(env, "(%02x) call %s#%d\n", insn->code,
-				func_id_name(insn->imm), insn->imm);
+			char tmp[64];
+
+			if (insn->src_reg == BPF_PSEUDO_CALL) {
+				verbose(env, "(%02x) call pc%s\n",
+					insn->code,
+					__func_get_name(cbs, insn,
+							tmp, sizeof(tmp)));
+			} else {
+				strcpy(tmp, "unknown");
+				verbose(env, "(%02x) call %s#%d\n", insn->code,
+					__func_get_name(cbs, insn,
+							tmp, sizeof(tmp)),
+					insn->imm);
+			}
 		} else if (insn->code == (BPF_JMP | BPF_JA)) {
 			verbose(env, "(%02x) goto pc%+d\n",
 				insn->code, insn->off);
diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h
index 8de977e420b6..266fe8ee542b 100644
--- a/kernel/bpf/disasm.h
+++ b/kernel/bpf/disasm.h
@@ -17,16 +17,35 @@
 #include <linux/bpf.h>
 #include <linux/kernel.h>
 #include <linux/stringify.h>
+#ifndef __KERNEL__
+#include <stdio.h>
+#include <string.h>
+#endif
+
+struct bpf_verifier_env;
 
 extern const char *const bpf_alu_string[16];
 extern const char *const bpf_class_string[8];
 
 const char *func_id_name(int id);
 
-struct bpf_verifier_env;
-typedef void (*bpf_insn_print_cb)(struct bpf_verifier_env *env,
-				  const char *, ...);
-void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env,
-		    const struct bpf_insn *insn, bool allow_ptr_leaks);
+typedef __printf(2, 3) void (*bpf_insn_print_t)(struct bpf_verifier_env *env,
+						const char *, ...);
+typedef const char *(*bpf_insn_revmap_call_t)(void *private_data,
+					      const struct bpf_insn *insn);
+typedef const char *(*bpf_insn_print_imm_t)(void *private_data,
+					    const struct bpf_insn *insn,
+					    __u64 full_imm);
+
+struct bpf_insn_cbs {
+	bpf_insn_print_t	cb_print;
+	bpf_insn_revmap_call_t	cb_call;
+	bpf_insn_print_imm_t	cb_imm;
+	void			*private_data;
+};
 
+void print_bpf_insn(const struct bpf_insn_cbs *cbs,
+		    struct bpf_verifier_env *env,
+		    const struct bpf_insn *insn,
+		    bool allow_ptr_leaks);
 #endif
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 3905d4bc5b80..b76828f23b49 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -227,7 +227,7 @@ static int alloc_extra_elems(struct bpf_htab *htab)
 }
 
 /* Called from syscall */
-static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
+static int htab_map_alloc_check(union bpf_attr *attr)
 {
 	bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
 		       attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
@@ -241,9 +241,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
 	bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
 	int numa_node = bpf_map_attr_numa_node(attr);
-	struct bpf_htab *htab;
-	int err, i;
-	u64 cost;
 
 	BUILD_BUG_ON(offsetof(struct htab_elem, htab) !=
 		     offsetof(struct htab_elem, hash_node.pprev));
@@ -254,40 +251,68 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	/* LRU implementation is much complicated than other
 	 * maps. Hence, limit to CAP_SYS_ADMIN for now.
 	 */
-		return ERR_PTR(-EPERM);
+		return -EPERM;
 
 	if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK)
 		/* reserved bits should not be used */
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 
 	if (!lru && percpu_lru)
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 
 	if (lru && !prealloc)
-		return ERR_PTR(-ENOTSUPP);
+		return -ENOTSUPP;
 
 	if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru))
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
+
+	/* check sanity of attributes.
+	 * value_size == 0 may be allowed in the future to use map as a set
+	 */
+	if (attr->max_entries == 0 || attr->key_size == 0 ||
+	    attr->value_size == 0)
+		return -EINVAL;
+
+	if (attr->key_size > MAX_BPF_STACK)
+		/* eBPF programs initialize keys on stack, so they cannot be
+		 * larger than max stack size
+		 */
+		return -E2BIG;
+
+	if (attr->value_size >= KMALLOC_MAX_SIZE -
+	    MAX_BPF_STACK - sizeof(struct htab_elem))
+		/* if value_size is bigger, the user space won't be able to
+		 * access the elements via bpf syscall. This check also makes
+		 * sure that the elem_size doesn't overflow and it's
+		 * kmalloc-able later in htab_map_update_elem()
+		 */
+		return -E2BIG;
+
+	return 0;
+}
+
+static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
+{
+	bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+		       attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
+	bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
+		    attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
+	/* percpu_lru means each cpu has its own LRU list.
+	 * it is different from BPF_MAP_TYPE_PERCPU_HASH where
+	 * the map's value itself is percpu. percpu_lru has
+	 * nothing to do with the map's value.
+	 */
+	bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
+	bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
+	struct bpf_htab *htab;
+	int err, i;
+	u64 cost;
 
 	htab = kzalloc(sizeof(*htab), GFP_USER);
 	if (!htab)
 		return ERR_PTR(-ENOMEM);
 
-	/* mandatory map attributes */
-	htab->map.map_type = attr->map_type;
-	htab->map.key_size = attr->key_size;
-	htab->map.value_size = attr->value_size;
-	htab->map.max_entries = attr->max_entries;
-	htab->map.map_flags = attr->map_flags;
-	htab->map.numa_node = numa_node;
-
-	/* check sanity of attributes.
-	 * value_size == 0 may be allowed in the future to use map as a set
-	 */
-	err = -EINVAL;
-	if (htab->map.max_entries == 0 || htab->map.key_size == 0 ||
-	    htab->map.value_size == 0)
-		goto free_htab;
+	bpf_map_init_from_attr(&htab->map, attr);
 
 	if (percpu_lru) {
 		/* ensure each CPU's lru list has >=1 elements.
@@ -304,22 +329,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	/* hash table size must be power of 2 */
 	htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
 
-	err = -E2BIG;
-	if (htab->map.key_size > MAX_BPF_STACK)
-		/* eBPF programs initialize keys on stack, so they cannot be
-		 * larger than max stack size
-		 */
-		goto free_htab;
-
-	if (htab->map.value_size >= KMALLOC_MAX_SIZE -
-	    MAX_BPF_STACK - sizeof(struct htab_elem))
-		/* if value_size is bigger, the user space won't be able to
-		 * access the elements via bpf syscall. This check also makes
-		 * sure that the elem_size doesn't overflow and it's
-		 * kmalloc-able later in htab_map_update_elem()
-		 */
-		goto free_htab;
-
 	htab->elem_size = sizeof(struct htab_elem) +
 			  round_up(htab->map.key_size, 8);
 	if (percpu)
@@ -327,6 +336,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	else
 		htab->elem_size += round_up(htab->map.value_size, 8);
 
+	err = -E2BIG;
 	/* prevent zero size kmalloc and check for u32 overflow */
 	if (htab->n_buckets == 0 ||
 	    htab->n_buckets > U32_MAX / sizeof(struct bucket))
@@ -1143,6 +1153,7 @@ static void htab_map_free(struct bpf_map *map)
 }
 
 const struct bpf_map_ops htab_map_ops = {
+	.map_alloc_check = htab_map_alloc_check,
 	.map_alloc = htab_map_alloc,
 	.map_free = htab_map_free,
 	.map_get_next_key = htab_map_get_next_key,
@@ -1153,6 +1164,7 @@ const struct bpf_map_ops htab_map_ops = {
 };
 
 const struct bpf_map_ops htab_lru_map_ops = {
+	.map_alloc_check = htab_map_alloc_check,
 	.map_alloc = htab_map_alloc,
 	.map_free = htab_map_free,
 	.map_get_next_key = htab_map_get_next_key,
@@ -1236,6 +1248,7 @@ int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
 }
 
 const struct bpf_map_ops htab_percpu_map_ops = {
+	.map_alloc_check = htab_map_alloc_check,
 	.map_alloc = htab_map_alloc,
 	.map_free = htab_map_free,
 	.map_get_next_key = htab_map_get_next_key,
@@ -1245,6 +1258,7 @@ const struct bpf_map_ops htab_percpu_map_ops = {
 };
 
 const struct bpf_map_ops htab_lru_percpu_map_ops = {
+	.map_alloc_check = htab_map_alloc_check,
 	.map_alloc = htab_map_alloc,
 	.map_free = htab_map_free,
 	.map_get_next_key = htab_map_get_next_key,
@@ -1253,11 +1267,11 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = {
 	.map_delete_elem = htab_lru_map_delete_elem,
 };
 
-static struct bpf_map *fd_htab_map_alloc(union bpf_attr *attr)
+static int fd_htab_map_alloc_check(union bpf_attr *attr)
 {
 	if (attr->value_size != sizeof(u32))
-		return ERR_PTR(-EINVAL);
-	return htab_map_alloc(attr);
+		return -EINVAL;
+	return htab_map_alloc_check(attr);
 }
 
 static void fd_htab_map_free(struct bpf_map *map)
@@ -1328,7 +1342,7 @@ static struct bpf_map *htab_of_map_alloc(union bpf_attr *attr)
1328 if (IS_ERR(inner_map_meta)) 1342 if (IS_ERR(inner_map_meta))
1329 return inner_map_meta; 1343 return inner_map_meta;
1330 1344
1331 map = fd_htab_map_alloc(attr); 1345 map = htab_map_alloc(attr);
1332 if (IS_ERR(map)) { 1346 if (IS_ERR(map)) {
1333 bpf_map_meta_free(inner_map_meta); 1347 bpf_map_meta_free(inner_map_meta);
1334 return map; 1348 return map;
@@ -1372,6 +1386,7 @@ static void htab_of_map_free(struct bpf_map *map)
1372} 1386}
1373 1387
1374const struct bpf_map_ops htab_of_maps_map_ops = { 1388const struct bpf_map_ops htab_of_maps_map_ops = {
1389 .map_alloc_check = fd_htab_map_alloc_check,
1375 .map_alloc = htab_of_map_alloc, 1390 .map_alloc = htab_of_map_alloc,
1376 .map_free = htab_of_map_free, 1391 .map_free = htab_of_map_free,
1377 .map_get_next_key = htab_map_get_next_key, 1392 .map_get_next_key = htab_map_get_next_key,
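
Note on the hashtab hunks above: attribute validation is split out of htab_map_alloc() into htab_map_alloc_check() and exposed through the new .map_alloc_check callback, so the core can vet attributes before committing any memory (the syscall.c hunk further down calls it before deciding whether to dispatch to the offload allocator). Below is a minimal sketch of how a map implementation adopts the split; "example_map" and its fields are illustrative only, while the ops layout and bpf_map_init_from_attr() are the interfaces added in this series.

struct example_map {
	struct bpf_map map;
	/* implementation-specific state would follow */
};

/* Reject bad attributes up front, before any allocation happens. */
static int example_map_alloc_check(union bpf_attr *attr)
{
	if (attr->max_entries == 0 || attr->key_size != 4 ||
	    attr->map_flags & ~BPF_F_NUMA_NODE)
		return -EINVAL;
	return 0;
}

/* Called only with attributes that already passed the check above. */
static struct bpf_map *example_map_alloc(union bpf_attr *attr)
{
	struct example_map *emap;

	emap = kzalloc(sizeof(*emap), GFP_USER);
	if (!emap)
		return ERR_PTR(-ENOMEM);
	bpf_map_init_from_attr(&emap->map, attr);
	return &emap->map;
}

const struct bpf_map_ops example_map_ops = {
	.map_alloc_check	= example_map_alloc_check,
	.map_alloc		= example_map_alloc,
	/* .map_free, .map_get_next_key and the element ops as usual */
};
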
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 5bb5e49ef4c3..81e2f6995adb 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -150,39 +150,29 @@ static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
150 return 0; 150 return 0;
151} 151}
152 152
153static int bpf_mkobj_ops(struct inode *dir, struct dentry *dentry, 153static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw,
154 umode_t mode, const struct inode_operations *iops) 154 const struct inode_operations *iops)
155{ 155{
156 struct inode *inode; 156 struct inode *dir = dentry->d_parent->d_inode;
157 157 struct inode *inode = bpf_get_inode(dir->i_sb, dir, mode);
158 inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFREG);
159 if (IS_ERR(inode)) 158 if (IS_ERR(inode))
160 return PTR_ERR(inode); 159 return PTR_ERR(inode);
161 160
162 inode->i_op = iops; 161 inode->i_op = iops;
163 inode->i_private = dentry->d_fsdata; 162 inode->i_private = raw;
164 163
165 bpf_dentry_finalize(dentry, inode, dir); 164 bpf_dentry_finalize(dentry, inode, dir);
166 return 0; 165 return 0;
167} 166}
168 167
169static int bpf_mkobj(struct inode *dir, struct dentry *dentry, umode_t mode, 168static int bpf_mkprog(struct dentry *dentry, umode_t mode, void *arg)
170 dev_t devt)
171{ 169{
172 enum bpf_type type = MINOR(devt); 170 return bpf_mkobj_ops(dentry, mode, arg, &bpf_prog_iops);
173 171}
174 if (MAJOR(devt) != UNNAMED_MAJOR || !S_ISREG(mode) ||
175 dentry->d_fsdata == NULL)
176 return -EPERM;
177 172
178 switch (type) { 173static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg)
179 case BPF_TYPE_PROG: 174{
180 return bpf_mkobj_ops(dir, dentry, mode, &bpf_prog_iops); 175 return bpf_mkobj_ops(dentry, mode, arg, &bpf_map_iops);
181 case BPF_TYPE_MAP:
182 return bpf_mkobj_ops(dir, dentry, mode, &bpf_map_iops);
183 default:
184 return -EPERM;
185 }
186} 176}
187 177
188static struct dentry * 178static struct dentry *
@@ -218,7 +208,6 @@ static int bpf_symlink(struct inode *dir, struct dentry *dentry,
218 208
219static const struct inode_operations bpf_dir_iops = { 209static const struct inode_operations bpf_dir_iops = {
220 .lookup = bpf_lookup, 210 .lookup = bpf_lookup,
221 .mknod = bpf_mkobj,
222 .mkdir = bpf_mkdir, 211 .mkdir = bpf_mkdir,
223 .symlink = bpf_symlink, 212 .symlink = bpf_symlink,
224 .rmdir = simple_rmdir, 213 .rmdir = simple_rmdir,
@@ -234,7 +223,6 @@ static int bpf_obj_do_pin(const struct filename *pathname, void *raw,
234 struct inode *dir; 223 struct inode *dir;
235 struct path path; 224 struct path path;
236 umode_t mode; 225 umode_t mode;
237 dev_t devt;
238 int ret; 226 int ret;
239 227
240 dentry = kern_path_create(AT_FDCWD, pathname->name, &path, 0); 228 dentry = kern_path_create(AT_FDCWD, pathname->name, &path, 0);
@@ -242,9 +230,8 @@ static int bpf_obj_do_pin(const struct filename *pathname, void *raw,
242 return PTR_ERR(dentry); 230 return PTR_ERR(dentry);
243 231
244 mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask()); 232 mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask());
245 devt = MKDEV(UNNAMED_MAJOR, type);
246 233
247 ret = security_path_mknod(&path, dentry, mode, devt); 234 ret = security_path_mknod(&path, dentry, mode, 0);
248 if (ret) 235 if (ret)
249 goto out; 236 goto out;
250 237
@@ -254,9 +241,16 @@ static int bpf_obj_do_pin(const struct filename *pathname, void *raw,
254 goto out; 241 goto out;
255 } 242 }
256 243
257 dentry->d_fsdata = raw; 244 switch (type) {
258 ret = vfs_mknod(dir, dentry, mode, devt); 245 case BPF_TYPE_PROG:
259 dentry->d_fsdata = NULL; 246 ret = vfs_mkobj(dentry, mode, bpf_mkprog, raw);
247 break;
248 case BPF_TYPE_MAP:
249 ret = vfs_mkobj(dentry, mode, bpf_mkmap, raw);
250 break;
251 default:
252 ret = -EPERM;
253 }
260out: 254out:
261 done_path_create(&path, dentry); 255 done_path_create(&path, dentry);
262 return ret; 256 return ret;
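
For context on the inode.c hunk: the old pinning path abused mknod, smuggling the program/map pointer through dentry->d_fsdata and encoding the object type in a fake device number. vfs_mkobj() instead hands the raw pointer straight to a constructor callback while the parent directory is still locked by kern_path_create(). A rough sketch of that calling convention for some other in-kernel object follows; "my_obj", "my_iops" and my_pin_object() are hypothetical names, while vfs_mkobj() and the path helpers are the real interfaces.

static const struct inode_operations my_iops;	/* object's inode ops */

static int my_mkobj(struct dentry *dentry, umode_t mode, void *arg)
{
	struct inode *dir = dentry->d_parent->d_inode;
	struct inode *inode = new_inode(dir->i_sb);

	if (!inode)
		return -ENOSPC;
	inode->i_mode = mode;
	inode->i_op = &my_iops;
	inode->i_private = arg;		/* object pointer arrives directly */
	d_instantiate(dentry, inode);
	return 0;
}

static int my_pin_object(const char *pathname, struct my_obj *obj)
{
	struct dentry *dentry;
	struct path path;
	int err;

	dentry = kern_path_create(AT_FDCWD, pathname, &path, 0);
	if (IS_ERR(dentry))
		return PTR_ERR(dentry);

	err = vfs_mkobj(dentry, S_IFREG | 0600, my_mkobj, obj);

	done_path_create(&path, dentry);
	return err;
}
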
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 885e45479680..7b469d10d0e9 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -522,12 +522,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
522 return ERR_PTR(-ENOMEM); 522 return ERR_PTR(-ENOMEM);
523 523
524 /* copy mandatory map attributes */ 524 /* copy mandatory map attributes */
525 trie->map.map_type = attr->map_type; 525 bpf_map_init_from_attr(&trie->map, attr);
526 trie->map.key_size = attr->key_size;
527 trie->map.value_size = attr->value_size;
528 trie->map.max_entries = attr->max_entries;
529 trie->map.map_flags = attr->map_flags;
530 trie->map.numa_node = bpf_map_attr_numa_node(attr);
531 trie->data_size = attr->key_size - 526 trie->data_size = attr->key_size -
532 offsetof(struct bpf_lpm_trie_key, data); 527 offsetof(struct bpf_lpm_trie_key, data);
533 trie->max_prefixlen = trie->data_size * 8; 528 trie->max_prefixlen = trie->data_size * 8;
@@ -596,9 +591,96 @@ unlock:
596 raw_spin_unlock(&trie->lock); 591 raw_spin_unlock(&trie->lock);
597} 592}
598 593
599static int trie_get_next_key(struct bpf_map *map, void *key, void *next_key) 594static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key)
600{ 595{
601 return -ENOTSUPP; 596 struct lpm_trie_node *node, *next_node = NULL, *parent, *search_root;
597 struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
598 struct bpf_lpm_trie_key *key = _key, *next_key = _next_key;
599 struct lpm_trie_node **node_stack = NULL;
600 int err = 0, stack_ptr = -1;
601 unsigned int next_bit;
602 size_t matchlen;
603
604 /* The get_next_key follows postorder. For the 4 node example in
605 * the top of this file, the trie_get_next_key() returns the following
606 * one after another:
607 * 192.168.0.0/24
608 * 192.168.1.0/24
609 * 192.168.128.0/24
610 * 192.168.0.0/16
611 *
612 * The idea is to return more specific keys before less specific ones.
613 */
614
615 /* Empty trie */
616 search_root = rcu_dereference(trie->root);
617 if (!search_root)
618 return -ENOENT;
619
620 /* For invalid key, find the leftmost node in the trie */
621 if (!key || key->prefixlen > trie->max_prefixlen)
622 goto find_leftmost;
623
624 node_stack = kmalloc(trie->max_prefixlen * sizeof(struct lpm_trie_node *),
625 GFP_ATOMIC | __GFP_NOWARN);
626 if (!node_stack)
627 return -ENOMEM;
628
629 /* Try to find the exact node for the given key */
630 for (node = search_root; node;) {
631 node_stack[++stack_ptr] = node;
632 matchlen = longest_prefix_match(trie, node, key);
633 if (node->prefixlen != matchlen ||
634 node->prefixlen == key->prefixlen)
635 break;
636
637 next_bit = extract_bit(key->data, node->prefixlen);
638 node = rcu_dereference(node->child[next_bit]);
639 }
640 if (!node || node->prefixlen != key->prefixlen ||
641 (node->flags & LPM_TREE_NODE_FLAG_IM))
642 goto find_leftmost;
643
644 /* The node with the exactly-matching key has been found,
645 * find the first node in postorder after the matched node.
646 */
647 node = node_stack[stack_ptr];
648 while (stack_ptr > 0) {
649 parent = node_stack[stack_ptr - 1];
650 if (rcu_dereference(parent->child[0]) == node) {
651 search_root = rcu_dereference(parent->child[1]);
652 if (search_root)
653 goto find_leftmost;
654 }
655 if (!(parent->flags & LPM_TREE_NODE_FLAG_IM)) {
656 next_node = parent;
657 goto do_copy;
658 }
659
660 node = parent;
661 stack_ptr--;
662 }
663
664 /* did not find anything */
665 err = -ENOENT;
666 goto free_stack;
667
668find_leftmost:
669 /* Find the leftmost non-intermediate node, all intermediate nodes
670 * have exact two children, so this function will never return NULL.
671 */
672 for (node = search_root; node;) {
673 if (!(node->flags & LPM_TREE_NODE_FLAG_IM))
674 next_node = node;
675 node = rcu_dereference(node->child[0]);
676 }
677do_copy:
678 next_key->prefixlen = next_node->prefixlen;
679 memcpy((void *)next_key + offsetof(struct bpf_lpm_trie_key, data),
680 next_node->data, trie->data_size);
681free_stack:
682 kfree(node_stack);
683 return err;
602} 684}
603 685
604const struct bpf_map_ops trie_map_ops = { 686const struct bpf_map_ops trie_map_ops = {
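
trie_get_next_key() used to return -ENOTSUPP; with the implementation above, BPF_MAP_GET_NEXT_KEY walks the trie in postorder, so user space can enumerate an LPM map with the most specific prefixes returned first. A rough user-space sketch of that iteration against an IPv4 trie (created with 4 bytes of key data) using the raw bpf(2) syscall; the map fd is assumed to exist already and error handling is trimmed.

#include <linux/bpf.h>
#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

struct ipv4_lpm_key {
	__u32 prefixlen;
	__u8  data[4];
};

static long bpf(int cmd, union bpf_attr *attr)
{
	return syscall(__NR_bpf, cmd, attr, sizeof(*attr));
}

static void dump_lpm_trie(int map_fd)
{
	struct ipv4_lpm_key key, next;
	union bpf_attr attr;
	char buf[INET_ADDRSTRLEN];

	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key = 0;			/* NULL key: start from the leftmost leaf */
	attr.next_key = (__u64)(unsigned long)&next;

	while (!bpf(BPF_MAP_GET_NEXT_KEY, &attr)) {
		inet_ntop(AF_INET, next.data, buf, sizeof(buf));
		printf("%s/%u\n", buf, next.prefixlen);

		key = next;		/* resume after the key just returned */
		attr.key = (__u64)(unsigned long)&key;
	}
}
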
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 8455b89d1bbf..c9401075b58c 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -16,18 +16,35 @@
16#include <linux/bpf.h> 16#include <linux/bpf.h>
17#include <linux/bpf_verifier.h> 17#include <linux/bpf_verifier.h>
18#include <linux/bug.h> 18#include <linux/bug.h>
19#include <linux/kdev_t.h>
19#include <linux/list.h> 20#include <linux/list.h>
20#include <linux/netdevice.h> 21#include <linux/netdevice.h>
21#include <linux/printk.h> 22#include <linux/printk.h>
23#include <linux/proc_ns.h>
22#include <linux/rtnetlink.h> 24#include <linux/rtnetlink.h>
25#include <linux/rwsem.h>
23 26
24/* protected by RTNL */ 27/* Protects bpf_prog_offload_devs, bpf_map_offload_devs and offload members
28 * of all progs.
29 * RTNL lock cannot be taken when holding this lock.
30 */
31static DECLARE_RWSEM(bpf_devs_lock);
25static LIST_HEAD(bpf_prog_offload_devs); 32static LIST_HEAD(bpf_prog_offload_devs);
33static LIST_HEAD(bpf_map_offload_devs);
34
35static int bpf_dev_offload_check(struct net_device *netdev)
36{
37 if (!netdev)
38 return -EINVAL;
39 if (!netdev->netdev_ops->ndo_bpf)
40 return -EOPNOTSUPP;
41 return 0;
42}
26 43
27int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) 44int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr)
28{ 45{
29 struct net *net = current->nsproxy->net_ns; 46 struct bpf_prog_offload *offload;
30 struct bpf_dev_offload *offload; 47 int err;
31 48
32 if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS && 49 if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS &&
33 attr->prog_type != BPF_PROG_TYPE_XDP) 50 attr->prog_type != BPF_PROG_TYPE_XDP)
@@ -41,34 +58,44 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr)
41 return -ENOMEM; 58 return -ENOMEM;
42 59
43 offload->prog = prog; 60 offload->prog = prog;
44 init_waitqueue_head(&offload->verifier_done);
45 61
46 rtnl_lock(); 62 offload->netdev = dev_get_by_index(current->nsproxy->net_ns,
47 offload->netdev = __dev_get_by_index(net, attr->prog_ifindex); 63 attr->prog_ifindex);
48 if (!offload->netdev) { 64 err = bpf_dev_offload_check(offload->netdev);
49 rtnl_unlock(); 65 if (err)
50 kfree(offload); 66 goto err_maybe_put;
51 return -EINVAL;
52 }
53 67
68 down_write(&bpf_devs_lock);
69 if (offload->netdev->reg_state != NETREG_REGISTERED) {
70 err = -EINVAL;
71 goto err_unlock;
72 }
54 prog->aux->offload = offload; 73 prog->aux->offload = offload;
55 list_add_tail(&offload->offloads, &bpf_prog_offload_devs); 74 list_add_tail(&offload->offloads, &bpf_prog_offload_devs);
56 rtnl_unlock(); 75 dev_put(offload->netdev);
76 up_write(&bpf_devs_lock);
57 77
58 return 0; 78 return 0;
79err_unlock:
80 up_write(&bpf_devs_lock);
81err_maybe_put:
82 if (offload->netdev)
83 dev_put(offload->netdev);
84 kfree(offload);
85 return err;
59} 86}
60 87
61static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd, 88static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd,
62 struct netdev_bpf *data) 89 struct netdev_bpf *data)
63{ 90{
64 struct net_device *netdev = prog->aux->offload->netdev; 91 struct bpf_prog_offload *offload = prog->aux->offload;
92 struct net_device *netdev;
65 93
66 ASSERT_RTNL(); 94 ASSERT_RTNL();
67 95
68 if (!netdev) 96 if (!offload)
69 return -ENODEV; 97 return -ENODEV;
70 if (!netdev->netdev_ops->ndo_bpf) 98 netdev = offload->netdev;
71 return -EOPNOTSUPP;
72 99
73 data->command = cmd; 100 data->command = cmd;
74 101
@@ -87,62 +114,63 @@ int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env)
87 if (err) 114 if (err)
88 goto exit_unlock; 115 goto exit_unlock;
89 116
90 env->dev_ops = data.verifier.ops; 117 env->prog->aux->offload->dev_ops = data.verifier.ops;
91
92 env->prog->aux->offload->dev_state = true; 118 env->prog->aux->offload->dev_state = true;
93 env->prog->aux->offload->verifier_running = true;
94exit_unlock: 119exit_unlock:
95 rtnl_unlock(); 120 rtnl_unlock();
96 return err; 121 return err;
97} 122}
98 123
124int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env,
125 int insn_idx, int prev_insn_idx)
126{
127 struct bpf_prog_offload *offload;
128 int ret = -ENODEV;
129
130 down_read(&bpf_devs_lock);
131 offload = env->prog->aux->offload;
132 if (offload)
133 ret = offload->dev_ops->insn_hook(env, insn_idx, prev_insn_idx);
134 up_read(&bpf_devs_lock);
135
136 return ret;
137}
138
99static void __bpf_prog_offload_destroy(struct bpf_prog *prog) 139static void __bpf_prog_offload_destroy(struct bpf_prog *prog)
100{ 140{
101 struct bpf_dev_offload *offload = prog->aux->offload; 141 struct bpf_prog_offload *offload = prog->aux->offload;
102 struct netdev_bpf data = {}; 142 struct netdev_bpf data = {};
103 143
104 /* Caution - if netdev is destroyed before the program, this function
105 * will be called twice.
106 */
107
108 data.offload.prog = prog; 144 data.offload.prog = prog;
109 145
110 if (offload->verifier_running)
111 wait_event(offload->verifier_done, !offload->verifier_running);
112
113 if (offload->dev_state) 146 if (offload->dev_state)
114 WARN_ON(__bpf_offload_ndo(prog, BPF_OFFLOAD_DESTROY, &data)); 147 WARN_ON(__bpf_offload_ndo(prog, BPF_OFFLOAD_DESTROY, &data));
115 148
116 offload->dev_state = false; 149 /* Make sure BPF_PROG_GET_NEXT_ID can't find this dead program */
150 bpf_prog_free_id(prog, true);
151
117 list_del_init(&offload->offloads); 152 list_del_init(&offload->offloads);
118 offload->netdev = NULL; 153 kfree(offload);
154 prog->aux->offload = NULL;
119} 155}
120 156
121void bpf_prog_offload_destroy(struct bpf_prog *prog) 157void bpf_prog_offload_destroy(struct bpf_prog *prog)
122{ 158{
123 struct bpf_dev_offload *offload = prog->aux->offload;
124
125 offload->verifier_running = false;
126 wake_up(&offload->verifier_done);
127
128 rtnl_lock(); 159 rtnl_lock();
129 __bpf_prog_offload_destroy(prog); 160 down_write(&bpf_devs_lock);
161 if (prog->aux->offload)
162 __bpf_prog_offload_destroy(prog);
163 up_write(&bpf_devs_lock);
130 rtnl_unlock(); 164 rtnl_unlock();
131
132 kfree(offload);
133} 165}
134 166
135static int bpf_prog_offload_translate(struct bpf_prog *prog) 167static int bpf_prog_offload_translate(struct bpf_prog *prog)
136{ 168{
137 struct bpf_dev_offload *offload = prog->aux->offload;
138 struct netdev_bpf data = {}; 169 struct netdev_bpf data = {};
139 int ret; 170 int ret;
140 171
141 data.offload.prog = prog; 172 data.offload.prog = prog;
142 173
143 offload->verifier_running = false;
144 wake_up(&offload->verifier_done);
145
146 rtnl_lock(); 174 rtnl_lock();
147 ret = __bpf_offload_ndo(prog, BPF_OFFLOAD_TRANSLATE, &data); 175 ret = __bpf_offload_ndo(prog, BPF_OFFLOAD_TRANSLATE, &data);
148 rtnl_unlock(); 176 rtnl_unlock();
@@ -164,14 +192,323 @@ int bpf_prog_offload_compile(struct bpf_prog *prog)
164 return bpf_prog_offload_translate(prog); 192 return bpf_prog_offload_translate(prog);
165} 193}
166 194
195struct ns_get_path_bpf_prog_args {
196 struct bpf_prog *prog;
197 struct bpf_prog_info *info;
198};
199
200static struct ns_common *bpf_prog_offload_info_fill_ns(void *private_data)
201{
202 struct ns_get_path_bpf_prog_args *args = private_data;
203 struct bpf_prog_aux *aux = args->prog->aux;
204 struct ns_common *ns;
205 struct net *net;
206
207 rtnl_lock();
208 down_read(&bpf_devs_lock);
209
210 if (aux->offload) {
211 args->info->ifindex = aux->offload->netdev->ifindex;
212 net = dev_net(aux->offload->netdev);
213 get_net(net);
214 ns = &net->ns;
215 } else {
216 args->info->ifindex = 0;
217 ns = NULL;
218 }
219
220 up_read(&bpf_devs_lock);
221 rtnl_unlock();
222
223 return ns;
224}
225
226int bpf_prog_offload_info_fill(struct bpf_prog_info *info,
227 struct bpf_prog *prog)
228{
229 struct ns_get_path_bpf_prog_args args = {
230 .prog = prog,
231 .info = info,
232 };
233 struct bpf_prog_aux *aux = prog->aux;
234 struct inode *ns_inode;
235 struct path ns_path;
236 char __user *uinsns;
237 void *res;
238 u32 ulen;
239
240 res = ns_get_path_cb(&ns_path, bpf_prog_offload_info_fill_ns, &args);
241 if (IS_ERR(res)) {
242 if (!info->ifindex)
243 return -ENODEV;
244 return PTR_ERR(res);
245 }
246
247 down_read(&bpf_devs_lock);
248
249 if (!aux->offload) {
250 up_read(&bpf_devs_lock);
251 return -ENODEV;
252 }
253
254 ulen = info->jited_prog_len;
255 info->jited_prog_len = aux->offload->jited_len;
256 if (info->jited_prog_len & ulen) {
257 uinsns = u64_to_user_ptr(info->jited_prog_insns);
258 ulen = min_t(u32, info->jited_prog_len, ulen);
259 if (copy_to_user(uinsns, aux->offload->jited_image, ulen)) {
260 up_read(&bpf_devs_lock);
261 return -EFAULT;
262 }
263 }
264
265 up_read(&bpf_devs_lock);
266
267 ns_inode = ns_path.dentry->d_inode;
268 info->netns_dev = new_encode_dev(ns_inode->i_sb->s_dev);
269 info->netns_ino = ns_inode->i_ino;
270 path_put(&ns_path);
271
272 return 0;
273}
274
167const struct bpf_prog_ops bpf_offload_prog_ops = { 275const struct bpf_prog_ops bpf_offload_prog_ops = {
168}; 276};
169 277
278static int bpf_map_offload_ndo(struct bpf_offloaded_map *offmap,
279 enum bpf_netdev_command cmd)
280{
281 struct netdev_bpf data = {};
282 struct net_device *netdev;
283
284 ASSERT_RTNL();
285
286 data.command = cmd;
287 data.offmap = offmap;
288 /* Caller must make sure netdev is valid */
289 netdev = offmap->netdev;
290
291 return netdev->netdev_ops->ndo_bpf(netdev, &data);
292}
293
294struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr)
295{
296 struct net *net = current->nsproxy->net_ns;
297 struct bpf_offloaded_map *offmap;
298 int err;
299
300 if (!capable(CAP_SYS_ADMIN))
301 return ERR_PTR(-EPERM);
302 if (attr->map_type != BPF_MAP_TYPE_ARRAY &&
303 attr->map_type != BPF_MAP_TYPE_HASH)
304 return ERR_PTR(-EINVAL);
305
306 offmap = kzalloc(sizeof(*offmap), GFP_USER);
307 if (!offmap)
308 return ERR_PTR(-ENOMEM);
309
310 bpf_map_init_from_attr(&offmap->map, attr);
311
312 rtnl_lock();
313 down_write(&bpf_devs_lock);
314 offmap->netdev = __dev_get_by_index(net, attr->map_ifindex);
315 err = bpf_dev_offload_check(offmap->netdev);
316 if (err)
317 goto err_unlock;
318
319 err = bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_ALLOC);
320 if (err)
321 goto err_unlock;
322
323 list_add_tail(&offmap->offloads, &bpf_map_offload_devs);
324 up_write(&bpf_devs_lock);
325 rtnl_unlock();
326
327 return &offmap->map;
328
329err_unlock:
330 up_write(&bpf_devs_lock);
331 rtnl_unlock();
332 kfree(offmap);
333 return ERR_PTR(err);
334}
335
336static void __bpf_map_offload_destroy(struct bpf_offloaded_map *offmap)
337{
338 WARN_ON(bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_FREE));
339 /* Make sure BPF_MAP_GET_NEXT_ID can't find this dead map */
340 bpf_map_free_id(&offmap->map, true);
341 list_del_init(&offmap->offloads);
342 offmap->netdev = NULL;
343}
344
345void bpf_map_offload_map_free(struct bpf_map *map)
346{
347 struct bpf_offloaded_map *offmap = map_to_offmap(map);
348
349 rtnl_lock();
350 down_write(&bpf_devs_lock);
351 if (offmap->netdev)
352 __bpf_map_offload_destroy(offmap);
353 up_write(&bpf_devs_lock);
354 rtnl_unlock();
355
356 kfree(offmap);
357}
358
359int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value)
360{
361 struct bpf_offloaded_map *offmap = map_to_offmap(map);
362 int ret = -ENODEV;
363
364 down_read(&bpf_devs_lock);
365 if (offmap->netdev)
366 ret = offmap->dev_ops->map_lookup_elem(offmap, key, value);
367 up_read(&bpf_devs_lock);
368
369 return ret;
370}
371
372int bpf_map_offload_update_elem(struct bpf_map *map,
373 void *key, void *value, u64 flags)
374{
375 struct bpf_offloaded_map *offmap = map_to_offmap(map);
376 int ret = -ENODEV;
377
378 if (unlikely(flags > BPF_EXIST))
379 return -EINVAL;
380
381 down_read(&bpf_devs_lock);
382 if (offmap->netdev)
383 ret = offmap->dev_ops->map_update_elem(offmap, key, value,
384 flags);
385 up_read(&bpf_devs_lock);
386
387 return ret;
388}
389
390int bpf_map_offload_delete_elem(struct bpf_map *map, void *key)
391{
392 struct bpf_offloaded_map *offmap = map_to_offmap(map);
393 int ret = -ENODEV;
394
395 down_read(&bpf_devs_lock);
396 if (offmap->netdev)
397 ret = offmap->dev_ops->map_delete_elem(offmap, key);
398 up_read(&bpf_devs_lock);
399
400 return ret;
401}
402
403int bpf_map_offload_get_next_key(struct bpf_map *map, void *key, void *next_key)
404{
405 struct bpf_offloaded_map *offmap = map_to_offmap(map);
406 int ret = -ENODEV;
407
408 down_read(&bpf_devs_lock);
409 if (offmap->netdev)
410 ret = offmap->dev_ops->map_get_next_key(offmap, key, next_key);
411 up_read(&bpf_devs_lock);
412
413 return ret;
414}
415
416struct ns_get_path_bpf_map_args {
417 struct bpf_offloaded_map *offmap;
418 struct bpf_map_info *info;
419};
420
421static struct ns_common *bpf_map_offload_info_fill_ns(void *private_data)
422{
423 struct ns_get_path_bpf_map_args *args = private_data;
424 struct ns_common *ns;
425 struct net *net;
426
427 rtnl_lock();
428 down_read(&bpf_devs_lock);
429
430 if (args->offmap->netdev) {
431 args->info->ifindex = args->offmap->netdev->ifindex;
432 net = dev_net(args->offmap->netdev);
433 get_net(net);
434 ns = &net->ns;
435 } else {
436 args->info->ifindex = 0;
437 ns = NULL;
438 }
439
440 up_read(&bpf_devs_lock);
441 rtnl_unlock();
442
443 return ns;
444}
445
446int bpf_map_offload_info_fill(struct bpf_map_info *info, struct bpf_map *map)
447{
448 struct ns_get_path_bpf_map_args args = {
449 .offmap = map_to_offmap(map),
450 .info = info,
451 };
452 struct inode *ns_inode;
453 struct path ns_path;
454 void *res;
455
456 res = ns_get_path_cb(&ns_path, bpf_map_offload_info_fill_ns, &args);
457 if (IS_ERR(res)) {
458 if (!info->ifindex)
459 return -ENODEV;
460 return PTR_ERR(res);
461 }
462
463 ns_inode = ns_path.dentry->d_inode;
464 info->netns_dev = new_encode_dev(ns_inode->i_sb->s_dev);
465 info->netns_ino = ns_inode->i_ino;
466 path_put(&ns_path);
467
468 return 0;
469}
470
471bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map)
472{
473 struct bpf_offloaded_map *offmap;
474 struct bpf_prog_offload *offload;
475 bool ret;
476
477 if (!bpf_prog_is_dev_bound(prog->aux) || !bpf_map_is_dev_bound(map))
478 return false;
479
480 down_read(&bpf_devs_lock);
481 offload = prog->aux->offload;
482 offmap = map_to_offmap(map);
483
484 ret = offload && offload->netdev == offmap->netdev;
485 up_read(&bpf_devs_lock);
486
487 return ret;
488}
489
490static void bpf_offload_orphan_all_progs(struct net_device *netdev)
491{
492 struct bpf_prog_offload *offload, *tmp;
493
494 list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs, offloads)
495 if (offload->netdev == netdev)
496 __bpf_prog_offload_destroy(offload->prog);
497}
498
499static void bpf_offload_orphan_all_maps(struct net_device *netdev)
500{
501 struct bpf_offloaded_map *offmap, *tmp;
502
503 list_for_each_entry_safe(offmap, tmp, &bpf_map_offload_devs, offloads)
504 if (offmap->netdev == netdev)
505 __bpf_map_offload_destroy(offmap);
506}
507
170static int bpf_offload_notification(struct notifier_block *notifier, 508static int bpf_offload_notification(struct notifier_block *notifier,
171 ulong event, void *ptr) 509 ulong event, void *ptr)
172{ 510{
173 struct net_device *netdev = netdev_notifier_info_to_dev(ptr); 511 struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
174 struct bpf_dev_offload *offload, *tmp;
175 512
176 ASSERT_RTNL(); 513 ASSERT_RTNL();
177 514
@@ -181,11 +518,10 @@ static int bpf_offload_notification(struct notifier_block *notifier,
181 if (netdev->reg_state != NETREG_UNREGISTERING) 518 if (netdev->reg_state != NETREG_UNREGISTERING)
182 break; 519 break;
183 520
184 list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs, 521 down_write(&bpf_devs_lock);
185 offloads) { 522 bpf_offload_orphan_all_progs(netdev);
186 if (offload->netdev == netdev) 523 bpf_offload_orphan_all_maps(netdev);
187 __bpf_prog_offload_destroy(offload->prog); 524 up_write(&bpf_devs_lock);
188 }
189 break; 525 break;
190 default: 526 default:
191 break; 527 break;
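
The offload.c rework above drops the verifier waitqueue, guards the offload lists with the bpf_devs_lock rwsem, and adds map offload: BPF_OFFLOAD_MAP_ALLOC/FREE are forwarded to the device via ndo_bpf, and per-element operations are routed through offmap->dev_ops under the read lock. Below is a sketch of the driver half those hooks expect; "exdev" and its hw helpers are a hypothetical driver, while struct netdev_bpf, struct bpf_map_dev_ops and the commands are the interfaces used by bpf_map_offload_ndo() and the bpf_map_offload_*_elem() helpers above.

/* Device-access helpers of the hypothetical driver. */
void *exdev_hw_map_alloc(struct net_device *netdev,
			 struct bpf_offloaded_map *offmap);
void exdev_hw_map_free(void *priv);
int exdev_hw_lookup(void *priv, void *key, void *value);

static int exdev_map_lookup(struct bpf_offloaded_map *offmap,
			    void *key, void *value)
{
	/* called by bpf_map_offload_lookup_elem() with bpf_devs_lock held */
	return exdev_hw_lookup(offmap->dev_priv, key, value);
}

static const struct bpf_map_dev_ops exdev_map_dev_ops = {
	.map_lookup_elem	= exdev_map_lookup,
	/* .map_update_elem, .map_delete_elem, .map_get_next_key likewise */
};

static int exdev_ndo_bpf(struct net_device *netdev, struct netdev_bpf *bpf)
{
	switch (bpf->command) {
	case BPF_OFFLOAD_MAP_ALLOC:
		/* reserve device state, then publish the element ops */
		bpf->offmap->dev_priv = exdev_hw_map_alloc(netdev, bpf->offmap);
		if (!bpf->offmap->dev_priv)
			return -ENOMEM;
		bpf->offmap->dev_ops = &exdev_map_dev_ops;
		return 0;
	case BPF_OFFLOAD_MAP_FREE:
		exdev_hw_map_free(bpf->offmap->dev_priv);
		return 0;
	default:
		return -EINVAL;
	}
}
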
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 1712d319c2d8..0314d1783d77 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -96,14 +96,6 @@ static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
96 return rcu_dereference_sk_user_data(sk); 96 return rcu_dereference_sk_user_data(sk);
97} 97}
98 98
99/* compute the linear packet data range [data, data_end) for skb when
100 * sk_skb type programs are in use.
101 */
102static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb)
103{
104 TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb);
105}
106
107enum __sk_action { 99enum __sk_action {
108 __SK_DROP = 0, 100 __SK_DROP = 0,
109 __SK_PASS, 101 __SK_PASS,
@@ -521,13 +513,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
521 if (!stab) 513 if (!stab)
522 return ERR_PTR(-ENOMEM); 514 return ERR_PTR(-ENOMEM);
523 515
524 /* mandatory map attributes */ 516 bpf_map_init_from_attr(&stab->map, attr);
525 stab->map.map_type = attr->map_type;
526 stab->map.key_size = attr->key_size;
527 stab->map.value_size = attr->value_size;
528 stab->map.max_entries = attr->max_entries;
529 stab->map.map_flags = attr->map_flags;
530 stab->map.numa_node = bpf_map_attr_numa_node(attr);
531 517
532 /* make sure page count doesn't overflow */ 518 /* make sure page count doesn't overflow */
533 cost = (u64) stab->map.max_entries * sizeof(struct sock *); 519 cost = (u64) stab->map.max_entries * sizeof(struct sock *);
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index a15bc636cc98..b0ecf43f5894 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -88,14 +88,10 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
88 if (cost >= U32_MAX - PAGE_SIZE) 88 if (cost >= U32_MAX - PAGE_SIZE)
89 goto free_smap; 89 goto free_smap;
90 90
91 smap->map.map_type = attr->map_type; 91 bpf_map_init_from_attr(&smap->map, attr);
92 smap->map.key_size = attr->key_size;
93 smap->map.value_size = value_size; 92 smap->map.value_size = value_size;
94 smap->map.max_entries = attr->max_entries;
95 smap->map.map_flags = attr->map_flags;
96 smap->n_buckets = n_buckets; 93 smap->n_buckets = n_buckets;
97 smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; 94 smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
98 smap->map.numa_node = bpf_map_attr_numa_node(attr);
99 95
100 err = bpf_map_precharge_memlock(smap->map.pages); 96 err = bpf_map_precharge_memlock(smap->map.pages);
101 if (err) 97 if (err)
@@ -226,9 +222,33 @@ int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
226 return 0; 222 return 0;
227} 223}
228 224
229static int stack_map_get_next_key(struct bpf_map *map, void *key, void *next_key) 225static int stack_map_get_next_key(struct bpf_map *map, void *key,
226 void *next_key)
230{ 227{
231 return -EINVAL; 228 struct bpf_stack_map *smap = container_of(map,
229 struct bpf_stack_map, map);
230 u32 id;
231
232 WARN_ON_ONCE(!rcu_read_lock_held());
233
234 if (!key) {
235 id = 0;
236 } else {
237 id = *(u32 *)key;
238 if (id >= smap->n_buckets || !smap->buckets[id])
239 id = 0;
240 else
241 id++;
242 }
243
244 while (id < smap->n_buckets && !smap->buckets[id])
245 id++;
246
247 if (id >= smap->n_buckets)
248 return -ENOENT;
249
250 *(u32 *)next_key = id;
251 return 0;
232} 252}
233 253
234static int stack_map_update_elem(struct bpf_map *map, void *key, void *value, 254static int stack_map_update_elem(struct bpf_map *map, void *key, void *value,
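
stack_map_get_next_key() likewise stops returning -EINVAL and scans the bucket array, so user space can finally enumerate (and typically garbage-collect) the stack IDs held in a BPF_MAP_TYPE_STACK_TRACE map. A rough user-space sketch of draining such a map with the raw bpf(2) syscall; it assumes the map stores at most MAX_STACK_DEPTH frames per stack, and error handling is trimmed.

#include <linux/bpf.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

#define MAX_STACK_DEPTH	127	/* default perf_event_max_stack */

static long bpf(int cmd, union bpf_attr *attr)
{
	return syscall(__NR_bpf, cmd, attr, sizeof(*attr));
}

static void drain_stack_map(int map_fd)
{
	__u64 ips[MAX_STACK_DEPTH];
	__u32 stack_id, next_id;
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key = 0;			/* NULL key: return the first stored id */
	attr.next_key = (__u64)(unsigned long)&next_id;

	while (!bpf(BPF_MAP_GET_NEXT_KEY, &attr)) {
		union bpf_attr elem;

		memset(&elem, 0, sizeof(elem));
		elem.map_fd = map_fd;
		elem.key = (__u64)(unsigned long)&next_id;
		elem.value = (__u64)(unsigned long)ips;
		if (!bpf(BPF_MAP_LOOKUP_ELEM, &elem))
			printf("stack id %u, first ip 0x%llx\n",
			       next_id, (unsigned long long)ips[0]);

		elem.value = 0;		/* DELETE only takes map_fd + key */
		bpf(BPF_MAP_DELETE_ELEM, &elem);

		stack_id = next_id;	/* resume the walk after this id */
		attr.key = (__u64)(unsigned long)&stack_id;
	}
}
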
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 5cb783fc8224..e24aa3241387 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -94,18 +94,34 @@ static int check_uarg_tail_zero(void __user *uaddr,
94 return 0; 94 return 0;
95} 95}
96 96
97const struct bpf_map_ops bpf_map_offload_ops = {
98 .map_alloc = bpf_map_offload_map_alloc,
99 .map_free = bpf_map_offload_map_free,
100};
101
97static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) 102static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
98{ 103{
104 const struct bpf_map_ops *ops;
99 struct bpf_map *map; 105 struct bpf_map *map;
106 int err;
100 107
101 if (attr->map_type >= ARRAY_SIZE(bpf_map_types) || 108 if (attr->map_type >= ARRAY_SIZE(bpf_map_types))
102 !bpf_map_types[attr->map_type]) 109 return ERR_PTR(-EINVAL);
110 ops = bpf_map_types[attr->map_type];
111 if (!ops)
103 return ERR_PTR(-EINVAL); 112 return ERR_PTR(-EINVAL);
104 113
105 map = bpf_map_types[attr->map_type]->map_alloc(attr); 114 if (ops->map_alloc_check) {
115 err = ops->map_alloc_check(attr);
116 if (err)
117 return ERR_PTR(err);
118 }
119 if (attr->map_ifindex)
120 ops = &bpf_map_offload_ops;
121 map = ops->map_alloc(attr);
106 if (IS_ERR(map)) 122 if (IS_ERR(map))
107 return map; 123 return map;
108 map->ops = bpf_map_types[attr->map_type]; 124 map->ops = ops;
109 map->map_type = attr->map_type; 125 map->map_type = attr->map_type;
110 return map; 126 return map;
111} 127}
@@ -134,6 +150,16 @@ void bpf_map_area_free(void *area)
134 kvfree(area); 150 kvfree(area);
135} 151}
136 152
153void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr)
154{
155 map->map_type = attr->map_type;
156 map->key_size = attr->key_size;
157 map->value_size = attr->value_size;
158 map->max_entries = attr->max_entries;
159 map->map_flags = attr->map_flags;
160 map->numa_node = bpf_map_attr_numa_node(attr);
161}
162
137int bpf_map_precharge_memlock(u32 pages) 163int bpf_map_precharge_memlock(u32 pages)
138{ 164{
139 struct user_struct *user = get_current_user(); 165 struct user_struct *user = get_current_user();
@@ -189,16 +215,25 @@ static int bpf_map_alloc_id(struct bpf_map *map)
189 return id > 0 ? 0 : id; 215 return id > 0 ? 0 : id;
190} 216}
191 217
192static void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock) 218void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
193{ 219{
194 unsigned long flags; 220 unsigned long flags;
195 221
222 /* Offloaded maps are removed from the IDR store when their device
223 * disappears - even if someone holds an fd to them they are unusable,
224 * the memory is gone, all ops will fail; they are simply waiting for
225 * refcnt to drop to be freed.
226 */
227 if (!map->id)
228 return;
229
196 if (do_idr_lock) 230 if (do_idr_lock)
197 spin_lock_irqsave(&map_idr_lock, flags); 231 spin_lock_irqsave(&map_idr_lock, flags);
198 else 232 else
199 __acquire(&map_idr_lock); 233 __acquire(&map_idr_lock);
200 234
201 idr_remove(&map_idr, map->id); 235 idr_remove(&map_idr, map->id);
236 map->id = 0;
202 237
203 if (do_idr_lock) 238 if (do_idr_lock)
204 spin_unlock_irqrestore(&map_idr_lock, flags); 239 spin_unlock_irqrestore(&map_idr_lock, flags);
@@ -378,7 +413,7 @@ static int bpf_obj_name_cpy(char *dst, const char *src)
378 return 0; 413 return 0;
379} 414}
380 415
381#define BPF_MAP_CREATE_LAST_FIELD map_name 416#define BPF_MAP_CREATE_LAST_FIELD map_ifindex
382/* called via syscall */ 417/* called via syscall */
383static int map_create(union bpf_attr *attr) 418static int map_create(union bpf_attr *attr)
384{ 419{
@@ -566,8 +601,10 @@ static int map_lookup_elem(union bpf_attr *attr)
566 if (!value) 601 if (!value)
567 goto free_key; 602 goto free_key;
568 603
569 if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || 604 if (bpf_map_is_dev_bound(map)) {
570 map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { 605 err = bpf_map_offload_lookup_elem(map, key, value);
606 } else if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
607 map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
571 err = bpf_percpu_hash_copy(map, key, value); 608 err = bpf_percpu_hash_copy(map, key, value);
572 } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { 609 } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
573 err = bpf_percpu_array_copy(map, key, value); 610 err = bpf_percpu_array_copy(map, key, value);
@@ -654,7 +691,10 @@ static int map_update_elem(union bpf_attr *attr)
654 goto free_value; 691 goto free_value;
655 692
656 /* Need to create a kthread, thus must support schedule */ 693 /* Need to create a kthread, thus must support schedule */
657 if (map->map_type == BPF_MAP_TYPE_CPUMAP) { 694 if (bpf_map_is_dev_bound(map)) {
695 err = bpf_map_offload_update_elem(map, key, value, attr->flags);
696 goto out;
697 } else if (map->map_type == BPF_MAP_TYPE_CPUMAP) {
658 err = map->ops->map_update_elem(map, key, value, attr->flags); 698 err = map->ops->map_update_elem(map, key, value, attr->flags);
659 goto out; 699 goto out;
660 } 700 }
@@ -669,10 +709,7 @@ static int map_update_elem(union bpf_attr *attr)
669 err = bpf_percpu_hash_update(map, key, value, attr->flags); 709 err = bpf_percpu_hash_update(map, key, value, attr->flags);
670 } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { 710 } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
671 err = bpf_percpu_array_update(map, key, value, attr->flags); 711 err = bpf_percpu_array_update(map, key, value, attr->flags);
672 } else if (map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || 712 } else if (IS_FD_ARRAY(map)) {
673 map->map_type == BPF_MAP_TYPE_PROG_ARRAY ||
674 map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY ||
675 map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) {
676 rcu_read_lock(); 713 rcu_read_lock();
677 err = bpf_fd_array_map_update_elem(map, f.file, key, value, 714 err = bpf_fd_array_map_update_elem(map, f.file, key, value,
678 attr->flags); 715 attr->flags);
@@ -731,6 +768,11 @@ static int map_delete_elem(union bpf_attr *attr)
731 goto err_put; 768 goto err_put;
732 } 769 }
733 770
771 if (bpf_map_is_dev_bound(map)) {
772 err = bpf_map_offload_delete_elem(map, key);
773 goto out;
774 }
775
734 preempt_disable(); 776 preempt_disable();
735 __this_cpu_inc(bpf_prog_active); 777 __this_cpu_inc(bpf_prog_active);
736 rcu_read_lock(); 778 rcu_read_lock();
@@ -738,7 +780,7 @@ static int map_delete_elem(union bpf_attr *attr)
738 rcu_read_unlock(); 780 rcu_read_unlock();
739 __this_cpu_dec(bpf_prog_active); 781 __this_cpu_dec(bpf_prog_active);
740 preempt_enable(); 782 preempt_enable();
741 783out:
742 if (!err) 784 if (!err)
743 trace_bpf_map_delete_elem(map, ufd, key); 785 trace_bpf_map_delete_elem(map, ufd, key);
744 kfree(key); 786 kfree(key);
@@ -788,9 +830,15 @@ static int map_get_next_key(union bpf_attr *attr)
788 if (!next_key) 830 if (!next_key)
789 goto free_key; 831 goto free_key;
790 832
833 if (bpf_map_is_dev_bound(map)) {
834 err = bpf_map_offload_get_next_key(map, key, next_key);
835 goto out;
836 }
837
791 rcu_read_lock(); 838 rcu_read_lock();
792 err = map->ops->map_get_next_key(map, key, next_key); 839 err = map->ops->map_get_next_key(map, key, next_key);
793 rcu_read_unlock(); 840 rcu_read_unlock();
841out:
794 if (err) 842 if (err)
795 goto free_next_key; 843 goto free_next_key;
796 844
@@ -905,9 +953,13 @@ static int bpf_prog_alloc_id(struct bpf_prog *prog)
905 return id > 0 ? 0 : id; 953 return id > 0 ? 0 : id;
906} 954}
907 955
908static void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock) 956void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock)
909{ 957{
910 /* cBPF to eBPF migrations are currently not in the idr store. */ 958 /* cBPF to eBPF migrations are currently not in the idr store.
959 * Offloaded programs are removed from the store when their device
960 * disappears - even if someone grabs an fd to them they are unusable,
961 * simply waiting for refcnt to drop to be freed.
962 */
911 if (!prog->aux->id) 963 if (!prog->aux->id)
912 return; 964 return;
913 965
@@ -917,6 +969,7 @@ static void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock)
917 __acquire(&prog_idr_lock); 969 __acquire(&prog_idr_lock);
918 970
919 idr_remove(&prog_idr, prog->aux->id); 971 idr_remove(&prog_idr, prog->aux->id);
972 prog->aux->id = 0;
920 973
921 if (do_idr_lock) 974 if (do_idr_lock)
922 spin_unlock_bh(&prog_idr_lock); 975 spin_unlock_bh(&prog_idr_lock);
@@ -937,10 +990,16 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu)
937static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) 990static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
938{ 991{
939 if (atomic_dec_and_test(&prog->aux->refcnt)) { 992 if (atomic_dec_and_test(&prog->aux->refcnt)) {
993 int i;
994
940 trace_bpf_prog_put_rcu(prog); 995 trace_bpf_prog_put_rcu(prog);
941 /* bpf_prog_free_id() must be called first */ 996 /* bpf_prog_free_id() must be called first */
942 bpf_prog_free_id(prog, do_idr_lock); 997 bpf_prog_free_id(prog, do_idr_lock);
998
999 for (i = 0; i < prog->aux->func_cnt; i++)
1000 bpf_prog_kallsyms_del(prog->aux->func[i]);
943 bpf_prog_kallsyms_del(prog); 1001 bpf_prog_kallsyms_del(prog);
1002
944 call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); 1003 call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
945 } 1004 }
946} 1005}
@@ -1151,6 +1210,8 @@ static int bpf_prog_load(union bpf_attr *attr)
1151 if (!prog) 1210 if (!prog)
1152 return -ENOMEM; 1211 return -ENOMEM;
1153 1212
1213 prog->aux->offload_requested = !!attr->prog_ifindex;
1214
1154 err = security_bpf_prog_alloc(prog->aux); 1215 err = security_bpf_prog_alloc(prog->aux);
1155 if (err) 1216 if (err)
1156 goto free_prog_nouncharge; 1217 goto free_prog_nouncharge;
@@ -1172,7 +1233,7 @@ static int bpf_prog_load(union bpf_attr *attr)
1172 atomic_set(&prog->aux->refcnt, 1); 1233 atomic_set(&prog->aux->refcnt, 1);
1173 prog->gpl_compatible = is_gpl ? 1 : 0; 1234 prog->gpl_compatible = is_gpl ? 1 : 0;
1174 1235
1175 if (attr->prog_ifindex) { 1236 if (bpf_prog_is_dev_bound(prog->aux)) {
1176 err = bpf_prog_offload_init(prog, attr); 1237 err = bpf_prog_offload_init(prog, attr);
1177 if (err) 1238 if (err)
1178 goto free_prog; 1239 goto free_prog;
@@ -1194,7 +1255,8 @@ static int bpf_prog_load(union bpf_attr *attr)
1194 goto free_used_maps; 1255 goto free_used_maps;
1195 1256
1196 /* eBPF program is ready to be JITed */ 1257 /* eBPF program is ready to be JITed */
1197 prog = bpf_prog_select_runtime(prog, &err); 1258 if (!prog->bpf_func)
1259 prog = bpf_prog_select_runtime(prog, &err);
1198 if (err < 0) 1260 if (err < 0)
1199 goto free_used_maps; 1261 goto free_used_maps;
1200 1262
@@ -1439,6 +1501,8 @@ static int bpf_prog_test_run(const union bpf_attr *attr,
1439 struct bpf_prog *prog; 1501 struct bpf_prog *prog;
1440 int ret = -ENOTSUPP; 1502 int ret = -ENOTSUPP;
1441 1503
1504 if (!capable(CAP_SYS_ADMIN))
1505 return -EPERM;
1442 if (CHECK_ATTR(BPF_PROG_TEST_RUN)) 1506 if (CHECK_ATTR(BPF_PROG_TEST_RUN))
1443 return -EINVAL; 1507 return -EINVAL;
1444 1508
@@ -1551,6 +1615,67 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
1551 return fd; 1615 return fd;
1552} 1616}
1553 1617
1618static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog,
1619 unsigned long addr)
1620{
1621 int i;
1622
1623 for (i = 0; i < prog->aux->used_map_cnt; i++)
1624 if (prog->aux->used_maps[i] == (void *)addr)
1625 return prog->aux->used_maps[i];
1626 return NULL;
1627}
1628
1629static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog)
1630{
1631 const struct bpf_map *map;
1632 struct bpf_insn *insns;
1633 u64 imm;
1634 int i;
1635
1636 insns = kmemdup(prog->insnsi, bpf_prog_insn_size(prog),
1637 GFP_USER);
1638 if (!insns)
1639 return insns;
1640
1641 for (i = 0; i < prog->len; i++) {
1642 if (insns[i].code == (BPF_JMP | BPF_TAIL_CALL)) {
1643 insns[i].code = BPF_JMP | BPF_CALL;
1644 insns[i].imm = BPF_FUNC_tail_call;
1645 /* fall-through */
1646 }
1647 if (insns[i].code == (BPF_JMP | BPF_CALL) ||
1648 insns[i].code == (BPF_JMP | BPF_CALL_ARGS)) {
1649 if (insns[i].code == (BPF_JMP | BPF_CALL_ARGS))
1650 insns[i].code = BPF_JMP | BPF_CALL;
1651 if (!bpf_dump_raw_ok())
1652 insns[i].imm = 0;
1653 continue;
1654 }
1655
1656 if (insns[i].code != (BPF_LD | BPF_IMM | BPF_DW))
1657 continue;
1658
1659 imm = ((u64)insns[i + 1].imm << 32) | (u32)insns[i].imm;
1660 map = bpf_map_from_imm(prog, imm);
1661 if (map) {
1662 insns[i].src_reg = BPF_PSEUDO_MAP_FD;
1663 insns[i].imm = map->id;
1664 insns[i + 1].imm = 0;
1665 continue;
1666 }
1667
1668 if (!bpf_dump_raw_ok() &&
1669 imm == (unsigned long)prog->aux) {
1670 insns[i].imm = 0;
1671 insns[i + 1].imm = 0;
1672 continue;
1673 }
1674 }
1675
1676 return insns;
1677}
1678
1554static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, 1679static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
1555 const union bpf_attr *attr, 1680 const union bpf_attr *attr,
1556 union bpf_attr __user *uattr) 1681 union bpf_attr __user *uattr)
@@ -1598,24 +1723,51 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
1598 goto done; 1723 goto done;
1599 } 1724 }
1600 1725
1601 ulen = info.jited_prog_len;
1602 info.jited_prog_len = prog->jited_len;
1603 if (info.jited_prog_len && ulen) {
1604 uinsns = u64_to_user_ptr(info.jited_prog_insns);
1605 ulen = min_t(u32, info.jited_prog_len, ulen);
1606 if (copy_to_user(uinsns, prog->bpf_func, ulen))
1607 return -EFAULT;
1608 }
1609
1610 ulen = info.xlated_prog_len; 1726 ulen = info.xlated_prog_len;
1611 info.xlated_prog_len = bpf_prog_insn_size(prog); 1727 info.xlated_prog_len = bpf_prog_insn_size(prog);
1612 if (info.xlated_prog_len && ulen) { 1728 if (info.xlated_prog_len && ulen) {
1729 struct bpf_insn *insns_sanitized;
1730 bool fault;
1731
1732 if (prog->blinded && !bpf_dump_raw_ok()) {
1733 info.xlated_prog_insns = 0;
1734 goto done;
1735 }
1736 insns_sanitized = bpf_insn_prepare_dump(prog);
1737 if (!insns_sanitized)
1738 return -ENOMEM;
1613 uinsns = u64_to_user_ptr(info.xlated_prog_insns); 1739 uinsns = u64_to_user_ptr(info.xlated_prog_insns);
1614 ulen = min_t(u32, info.xlated_prog_len, ulen); 1740 ulen = min_t(u32, info.xlated_prog_len, ulen);
1615 if (copy_to_user(uinsns, prog->insnsi, ulen)) 1741 fault = copy_to_user(uinsns, insns_sanitized, ulen);
1742 kfree(insns_sanitized);
1743 if (fault)
1616 return -EFAULT; 1744 return -EFAULT;
1617 } 1745 }
1618 1746
1747 if (bpf_prog_is_dev_bound(prog->aux)) {
1748 err = bpf_prog_offload_info_fill(&info, prog);
1749 if (err)
1750 return err;
1751 goto done;
1752 }
1753
1754 /* NOTE: the following code is supposed to be skipped for offload.
1755 * bpf_prog_offload_info_fill() is the place to fill similar fields
1756 * for offload.
1757 */
1758 ulen = info.jited_prog_len;
1759 info.jited_prog_len = prog->jited_len;
1760 if (info.jited_prog_len && ulen) {
1761 if (bpf_dump_raw_ok()) {
1762 uinsns = u64_to_user_ptr(info.jited_prog_insns);
1763 ulen = min_t(u32, info.jited_prog_len, ulen);
1764 if (copy_to_user(uinsns, prog->bpf_func, ulen))
1765 return -EFAULT;
1766 } else {
1767 info.jited_prog_insns = 0;
1768 }
1769 }
1770
1619done: 1771done:
1620 if (copy_to_user(uinfo, &info, info_len) || 1772 if (copy_to_user(uinfo, &info, info_len) ||
1621 put_user(info_len, &uattr->info.info_len)) 1773 put_user(info_len, &uattr->info.info_len))
@@ -1646,6 +1798,12 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map,
1646 info.map_flags = map->map_flags; 1798 info.map_flags = map->map_flags;
1647 memcpy(info.name, map->name, sizeof(map->name)); 1799 memcpy(info.name, map->name, sizeof(map->name));
1648 1800
1801 if (bpf_map_is_dev_bound(map)) {
1802 err = bpf_map_offload_info_fill(&info, map);
1803 if (err)
1804 return err;
1805 }
1806
1649 if (copy_to_user(uinfo, &info, info_len) || 1807 if (copy_to_user(uinfo, &info, info_len) ||
1650 put_user(info_len, &uattr->info.info_len)) 1808 put_user(info_len, &uattr->info.info_len))
1651 return -EFAULT; 1809 return -EFAULT;
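
Tying the syscall.c pieces together: BPF_MAP_CREATE_LAST_FIELD now ends at map_ifindex, find_and_alloc_map() runs the type's .map_alloc_check and then, when an ifindex was supplied, allocates through bpf_map_offload_ops, and the element paths branch to the bpf_map_offload_*() helpers for dev-bound maps. From user space the whole feature boils down to one extra attribute. A rough sketch with the raw syscall follows; it assumes the named interface is driven by a device that implements the map-offload ndo_bpf commands, otherwise the create fails (e.g. -EOPNOTSUPP or -EINVAL), and error handling is trimmed.

#include <linux/bpf.h>
#include <net/if.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

static int create_offloaded_hash_map(const char *ifname)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.map_type	 = BPF_MAP_TYPE_HASH;
	attr.key_size	 = 4;
	attr.value_size  = 8;
	attr.max_entries = 1024;
	/* non-zero ifindex requests a device-bound map; 0 means host map */
	attr.map_ifindex = if_nametoindex(ifname);
	strncpy(attr.map_name, "offloaded_map", sizeof(attr.map_name) - 1);

	return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
}
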
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 13551e623501..5fb69a85d967 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -20,6 +20,8 @@
20#include <linux/file.h> 20#include <linux/file.h>
21#include <linux/vmalloc.h> 21#include <linux/vmalloc.h>
22#include <linux/stringify.h> 22#include <linux/stringify.h>
23#include <linux/bsearch.h>
24#include <linux/sort.h>
23 25
24#include "disasm.h" 26#include "disasm.h"
25 27
@@ -167,11 +169,11 @@ struct bpf_call_arg_meta {
167static DEFINE_MUTEX(bpf_verifier_lock); 169static DEFINE_MUTEX(bpf_verifier_lock);
168 170
169/* log_level controls verbosity level of eBPF verifier. 171/* log_level controls verbosity level of eBPF verifier.
170 * verbose() is used to dump the verification trace to the log, so the user 172 * bpf_verifier_log_write() is used to dump the verification trace to the log,
171 * can figure out what's wrong with the program 173 * so the user can figure out what's wrong with the program
172 */ 174 */
173static __printf(2, 3) void verbose(struct bpf_verifier_env *env, 175__printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env,
174 const char *fmt, ...) 176 const char *fmt, ...)
175{ 177{
176 struct bpf_verifer_log *log = &env->log; 178 struct bpf_verifer_log *log = &env->log;
177 unsigned int n; 179 unsigned int n;
@@ -195,6 +197,14 @@ static __printf(2, 3) void verbose(struct bpf_verifier_env *env,
195 else 197 else
196 log->ubuf = NULL; 198 log->ubuf = NULL;
197} 199}
200EXPORT_SYMBOL_GPL(bpf_verifier_log_write);
201/* Historically bpf_verifier_log_write was called verbose, but the name was too
202 * generic for symbol export. The function was renamed, but not the calls in
203 * the verifier to avoid complicating backports. Hence the alias below.
204 */
205static __printf(2, 3) void verbose(struct bpf_verifier_env *env,
206 const char *fmt, ...)
207 __attribute__((alias("bpf_verifier_log_write")));
198 208
199static bool type_is_pkt_pointer(enum bpf_reg_type type) 209static bool type_is_pkt_pointer(enum bpf_reg_type type)
200{ 210{
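
The alias above keeps the verifier's terse internal name while exporting a descriptive symbol for drivers that need to print into the same verifier log. The same pattern fits any compile-time rename where touching every call site would churn backports; a generic sketch with made-up names ("mydrv_log_write", "dbg", struct mydrv_log):

/* the new, exported spelling is the real definition */
__printf(2, 3) void mydrv_log_write(struct mydrv_log *log, const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	/* ... append the formatted message to @log ... */
	va_end(args);
}
EXPORT_SYMBOL_GPL(mydrv_log_write);

/* old internal spelling, kept so existing callers build unchanged */
static __printf(2, 3) void dbg(struct mydrv_log *log, const char *fmt, ...)
	__attribute__((alias("mydrv_log_write")));
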
@@ -216,23 +226,48 @@ static const char * const reg_type_str[] = {
216 [PTR_TO_PACKET_END] = "pkt_end", 226 [PTR_TO_PACKET_END] = "pkt_end",
217}; 227};
218 228
229static void print_liveness(struct bpf_verifier_env *env,
230 enum bpf_reg_liveness live)
231{
232 if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN))
233 verbose(env, "_");
234 if (live & REG_LIVE_READ)
235 verbose(env, "r");
236 if (live & REG_LIVE_WRITTEN)
237 verbose(env, "w");
238}
239
240static struct bpf_func_state *func(struct bpf_verifier_env *env,
241 const struct bpf_reg_state *reg)
242{
243 struct bpf_verifier_state *cur = env->cur_state;
244
245 return cur->frame[reg->frameno];
246}
247
219static void print_verifier_state(struct bpf_verifier_env *env, 248static void print_verifier_state(struct bpf_verifier_env *env,
220 struct bpf_verifier_state *state) 249 const struct bpf_func_state *state)
221{ 250{
222 struct bpf_reg_state *reg; 251 const struct bpf_reg_state *reg;
223 enum bpf_reg_type t; 252 enum bpf_reg_type t;
224 int i; 253 int i;
225 254
255 if (state->frameno)
256 verbose(env, " frame%d:", state->frameno);
226 for (i = 0; i < MAX_BPF_REG; i++) { 257 for (i = 0; i < MAX_BPF_REG; i++) {
227 reg = &state->regs[i]; 258 reg = &state->regs[i];
228 t = reg->type; 259 t = reg->type;
229 if (t == NOT_INIT) 260 if (t == NOT_INIT)
230 continue; 261 continue;
231 verbose(env, " R%d=%s", i, reg_type_str[t]); 262 verbose(env, " R%d", i);
263 print_liveness(env, reg->live);
264 verbose(env, "=%s", reg_type_str[t]);
232 if ((t == SCALAR_VALUE || t == PTR_TO_STACK) && 265 if ((t == SCALAR_VALUE || t == PTR_TO_STACK) &&
233 tnum_is_const(reg->var_off)) { 266 tnum_is_const(reg->var_off)) {
234 /* reg->off should be 0 for SCALAR_VALUE */ 267 /* reg->off should be 0 for SCALAR_VALUE */
235 verbose(env, "%lld", reg->var_off.value + reg->off); 268 verbose(env, "%lld", reg->var_off.value + reg->off);
269 if (t == PTR_TO_STACK)
270 verbose(env, ",call_%d", func(env, reg)->callsite);
236 } else { 271 } else {
237 verbose(env, "(id=%d", reg->id); 272 verbose(env, "(id=%d", reg->id);
238 if (t != SCALAR_VALUE) 273 if (t != SCALAR_VALUE)
@@ -277,16 +312,21 @@ static void print_verifier_state(struct bpf_verifier_env *env,
277 } 312 }
278 } 313 }
279 for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { 314 for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
280 if (state->stack[i].slot_type[0] == STACK_SPILL) 315 if (state->stack[i].slot_type[0] == STACK_SPILL) {
281 verbose(env, " fp%d=%s", 316 verbose(env, " fp%d",
282 -MAX_BPF_STACK + i * BPF_REG_SIZE, 317 (-i - 1) * BPF_REG_SIZE);
318 print_liveness(env, state->stack[i].spilled_ptr.live);
319 verbose(env, "=%s",
283 reg_type_str[state->stack[i].spilled_ptr.type]); 320 reg_type_str[state->stack[i].spilled_ptr.type]);
321 }
322 if (state->stack[i].slot_type[0] == STACK_ZERO)
323 verbose(env, " fp%d=0", (-i - 1) * BPF_REG_SIZE);
284 } 324 }
285 verbose(env, "\n"); 325 verbose(env, "\n");
286} 326}
287 327
288static int copy_stack_state(struct bpf_verifier_state *dst, 328static int copy_stack_state(struct bpf_func_state *dst,
289 const struct bpf_verifier_state *src) 329 const struct bpf_func_state *src)
290{ 330{
291 if (!src->stack) 331 if (!src->stack)
292 return 0; 332 return 0;
@@ -302,13 +342,13 @@ static int copy_stack_state(struct bpf_verifier_state *dst,
302 342
303/* do_check() starts with zero-sized stack in struct bpf_verifier_state to 343/* do_check() starts with zero-sized stack in struct bpf_verifier_state to
304 * make it consume minimal amount of memory. check_stack_write() access from 344 * make it consume minimal amount of memory. check_stack_write() access from
305 * the program calls into realloc_verifier_state() to grow the stack size. 345 * the program calls into realloc_func_state() to grow the stack size.
306 * Note there is a non-zero 'parent' pointer inside bpf_verifier_state 346 * Note there is a non-zero 'parent' pointer inside bpf_verifier_state
307 * which this function copies over. It points to previous bpf_verifier_state 347 * which this function copies over. It points to previous bpf_verifier_state
308 * which is never reallocated 348 * which is never reallocated
309 */ 349 */
310static int realloc_verifier_state(struct bpf_verifier_state *state, int size, 350static int realloc_func_state(struct bpf_func_state *state, int size,
311 bool copy_old) 351 bool copy_old)
312{ 352{
313 u32 old_size = state->allocated_stack; 353 u32 old_size = state->allocated_stack;
314 struct bpf_stack_state *new_stack; 354 struct bpf_stack_state *new_stack;
@@ -341,10 +381,23 @@ static int realloc_verifier_state(struct bpf_verifier_state *state, int size,
341 return 0; 381 return 0;
342} 382}
343 383
384static void free_func_state(struct bpf_func_state *state)
385{
386 if (!state)
387 return;
388 kfree(state->stack);
389 kfree(state);
390}
391
344static void free_verifier_state(struct bpf_verifier_state *state, 392static void free_verifier_state(struct bpf_verifier_state *state,
345 bool free_self) 393 bool free_self)
346{ 394{
347 kfree(state->stack); 395 int i;
396
397 for (i = 0; i <= state->curframe; i++) {
398 free_func_state(state->frame[i]);
399 state->frame[i] = NULL;
400 }
348 if (free_self) 401 if (free_self)
349 kfree(state); 402 kfree(state);
350} 403}
@@ -352,18 +405,46 @@ static void free_verifier_state(struct bpf_verifier_state *state,
352/* copy verifier state from src to dst growing dst stack space 405/* copy verifier state from src to dst growing dst stack space
353 * when necessary to accommodate larger src stack 406 * when necessary to accommodate larger src stack
354 */ 407 */
355static int copy_verifier_state(struct bpf_verifier_state *dst, 408static int copy_func_state(struct bpf_func_state *dst,
356 const struct bpf_verifier_state *src) 409 const struct bpf_func_state *src)
357{ 410{
358 int err; 411 int err;
359 412
360 err = realloc_verifier_state(dst, src->allocated_stack, false); 413 err = realloc_func_state(dst, src->allocated_stack, false);
361 if (err) 414 if (err)
362 return err; 415 return err;
363 memcpy(dst, src, offsetof(struct bpf_verifier_state, allocated_stack)); 416 memcpy(dst, src, offsetof(struct bpf_func_state, allocated_stack));
364 return copy_stack_state(dst, src); 417 return copy_stack_state(dst, src);
365} 418}
366 419
420static int copy_verifier_state(struct bpf_verifier_state *dst_state,
421 const struct bpf_verifier_state *src)
422{
423 struct bpf_func_state *dst;
424 int i, err;
425
426 /* if dst has more stack frames then src frame, free them */
427 for (i = src->curframe + 1; i <= dst_state->curframe; i++) {
428 free_func_state(dst_state->frame[i]);
429 dst_state->frame[i] = NULL;
430 }
431 dst_state->curframe = src->curframe;
432 dst_state->parent = src->parent;
433 for (i = 0; i <= src->curframe; i++) {
434 dst = dst_state->frame[i];
435 if (!dst) {
436 dst = kzalloc(sizeof(*dst), GFP_KERNEL);
437 if (!dst)
438 return -ENOMEM;
439 dst_state->frame[i] = dst;
440 }
441 err = copy_func_state(dst, src->frame[i]);
442 if (err)
443 return err;
444 }
445 return 0;
446}
447
367static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, 448static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx,
368 int *insn_idx) 449 int *insn_idx)
369{ 450{
@@ -416,6 +497,8 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
416 } 497 }
417 return &elem->st; 498 return &elem->st;
418err: 499err:
500 free_verifier_state(env->cur_state, true);
501 env->cur_state = NULL;
419 /* pop all elements and return */ 502 /* pop all elements and return */
420 while (!pop_stack(env, NULL, NULL)); 503 while (!pop_stack(env, NULL, NULL));
421 return NULL; 504 return NULL;
@@ -425,6 +508,10 @@ err:
425static const int caller_saved[CALLER_SAVED_REGS] = { 508static const int caller_saved[CALLER_SAVED_REGS] = {
426 BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 509 BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5
427}; 510};
511#define CALLEE_SAVED_REGS 5
512static const int callee_saved[CALLEE_SAVED_REGS] = {
513 BPF_REG_6, BPF_REG_7, BPF_REG_8, BPF_REG_9
514};
428 515
429static void __mark_reg_not_init(struct bpf_reg_state *reg); 516static void __mark_reg_not_init(struct bpf_reg_state *reg);
430 517
@@ -449,6 +536,13 @@ static void __mark_reg_known_zero(struct bpf_reg_state *reg)
449 __mark_reg_known(reg, 0); 536 __mark_reg_known(reg, 0);
450} 537}
451 538
539static void __mark_reg_const_zero(struct bpf_reg_state *reg)
540{
541 __mark_reg_known(reg, 0);
542 reg->off = 0;
543 reg->type = SCALAR_VALUE;
544}
545
452static void mark_reg_known_zero(struct bpf_verifier_env *env, 546static void mark_reg_known_zero(struct bpf_verifier_env *env,
453 struct bpf_reg_state *regs, u32 regno) 547 struct bpf_reg_state *regs, u32 regno)
454{ 548{
@@ -560,6 +654,7 @@ static void __mark_reg_unknown(struct bpf_reg_state *reg)
560 reg->id = 0; 654 reg->id = 0;
561 reg->off = 0; 655 reg->off = 0;
562 reg->var_off = tnum_unknown; 656 reg->var_off = tnum_unknown;
657 reg->frameno = 0;
563 __mark_reg_unbounded(reg); 658 __mark_reg_unbounded(reg);
564} 659}
565 660
@@ -568,8 +663,8 @@ static void mark_reg_unknown(struct bpf_verifier_env *env,
568{ 663{
569 if (WARN_ON(regno >= MAX_BPF_REG)) { 664 if (WARN_ON(regno >= MAX_BPF_REG)) {
570 verbose(env, "mark_reg_unknown(regs, %u)\n", regno); 665 verbose(env, "mark_reg_unknown(regs, %u)\n", regno);
571 /* Something bad happened, let's kill all regs */ 666 /* Something bad happened, let's kill all regs except FP */
572 for (regno = 0; regno < MAX_BPF_REG; regno++) 667 for (regno = 0; regno < BPF_REG_FP; regno++)
573 __mark_reg_not_init(regs + regno); 668 __mark_reg_not_init(regs + regno);
574 return; 669 return;
575 } 670 }
@@ -587,8 +682,8 @@ static void mark_reg_not_init(struct bpf_verifier_env *env,
587{ 682{
588 if (WARN_ON(regno >= MAX_BPF_REG)) { 683 if (WARN_ON(regno >= MAX_BPF_REG)) {
589 verbose(env, "mark_reg_not_init(regs, %u)\n", regno); 684 verbose(env, "mark_reg_not_init(regs, %u)\n", regno);
590 /* Something bad happened, let's kill all regs */ 685 /* Something bad happened, let's kill all regs except FP */
591 for (regno = 0; regno < MAX_BPF_REG; regno++) 686 for (regno = 0; regno < BPF_REG_FP; regno++)
592 __mark_reg_not_init(regs + regno); 687 __mark_reg_not_init(regs + regno);
593 return; 688 return;
594 } 689 }
@@ -596,8 +691,9 @@ static void mark_reg_not_init(struct bpf_verifier_env *env,
596} 691}
597 692
598static void init_reg_state(struct bpf_verifier_env *env, 693static void init_reg_state(struct bpf_verifier_env *env,
599 struct bpf_reg_state *regs) 694 struct bpf_func_state *state)
600{ 695{
696 struct bpf_reg_state *regs = state->regs;
601 int i; 697 int i;
602 698
603 for (i = 0; i < MAX_BPF_REG; i++) { 699 for (i = 0; i < MAX_BPF_REG; i++) {
@@ -608,41 +704,218 @@ static void init_reg_state(struct bpf_verifier_env *env,
608 /* frame pointer */ 704 /* frame pointer */
609 regs[BPF_REG_FP].type = PTR_TO_STACK; 705 regs[BPF_REG_FP].type = PTR_TO_STACK;
610 mark_reg_known_zero(env, regs, BPF_REG_FP); 706 mark_reg_known_zero(env, regs, BPF_REG_FP);
707 regs[BPF_REG_FP].frameno = state->frameno;
611 708
612 /* 1st arg to a function */ 709 /* 1st arg to a function */
613 regs[BPF_REG_1].type = PTR_TO_CTX; 710 regs[BPF_REG_1].type = PTR_TO_CTX;
614 mark_reg_known_zero(env, regs, BPF_REG_1); 711 mark_reg_known_zero(env, regs, BPF_REG_1);
615} 712}
616 713
714#define BPF_MAIN_FUNC (-1)
715static void init_func_state(struct bpf_verifier_env *env,
716 struct bpf_func_state *state,
717 int callsite, int frameno, int subprogno)
718{
719 state->callsite = callsite;
720 state->frameno = frameno;
721 state->subprogno = subprogno;
722 init_reg_state(env, state);
723}
724
617enum reg_arg_type { 725enum reg_arg_type {
618 SRC_OP, /* register is used as source operand */ 726 SRC_OP, /* register is used as source operand */
619 DST_OP, /* register is used as destination operand */ 727 DST_OP, /* register is used as destination operand */
620 DST_OP_NO_MARK /* same as above, check only, don't mark */ 728 DST_OP_NO_MARK /* same as above, check only, don't mark */
621}; 729};
622 730
623static void mark_reg_read(const struct bpf_verifier_state *state, u32 regno) 731static int cmp_subprogs(const void *a, const void *b)
732{
733 return *(int *)a - *(int *)b;
734}
735
736static int find_subprog(struct bpf_verifier_env *env, int off)
624{ 737{
625 struct bpf_verifier_state *parent = state->parent; 738 u32 *p;
739
740 p = bsearch(&off, env->subprog_starts, env->subprog_cnt,
741 sizeof(env->subprog_starts[0]), cmp_subprogs);
742 if (!p)
743 return -ENOENT;
744 return p - env->subprog_starts;
745
746}
747
748static int add_subprog(struct bpf_verifier_env *env, int off)
749{
750 int insn_cnt = env->prog->len;
751 int ret;
752
753 if (off >= insn_cnt || off < 0) {
754 verbose(env, "call to invalid destination\n");
755 return -EINVAL;
756 }
757 ret = find_subprog(env, off);
758 if (ret >= 0)
759 return 0;
760 if (env->subprog_cnt >= BPF_MAX_SUBPROGS) {
761 verbose(env, "too many subprograms\n");
762 return -E2BIG;
763 }
764 env->subprog_starts[env->subprog_cnt++] = off;
765 sort(env->subprog_starts, env->subprog_cnt,
766 sizeof(env->subprog_starts[0]), cmp_subprogs, NULL);
767 return 0;
768}
769
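As a rough standalone model of what find_subprog()/add_subprog() maintain, a sorted array of subprogram start offsets plus a bsearch() lookup, the sketch below is ordinary userspace C with made-up names and bounds, not kernel code:

#include <stdio.h>
#include <stdlib.h>

#define MAX_SUBPROGS 256

static int starts[MAX_SUBPROGS];
static int nr_starts;

static int cmp_int(const void *a, const void *b)
{
        return *(const int *)a - *(const int *)b;
}

/* return index of the subprog starting at 'off', or -1 if none */
static int find_start(int off)
{
        int *p = bsearch(&off, starts, nr_starts, sizeof(starts[0]), cmp_int);

        return p ? (int)(p - starts) : -1;
}

/* record a new subprog start, keeping the array sorted */
static int add_start(int off)
{
        if (find_start(off) >= 0)
                return 0;               /* already known */
        if (nr_starts >= MAX_SUBPROGS)
                return -1;              /* too many subprograms */
        starts[nr_starts++] = off;
        qsort(starts, nr_starts, sizeof(starts[0]), cmp_int);
        return 0;
}

int main(void)
{
        add_start(40);
        add_start(12);
        add_start(40);                  /* duplicate is ignored */
        printf("subprog at 12 -> index %d\n", find_start(12));
        printf("subprog at 13 -> index %d\n", find_start(13));
        return 0;
}

Keeping the array fully sorted on every insert is cheap here because the number of subprograms is small and bounded.
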
770static int check_subprogs(struct bpf_verifier_env *env)
771{
772 int i, ret, subprog_start, subprog_end, off, cur_subprog = 0;
773 struct bpf_insn *insn = env->prog->insnsi;
774 int insn_cnt = env->prog->len;
775
776 /* determine subprog starts. The end is one before the next starts */
777 for (i = 0; i < insn_cnt; i++) {
778 if (insn[i].code != (BPF_JMP | BPF_CALL))
779 continue;
780 if (insn[i].src_reg != BPF_PSEUDO_CALL)
781 continue;
782 if (!env->allow_ptr_leaks) {
783 verbose(env, "function calls to other bpf functions are allowed for root only\n");
784 return -EPERM;
785 }
786 if (bpf_prog_is_dev_bound(env->prog->aux)) {
787 verbose(env, "function calls in offloaded programs are not supported yet\n");
788 return -EINVAL;
789 }
790 ret = add_subprog(env, i + insn[i].imm + 1);
791 if (ret < 0)
792 return ret;
793 }
794
795 if (env->log.level > 1)
796 for (i = 0; i < env->subprog_cnt; i++)
797 verbose(env, "func#%d @%d\n", i, env->subprog_starts[i]);
798
799 /* now check that all jumps are within the same subprog */
800 subprog_start = 0;
801 if (env->subprog_cnt == cur_subprog)
802 subprog_end = insn_cnt;
803 else
804 subprog_end = env->subprog_starts[cur_subprog++];
805 for (i = 0; i < insn_cnt; i++) {
806 u8 code = insn[i].code;
807
808 if (BPF_CLASS(code) != BPF_JMP)
809 goto next;
810 if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL)
811 goto next;
812 off = i + insn[i].off + 1;
813 if (off < subprog_start || off >= subprog_end) {
814 verbose(env, "jump out of range from insn %d to %d\n", i, off);
815 return -EINVAL;
816 }
817next:
818 if (i == subprog_end - 1) {
819 /* to avoid fall-through from one subprog into another
820 * the last insn of the subprog should be either exit
821 * or unconditional jump back
822 */
823 if (code != (BPF_JMP | BPF_EXIT) &&
824 code != (BPF_JMP | BPF_JA)) {
825 verbose(env, "last insn is not an exit or jmp\n");
826 return -EINVAL;
827 }
828 subprog_start = subprog_end;
829 if (env->subprog_cnt == cur_subprog)
830 subprog_end = insn_cnt;
831 else
832 subprog_end = env->subprog_starts[cur_subprog++];
833 }
834 }
835 return 0;
836}
837
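The boundary rules that check_subprogs() enforces can be pictured on a toy instruction stream: every jump must land inside its own subprogram, and the last instruction of each subprogram must be an exit or an unconditional jump so control cannot fall through into the next function. A minimal userspace sketch of that rule, with invented opcode names:

#include <stdio.h>

enum { OP_ALU, OP_JA, OP_JMP_COND, OP_CALL, OP_EXIT };

struct insn { int op; int off; };      /* 'off' is a relative jump target */

/* check one subprog spanning [start, end): jumps must stay inside and
 * the last insn must be an exit or an unconditional jump
 */
static int check_subprog(const struct insn *insn, int start, int end)
{
        int i;

        for (i = start; i < end; i++) {
                if (insn[i].op == OP_JA || insn[i].op == OP_JMP_COND) {
                        int target = i + insn[i].off + 1;

                        if (target < start || target >= end)
                                return -1;      /* jump out of range */
                }
        }
        if (insn[end - 1].op != OP_EXIT && insn[end - 1].op != OP_JA)
                return -1;                      /* would fall through */
        return 0;
}

int main(void)
{
        /* main prog: insns 0..2, callee: insns 3..4 */
        struct insn prog[] = {
                { OP_ALU, 0 }, { OP_CALL, 2 }, { OP_EXIT, 0 },
                { OP_ALU, 0 }, { OP_EXIT, 0 },
        };

        printf("main ok: %d\n", check_subprog(prog, 0, 3) == 0);
        printf("callee ok: %d\n", check_subprog(prog, 3, 5) == 0);
        return 0;
}
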
838static
839struct bpf_verifier_state *skip_callee(struct bpf_verifier_env *env,
840 const struct bpf_verifier_state *state,
841 struct bpf_verifier_state *parent,
842 u32 regno)
843{
844 struct bpf_verifier_state *tmp = NULL;
845
846 /* 'parent' could be a state of caller and
847 * 'state' could be a state of callee. In such case
848 * parent->curframe < state->curframe
849 * and it's ok for r1 - r5 registers
850 *
851 * 'parent' could be a callee's state after it bpf_exit-ed.
852 * In such case parent->curframe > state->curframe
853 * and it's ok for r0 only
854 */
855 if (parent->curframe == state->curframe ||
856 (parent->curframe < state->curframe &&
857 regno >= BPF_REG_1 && regno <= BPF_REG_5) ||
858 (parent->curframe > state->curframe &&
859 regno == BPF_REG_0))
860 return parent;
861
862 if (parent->curframe > state->curframe &&
863 regno >= BPF_REG_6) {
864 /* for callee saved regs we have to skip the whole chain
865 * of states that belong to callee and mark as LIVE_READ
866 * the registers before the call
867 */
868 tmp = parent;
869 while (tmp && tmp->curframe != state->curframe) {
870 tmp = tmp->parent;
871 }
872 if (!tmp)
873 goto bug;
874 parent = tmp;
875 } else {
876 goto bug;
877 }
878 return parent;
879bug:
880 verbose(env, "verifier bug regno %d tmp %p\n", regno, tmp);
881 verbose(env, "regno %d parent frame %d current frame %d\n",
882 regno, parent->curframe, state->curframe);
883 return NULL;
884}
885
886static int mark_reg_read(struct bpf_verifier_env *env,
887 const struct bpf_verifier_state *state,
888 struct bpf_verifier_state *parent,
889 u32 regno)
890{
891 bool writes = parent == state->parent; /* Observe write marks */
626 892
627 if (regno == BPF_REG_FP) 893 if (regno == BPF_REG_FP)
628 /* We don't need to worry about FP liveness because it's read-only */ 894 /* We don't need to worry about FP liveness because it's read-only */
629 return; 895 return 0;
630 896
631 while (parent) { 897 while (parent) {
632 /* if read wasn't screened by an earlier write ... */ 898 /* if read wasn't screened by an earlier write ... */
633 if (state->regs[regno].live & REG_LIVE_WRITTEN) 899 if (writes && state->frame[state->curframe]->regs[regno].live & REG_LIVE_WRITTEN)
634 break; 900 break;
901 parent = skip_callee(env, state, parent, regno);
902 if (!parent)
903 return -EFAULT;
635 /* ... then we depend on parent's value */ 904 /* ... then we depend on parent's value */
636 parent->regs[regno].live |= REG_LIVE_READ; 905 parent->frame[parent->curframe]->regs[regno].live |= REG_LIVE_READ;
637 state = parent; 906 state = parent;
638 parent = state->parent; 907 parent = state->parent;
908 writes = true;
639 } 909 }
910 return 0;
640} 911}
641 912
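The read-mark propagation in mark_reg_read() is essentially a walk up the chain of parent states that stops at the first write mark. A simplified standalone model, ignoring the multi-frame skip_callee() handling and the 'writes' gating used when propagating from an equivalent state, with invented flag names:

#include <stdio.h>

#define LIVE_WRITTEN  1
#define LIVE_READ     2

struct st {
        struct st *parent;
        int live;       /* liveness of one register in this state */
};

/* propagate a read up the parent chain until an earlier write screens it */
static void mark_read(struct st *state)
{
        struct st *parent = state->parent;

        while (parent) {
                if (state->live & LIVE_WRITTEN)
                        break;                  /* read satisfied locally */
                parent->live |= LIVE_READ;      /* we depend on parent's value */
                state = parent;
                parent = state->parent;
        }
}

int main(void)
{
        struct st grand = { NULL, 0 };
        struct st parent = { &grand, LIVE_WRITTEN };    /* writes the reg */
        struct st cur = { &parent, 0 };

        mark_read(&cur);
        /* parent is marked read, but its write screens off grand */
        printf("parent read=%d grand read=%d\n",
               !!(parent.live & LIVE_READ), !!(grand.live & LIVE_READ));
        return 0;
}
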
642static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, 913static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
643 enum reg_arg_type t) 914 enum reg_arg_type t)
644{ 915{
645 struct bpf_reg_state *regs = env->cur_state->regs; 916 struct bpf_verifier_state *vstate = env->cur_state;
917 struct bpf_func_state *state = vstate->frame[vstate->curframe];
918 struct bpf_reg_state *regs = state->regs;
646 919
647 if (regno >= MAX_BPF_REG) { 920 if (regno >= MAX_BPF_REG) {
648 verbose(env, "R%d is invalid\n", regno); 921 verbose(env, "R%d is invalid\n", regno);
@@ -655,7 +928,7 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
655 verbose(env, "R%d !read_ok\n", regno); 928 verbose(env, "R%d !read_ok\n", regno);
656 return -EACCES; 929 return -EACCES;
657 } 930 }
658 mark_reg_read(env->cur_state, regno); 931 return mark_reg_read(env, vstate, vstate->parent, regno);
659 } else { 932 } else {
660 /* check whether register used as dest operand can be written to */ 933 /* check whether register used as dest operand can be written to */
661 if (regno == BPF_REG_FP) { 934 if (regno == BPF_REG_FP) {
@@ -686,17 +959,25 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
686 } 959 }
687} 960}
688 961
962/* Does this register contain a constant zero? */
963static bool register_is_null(struct bpf_reg_state *reg)
964{
965 return reg->type == SCALAR_VALUE && tnum_equals_const(reg->var_off, 0);
966}
967
689/* check_stack_read/write functions track spill/fill of registers, 968/* check_stack_read/write functions track spill/fill of registers,
690 * stack boundary and alignment are checked in check_mem_access() 969 * stack boundary and alignment are checked in check_mem_access()
691 */ 970 */
692static int check_stack_write(struct bpf_verifier_env *env, 971static int check_stack_write(struct bpf_verifier_env *env,
693 struct bpf_verifier_state *state, int off, 972 struct bpf_func_state *state, /* func where register points to */
694 int size, int value_regno) 973 int off, int size, int value_regno)
695{ 974{
975 struct bpf_func_state *cur; /* state of the current function */
696 int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err; 976 int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err;
977 enum bpf_reg_type type;
697 978
698 err = realloc_verifier_state(state, round_up(slot + 1, BPF_REG_SIZE), 979 err = realloc_func_state(state, round_up(slot + 1, BPF_REG_SIZE),
699 true); 980 true);
700 if (err) 981 if (err)
701 return err; 982 return err;
702 /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0, 983 /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0,
@@ -709,8 +990,9 @@ static int check_stack_write(struct bpf_verifier_env *env,
709 return -EACCES; 990 return -EACCES;
710 } 991 }
711 992
993 cur = env->cur_state->frame[env->cur_state->curframe];
712 if (value_regno >= 0 && 994 if (value_regno >= 0 &&
713 is_spillable_regtype(state->regs[value_regno].type)) { 995 is_spillable_regtype((type = cur->regs[value_regno].type))) {
714 996
715 /* register containing pointer is being spilled into stack */ 997 /* register containing pointer is being spilled into stack */
716 if (size != BPF_REG_SIZE) { 998 if (size != BPF_REG_SIZE) {
@@ -718,51 +1000,116 @@ static int check_stack_write(struct bpf_verifier_env *env,
718 return -EACCES; 1000 return -EACCES;
719 } 1001 }
720 1002
1003 if (state != cur && type == PTR_TO_STACK) {
1004 verbose(env, "cannot spill pointers to stack into stack frame of the caller\n");
1005 return -EINVAL;
1006 }
1007
721 /* save register state */ 1008 /* save register state */
722 state->stack[spi].spilled_ptr = state->regs[value_regno]; 1009 state->stack[spi].spilled_ptr = cur->regs[value_regno];
723 state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; 1010 state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
724 1011
725 for (i = 0; i < BPF_REG_SIZE; i++) 1012 for (i = 0; i < BPF_REG_SIZE; i++)
726 state->stack[spi].slot_type[i] = STACK_SPILL; 1013 state->stack[spi].slot_type[i] = STACK_SPILL;
727 } else { 1014 } else {
1015 u8 type = STACK_MISC;
1016
728 /* regular write of data into stack */ 1017 /* regular write of data into stack */
729 state->stack[spi].spilled_ptr = (struct bpf_reg_state) {}; 1018 state->stack[spi].spilled_ptr = (struct bpf_reg_state) {};
730 1019
1020 /* only mark the slot as written if all 8 bytes were written
1021 * otherwise read propagation may incorrectly stop too soon
1022 * when stack slots are partially written.
1023 * This heuristic means that read propagation will be
1024 * conservative, since it will add reg_live_read marks
1025 * to stack slots all the way to first state when programs
1026 * writes+reads less than 8 bytes
1027 */
1028 if (size == BPF_REG_SIZE)
1029 state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
1030
1031 /* when we zero initialize stack slots mark them as such */
1032 if (value_regno >= 0 &&
1033 register_is_null(&cur->regs[value_regno]))
1034 type = STACK_ZERO;
1035
731 for (i = 0; i < size; i++) 1036 for (i = 0; i < size; i++)
732 state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] = 1037 state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] =
733 STACK_MISC; 1038 type;
734 } 1039 }
735 return 0; 1040 return 0;
736} 1041}
737 1042
738static void mark_stack_slot_read(const struct bpf_verifier_state *state, int slot) 1043/* registers of every function are unique and mark_reg_read() propagates
1044 * the liveness in the following cases:
1045 * - from callee into caller for R1 - R5 that were used as arguments
1046 * - from caller into callee for R0 that used as result of the call
1047 * - from caller to the same caller skipping states of the callee for R6 - R9,
1048 * since R6 - R9 are callee saved by implicit function prologue and
1049 * caller's R6 != callee's R6, so when we propagate liveness up to
1050 * parent states we need to skip callee states for R6 - R9.
1051 *
1052 * stack slot marking is different, since stacks of caller and callee are
1053 * accessible in both (since caller can pass a pointer to caller's stack to
1054 * callee which can pass it to another function), hence mark_stack_slot_read()
1055 * has to propagate the stack liveness to all parent states at given frame number.
1056 * Consider code:
1057 * f1() {
1058 * ptr = fp - 8;
1059 * *ptr = ctx;
1060 * call f2 {
1061 * .. = *ptr;
1062 * }
1063 * .. = *ptr;
1064 * }
1065 * First *ptr is reading from f1's stack and mark_stack_slot_read() has
1066 * to mark liveness at the f1's frame and not f2's frame.
1067 * Second *ptr is also reading from f1's stack and mark_stack_slot_read() has
1068 * to propagate liveness to f2 states at f1's frame level and further into
1069 * f1 states at f1's frame level until write into that stack slot
1070 */
1071static void mark_stack_slot_read(struct bpf_verifier_env *env,
1072 const struct bpf_verifier_state *state,
1073 struct bpf_verifier_state *parent,
1074 int slot, int frameno)
739{ 1075{
740 struct bpf_verifier_state *parent = state->parent; 1076 bool writes = parent == state->parent; /* Observe write marks */
741 1077
742 while (parent) { 1078 while (parent) {
1079 if (parent->frame[frameno]->allocated_stack <= slot * BPF_REG_SIZE)
1080 /* since LIVE_WRITTEN mark is only done for full 8-byte
1081 * write the read marks are conservative and parent
1082 * state may not even have the stack allocated. In such case
1083 * end the propagation, since the loop reached beginning
1084 * of the function
1085 */
1086 break;
743 /* if read wasn't screened by an earlier write ... */ 1087 /* if read wasn't screened by an earlier write ... */
744 if (state->stack[slot].spilled_ptr.live & REG_LIVE_WRITTEN) 1088 if (writes && state->frame[frameno]->stack[slot].spilled_ptr.live & REG_LIVE_WRITTEN)
745 break; 1089 break;
746 /* ... then we depend on parent's value */ 1090 /* ... then we depend on parent's value */
747 parent->stack[slot].spilled_ptr.live |= REG_LIVE_READ; 1091 parent->frame[frameno]->stack[slot].spilled_ptr.live |= REG_LIVE_READ;
748 state = parent; 1092 state = parent;
749 parent = state->parent; 1093 parent = state->parent;
1094 writes = true;
750 } 1095 }
751} 1096}
752 1097
753static int check_stack_read(struct bpf_verifier_env *env, 1098static int check_stack_read(struct bpf_verifier_env *env,
754 struct bpf_verifier_state *state, int off, int size, 1099 struct bpf_func_state *reg_state /* func where register points to */,
755 int value_regno) 1100 int off, int size, int value_regno)
756{ 1101{
1102 struct bpf_verifier_state *vstate = env->cur_state;
1103 struct bpf_func_state *state = vstate->frame[vstate->curframe];
757 int i, slot = -off - 1, spi = slot / BPF_REG_SIZE; 1104 int i, slot = -off - 1, spi = slot / BPF_REG_SIZE;
758 u8 *stype; 1105 u8 *stype;
759 1106
760 if (state->allocated_stack <= slot) { 1107 if (reg_state->allocated_stack <= slot) {
761 verbose(env, "invalid read from stack off %d+0 size %d\n", 1108 verbose(env, "invalid read from stack off %d+0 size %d\n",
762 off, size); 1109 off, size);
763 return -EACCES; 1110 return -EACCES;
764 } 1111 }
765 stype = state->stack[spi].slot_type; 1112 stype = reg_state->stack[spi].slot_type;
766 1113
767 if (stype[0] == STACK_SPILL) { 1114 if (stype[0] == STACK_SPILL) {
768 if (size != BPF_REG_SIZE) { 1115 if (size != BPF_REG_SIZE) {
@@ -778,21 +1125,44 @@ static int check_stack_read(struct bpf_verifier_env *env,
778 1125
779 if (value_regno >= 0) { 1126 if (value_regno >= 0) {
780 /* restore register state from stack */ 1127 /* restore register state from stack */
781 state->regs[value_regno] = state->stack[spi].spilled_ptr; 1128 state->regs[value_regno] = reg_state->stack[spi].spilled_ptr;
782 mark_stack_slot_read(state, spi); 1129 /* mark reg as written since spilled pointer state likely
1130 * has its liveness marks cleared by is_state_visited()
1131 * which resets stack/reg liveness for state transitions
1132 */
1133 state->regs[value_regno].live |= REG_LIVE_WRITTEN;
783 } 1134 }
1135 mark_stack_slot_read(env, vstate, vstate->parent, spi,
1136 reg_state->frameno);
784 return 0; 1137 return 0;
785 } else { 1138 } else {
1139 int zeros = 0;
1140
786 for (i = 0; i < size; i++) { 1141 for (i = 0; i < size; i++) {
787 if (stype[(slot - i) % BPF_REG_SIZE] != STACK_MISC) { 1142 if (stype[(slot - i) % BPF_REG_SIZE] == STACK_MISC)
788 verbose(env, "invalid read from stack off %d+%d size %d\n", 1143 continue;
789 off, i, size); 1144 if (stype[(slot - i) % BPF_REG_SIZE] == STACK_ZERO) {
790 return -EACCES; 1145 zeros++;
1146 continue;
1147 }
1148 verbose(env, "invalid read from stack off %d+%d size %d\n",
1149 off, i, size);
1150 return -EACCES;
1151 }
1152 mark_stack_slot_read(env, vstate, vstate->parent, spi,
1153 reg_state->frameno);
1154 if (value_regno >= 0) {
1155 if (zeros == size) {
1156 /* any size read into register is zero extended,
1157 * so the whole register == const_zero
1158 */
1159 __mark_reg_const_zero(&state->regs[value_regno]);
1160 } else {
1161 /* have read misc data from the stack */
1162 mark_reg_unknown(env, state->regs, value_regno);
791 } 1163 }
1164 state->regs[value_regno].live |= REG_LIVE_WRITTEN;
792 } 1165 }
793 if (value_regno >= 0)
794 /* have read misc data from the stack */
795 mark_reg_unknown(env, state->regs, value_regno);
796 return 0; 1166 return 0;
797 } 1167 }
798} 1168}
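
The new non-spill read path can be summarized as: a read succeeds only if every byte is initialized, and the destination register becomes a known zero only when every byte read was STACK_ZERO; a single STACK_MISC byte is enough to make it unknown. A small standalone model of that classification:

#include <stdio.h>

enum slot { STACK_INVALID, STACK_MISC, STACK_ZERO };

enum result { REG_CONST_ZERO, REG_UNKNOWN, READ_ERROR };

/* classify the destination register after reading 'size' stack bytes */
static enum result read_stack(const enum slot *bytes, int size)
{
        int i, zeros = 0;

        for (i = 0; i < size; i++) {
                if (bytes[i] == STACK_MISC)
                        continue;
                if (bytes[i] == STACK_ZERO) {
                        zeros++;
                        continue;
                }
                return READ_ERROR;      /* uninitialized byte */
        }
        /* any-size read is zero extended, so all-zero bytes == const zero */
        return zeros == size ? REG_CONST_ZERO : REG_UNKNOWN;
}

int main(void)
{
        enum slot zeroed[4] = { STACK_ZERO, STACK_ZERO, STACK_ZERO, STACK_ZERO };
        enum slot mixed[4]  = { STACK_ZERO, STACK_MISC, STACK_ZERO, STACK_ZERO };

        printf("all-zero read -> %d (const zero)\n", read_stack(zeroed, 4));
        printf("mixed read    -> %d (unknown)\n", read_stack(mixed, 4));
        return 0;
}
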
@@ -817,7 +1187,8 @@ static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
817static int check_map_access(struct bpf_verifier_env *env, u32 regno, 1187static int check_map_access(struct bpf_verifier_env *env, u32 regno,
818 int off, int size, bool zero_size_allowed) 1188 int off, int size, bool zero_size_allowed)
819{ 1189{
820 struct bpf_verifier_state *state = env->cur_state; 1190 struct bpf_verifier_state *vstate = env->cur_state;
1191 struct bpf_func_state *state = vstate->frame[vstate->curframe];
821 struct bpf_reg_state *reg = &state->regs[regno]; 1192 struct bpf_reg_state *reg = &state->regs[regno];
822 int err; 1193 int err;
823 1194
@@ -1079,6 +1450,103 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
1079 strict); 1450 strict);
1080} 1451}
1081 1452
1453static int update_stack_depth(struct bpf_verifier_env *env,
1454 const struct bpf_func_state *func,
1455 int off)
1456{
1457 u16 stack = env->subprog_stack_depth[func->subprogno];
1458
1459 if (stack >= -off)
1460 return 0;
1461
1462 /* update known max for given subprogram */
1463 env->subprog_stack_depth[func->subprogno] = -off;
1464 return 0;
1465}
1466
1467/* starting from main bpf function walk all instructions of the function
1468 * and recursively walk all callees that given function can call.
1469 * Ignore jump and exit insns.
1470 * Since recursion is prevented by check_cfg() this algorithm
1471 * only needs a local stack of MAX_CALL_FRAMES to remember callsites
1472 */
1473static int check_max_stack_depth(struct bpf_verifier_env *env)
1474{
1475 int depth = 0, frame = 0, subprog = 0, i = 0, subprog_end;
1476 struct bpf_insn *insn = env->prog->insnsi;
1477 int insn_cnt = env->prog->len;
1478 int ret_insn[MAX_CALL_FRAMES];
1479 int ret_prog[MAX_CALL_FRAMES];
1480
1481process_func:
1482 /* round up to 32-bytes, since this is granularity
1483 * of interpreter stack size
1484 */
1485 depth += round_up(max_t(u32, env->subprog_stack_depth[subprog], 1), 32);
1486 if (depth > MAX_BPF_STACK) {
1487 verbose(env, "combined stack size of %d calls is %d. Too large\n",
1488 frame + 1, depth);
1489 return -EACCES;
1490 }
1491continue_func:
1492 if (env->subprog_cnt == subprog)
1493 subprog_end = insn_cnt;
1494 else
1495 subprog_end = env->subprog_starts[subprog];
1496 for (; i < subprog_end; i++) {
1497 if (insn[i].code != (BPF_JMP | BPF_CALL))
1498 continue;
1499 if (insn[i].src_reg != BPF_PSEUDO_CALL)
1500 continue;
1501 /* remember insn and function to return to */
1502 ret_insn[frame] = i + 1;
1503 ret_prog[frame] = subprog;
1504
1505 /* find the callee */
1506 i = i + insn[i].imm + 1;
1507 subprog = find_subprog(env, i);
1508 if (subprog < 0) {
1509 WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
1510 i);
1511 return -EFAULT;
1512 }
1513 subprog++;
1514 frame++;
1515 if (frame >= MAX_CALL_FRAMES) {
1516 WARN_ONCE(1, "verifier bug. Call stack is too deep\n");
1517 return -EFAULT;
1518 }
1519 goto process_func;
1520 }
1521 /* end of for() loop means the last insn of the 'subprog'
1522 * was reached. Doesn't matter whether it was JA or EXIT
1523 */
1524 if (frame == 0)
1525 return 0;
1526 depth -= round_up(max_t(u32, env->subprog_stack_depth[subprog], 1), 32);
1527 frame--;
1528 i = ret_insn[frame];
1529 subprog = ret_prog[frame];
1530 goto continue_func;
1531}
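
A worked example of the combined stack check above, using made-up per-subprog stack sizes for a main -> f1 -> f2 call chain: each frame is rounded up to the interpreter's 32-byte granularity before being added, and the sum is compared against MAX_BPF_STACK.

#include <stdio.h>

#define MAX_BPF_STACK 512

/* round 'x' up to a multiple of 32, the interpreter's stack granularity */
static unsigned int round_up_32(unsigned int x)
{
        return (x + 31u) & ~31u;
}

int main(void)
{
        /* hypothetical per-subprog stack usage along one call chain */
        unsigned int usage[] = { 200, 120, 40 };
        unsigned int depth = 0;
        unsigned int i;

        for (i = 0; i < sizeof(usage) / sizeof(usage[0]); i++) {
                unsigned int frame = round_up_32(usage[i] ? usage[i] : 1);

                depth += frame;
                printf("frame %u: +%u -> combined %u\n", i, frame, depth);
        }
        printf("limit %d exceeded: %s\n", MAX_BPF_STACK,
               depth > MAX_BPF_STACK ? "yes" : "no");
        return 0;
}

Here 200, 120 and 40 round up to 224, 128 and 64, for a combined depth of 416, which still fits under the 512-byte limit.
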
1532
1533#ifndef CONFIG_BPF_JIT_ALWAYS_ON
1534static int get_callee_stack_depth(struct bpf_verifier_env *env,
1535 const struct bpf_insn *insn, int idx)
1536{
1537 int start = idx + insn->imm + 1, subprog;
1538
1539 subprog = find_subprog(env, start);
1540 if (subprog < 0) {
1541 WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
1542 start);
1543 return -EFAULT;
1544 }
1545 subprog++;
1546 return env->subprog_stack_depth[subprog];
1547}
1548#endif
1549
1082/* truncate register to smaller size (in bytes) 1550/* truncate register to smaller size (in bytes)
1083 * must be called with size < BPF_REG_SIZE 1551 * must be called with size < BPF_REG_SIZE
1084 */ 1552 */
@@ -1112,9 +1580,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
1112 int bpf_size, enum bpf_access_type t, 1580 int bpf_size, enum bpf_access_type t,
1113 int value_regno) 1581 int value_regno)
1114{ 1582{
1115 struct bpf_verifier_state *state = env->cur_state;
1116 struct bpf_reg_state *regs = cur_regs(env); 1583 struct bpf_reg_state *regs = cur_regs(env);
1117 struct bpf_reg_state *reg = regs + regno; 1584 struct bpf_reg_state *reg = regs + regno;
1585 struct bpf_func_state *state;
1118 int size, err = 0; 1586 int size, err = 0;
1119 1587
1120 size = bpf_size_to_bytes(bpf_size); 1588 size = bpf_size_to_bytes(bpf_size);
@@ -1203,8 +1671,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
1203 return -EACCES; 1671 return -EACCES;
1204 } 1672 }
1205 1673
1206 if (env->prog->aux->stack_depth < -off) 1674 state = func(env, reg);
1207 env->prog->aux->stack_depth = -off; 1675 err = update_stack_depth(env, state, off);
1676 if (err)
1677 return err;
1208 1678
1209 if (t == BPF_WRITE) 1679 if (t == BPF_WRITE)
1210 err = check_stack_write(env, state, off, size, 1680 err = check_stack_write(env, state, off, size,
@@ -1282,12 +1752,6 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins
1282 BPF_SIZE(insn->code), BPF_WRITE, -1); 1752 BPF_SIZE(insn->code), BPF_WRITE, -1);
1283} 1753}
1284 1754
1285/* Does this register contain a constant zero? */
1286static bool register_is_null(struct bpf_reg_state reg)
1287{
1288 return reg.type == SCALAR_VALUE && tnum_equals_const(reg.var_off, 0);
1289}
1290
1291/* when register 'regno' is passed into function that will read 'access_size' 1755/* when register 'regno' is passed into function that will read 'access_size'
1292 * bytes from that pointer, make sure that it's within stack boundary 1756 * bytes from that pointer, make sure that it's within stack boundary
1293 * and all elements of stack are initialized. 1757 * and all elements of stack are initialized.
@@ -1298,32 +1762,32 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
1298 int access_size, bool zero_size_allowed, 1762 int access_size, bool zero_size_allowed,
1299 struct bpf_call_arg_meta *meta) 1763 struct bpf_call_arg_meta *meta)
1300{ 1764{
1301 struct bpf_verifier_state *state = env->cur_state; 1765 struct bpf_reg_state *reg = cur_regs(env) + regno;
1302 struct bpf_reg_state *regs = state->regs; 1766 struct bpf_func_state *state = func(env, reg);
1303 int off, i, slot, spi; 1767 int off, i, slot, spi;
1304 1768
1305 if (regs[regno].type != PTR_TO_STACK) { 1769 if (reg->type != PTR_TO_STACK) {
1306 /* Allow zero-byte read from NULL, regardless of pointer type */ 1770 /* Allow zero-byte read from NULL, regardless of pointer type */
1307 if (zero_size_allowed && access_size == 0 && 1771 if (zero_size_allowed && access_size == 0 &&
1308 register_is_null(regs[regno])) 1772 register_is_null(reg))
1309 return 0; 1773 return 0;
1310 1774
1311 verbose(env, "R%d type=%s expected=%s\n", regno, 1775 verbose(env, "R%d type=%s expected=%s\n", regno,
1312 reg_type_str[regs[regno].type], 1776 reg_type_str[reg->type],
1313 reg_type_str[PTR_TO_STACK]); 1777 reg_type_str[PTR_TO_STACK]);
1314 return -EACCES; 1778 return -EACCES;
1315 } 1779 }
1316 1780
1317 /* Only allow fixed-offset stack reads */ 1781 /* Only allow fixed-offset stack reads */
1318 if (!tnum_is_const(regs[regno].var_off)) { 1782 if (!tnum_is_const(reg->var_off)) {
1319 char tn_buf[48]; 1783 char tn_buf[48];
1320 1784
1321 tnum_strn(tn_buf, sizeof(tn_buf), regs[regno].var_off); 1785 tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
1322 verbose(env, "invalid variable stack read R%d var_off=%s\n", 1786 verbose(env, "invalid variable stack read R%d var_off=%s\n",
1323 regno, tn_buf); 1787 regno, tn_buf);
1324 return -EACCES; 1788 return -EACCES;
1325 } 1789 }
1326 off = regs[regno].off + regs[regno].var_off.value; 1790 off = reg->off + reg->var_off.value;
1327 if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || 1791 if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
1328 access_size < 0 || (access_size == 0 && !zero_size_allowed)) { 1792 access_size < 0 || (access_size == 0 && !zero_size_allowed)) {
1329 verbose(env, "invalid stack type R%d off=%d access_size=%d\n", 1793 verbose(env, "invalid stack type R%d off=%d access_size=%d\n",
@@ -1331,9 +1795,6 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
1331 return -EACCES; 1795 return -EACCES;
1332 } 1796 }
1333 1797
1334 if (env->prog->aux->stack_depth < -off)
1335 env->prog->aux->stack_depth = -off;
1336
1337 if (meta && meta->raw_mode) { 1798 if (meta && meta->raw_mode) {
1338 meta->access_size = access_size; 1799 meta->access_size = access_size;
1339 meta->regno = regno; 1800 meta->regno = regno;
@@ -1341,17 +1802,32 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
1341 } 1802 }
1342 1803
1343 for (i = 0; i < access_size; i++) { 1804 for (i = 0; i < access_size; i++) {
1805 u8 *stype;
1806
1344 slot = -(off + i) - 1; 1807 slot = -(off + i) - 1;
1345 spi = slot / BPF_REG_SIZE; 1808 spi = slot / BPF_REG_SIZE;
1346 if (state->allocated_stack <= slot || 1809 if (state->allocated_stack <= slot)
1347 state->stack[spi].slot_type[slot % BPF_REG_SIZE] != 1810 goto err;
1348 STACK_MISC) { 1811 stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE];
1349 verbose(env, "invalid indirect read from stack off %d+%d size %d\n", 1812 if (*stype == STACK_MISC)
1350 off, i, access_size); 1813 goto mark;
1351 return -EACCES; 1814 if (*stype == STACK_ZERO) {
1815 /* helper can write anything into the stack */
1816 *stype = STACK_MISC;
1817 goto mark;
1352 } 1818 }
1819err:
1820 verbose(env, "invalid indirect read from stack off %d+%d size %d\n",
1821 off, i, access_size);
1822 return -EACCES;
1823mark:
1824 /* reading any byte out of 8-byte 'spill_slot' will cause
1825 * the whole slot to be marked as 'read'
1826 */
1827 mark_stack_slot_read(env, env->cur_state, env->cur_state->parent,
1828 spi, state->frameno);
1353 } 1829 }
1354 return 0; 1830 return update_stack_depth(env, state, off);
1355} 1831}
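
For the helper-argument path above, zero-initialized slots are accepted like STACK_MISC but are downgraded on the spot, since the helper may overwrite them. A standalone sketch of that byte-by-byte check, with the slot names reused for illustration only:

#include <stdio.h>

enum slot { STACK_INVALID, STACK_MISC, STACK_ZERO };

/* a helper is about to access 'size' bytes starting at 'bytes':
 * every byte must be initialized; zero-initialized bytes are fine,
 * but since the helper may overwrite them they are downgraded to MISC
 */
static int check_helper_access(enum slot *bytes, int size)
{
        int i;

        for (i = 0; i < size; i++) {
                if (bytes[i] == STACK_INVALID)
                        return -1;              /* uninitialized stack */
                if (bytes[i] == STACK_ZERO)
                        bytes[i] = STACK_MISC;  /* helper can write anything */
        }
        return 0;
}

int main(void)
{
        enum slot buf[4] = { STACK_ZERO, STACK_ZERO, STACK_MISC, STACK_MISC };

        printf("access ok: %d\n", check_helper_access(buf, 4) == 0);
        printf("slot 0 is now MISC: %d\n", buf[0] == STACK_MISC);
        return 0;
}
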
1356 1832
1357static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, 1833static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
@@ -1374,6 +1850,19 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
1374 } 1850 }
1375} 1851}
1376 1852
1853static bool arg_type_is_mem_ptr(enum bpf_arg_type type)
1854{
1855 return type == ARG_PTR_TO_MEM ||
1856 type == ARG_PTR_TO_MEM_OR_NULL ||
1857 type == ARG_PTR_TO_UNINIT_MEM;
1858}
1859
1860static bool arg_type_is_mem_size(enum bpf_arg_type type)
1861{
1862 return type == ARG_CONST_SIZE ||
1863 type == ARG_CONST_SIZE_OR_ZERO;
1864}
1865
1377static int check_func_arg(struct bpf_verifier_env *env, u32 regno, 1866static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
1378 enum bpf_arg_type arg_type, 1867 enum bpf_arg_type arg_type,
1379 struct bpf_call_arg_meta *meta) 1868 struct bpf_call_arg_meta *meta)
@@ -1423,15 +1912,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
1423 expected_type = PTR_TO_CTX; 1912 expected_type = PTR_TO_CTX;
1424 if (type != expected_type) 1913 if (type != expected_type)
1425 goto err_type; 1914 goto err_type;
1426 } else if (arg_type == ARG_PTR_TO_MEM || 1915 } else if (arg_type_is_mem_ptr(arg_type)) {
1427 arg_type == ARG_PTR_TO_MEM_OR_NULL ||
1428 arg_type == ARG_PTR_TO_UNINIT_MEM) {
1429 expected_type = PTR_TO_STACK; 1916 expected_type = PTR_TO_STACK;
1430 /* One exception here. In case function allows for NULL to be 1917 /* One exception here. In case function allows for NULL to be
1431 * passed in as argument, it's a SCALAR_VALUE type. Final test 1918 * passed in as argument, it's a SCALAR_VALUE type. Final test
1432 * happens during stack boundary checking. 1919 * happens during stack boundary checking.
1433 */ 1920 */
1434 if (register_is_null(*reg) && 1921 if (register_is_null(reg) &&
1435 arg_type == ARG_PTR_TO_MEM_OR_NULL) 1922 arg_type == ARG_PTR_TO_MEM_OR_NULL)
1436 /* final test in check_stack_boundary() */; 1923 /* final test in check_stack_boundary() */;
1437 else if (!type_is_pkt_pointer(type) && 1924 else if (!type_is_pkt_pointer(type) &&
@@ -1486,25 +1973,12 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
1486 err = check_stack_boundary(env, regno, 1973 err = check_stack_boundary(env, regno,
1487 meta->map_ptr->value_size, 1974 meta->map_ptr->value_size,
1488 false, NULL); 1975 false, NULL);
1489 } else if (arg_type == ARG_CONST_SIZE || 1976 } else if (arg_type_is_mem_size(arg_type)) {
1490 arg_type == ARG_CONST_SIZE_OR_ZERO) {
1491 bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO); 1977 bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO);
1492 1978
1493 /* bpf_xxx(..., buf, len) call will access 'len' bytes
1494 * from stack pointer 'buf'. Check it
1495 * note: regno == len, regno - 1 == buf
1496 */
1497 if (regno == 0) {
1498 /* kernel subsystem misconfigured verifier */
1499 verbose(env,
1500 "ARG_CONST_SIZE cannot be first argument\n");
1501 return -EACCES;
1502 }
1503
1504 /* The register is SCALAR_VALUE; the access check 1979 /* The register is SCALAR_VALUE; the access check
1505 * happens using its boundaries. 1980 * happens using its boundaries.
1506 */ 1981 */
1507
1508 if (!tnum_is_const(reg->var_off)) 1982 if (!tnum_is_const(reg->var_off))
1509 /* For unprivileged variable accesses, disable raw 1983 /* For unprivileged variable accesses, disable raw
1510 * mode so that the program is required to 1984 * mode so that the program is required to
@@ -1604,6 +2078,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
1604 case BPF_FUNC_tail_call: 2078 case BPF_FUNC_tail_call:
1605 if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY) 2079 if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
1606 goto error; 2080 goto error;
2081 if (env->subprog_cnt) {
2082 verbose(env, "tail_calls are not allowed in programs with bpf-to-bpf calls\n");
2083 return -EINVAL;
2084 }
1607 break; 2085 break;
1608 case BPF_FUNC_perf_event_read: 2086 case BPF_FUNC_perf_event_read:
1609 case BPF_FUNC_perf_event_output: 2087 case BPF_FUNC_perf_event_output:
@@ -1644,7 +2122,7 @@ error:
1644 return -EINVAL; 2122 return -EINVAL;
1645} 2123}
1646 2124
1647static int check_raw_mode(const struct bpf_func_proto *fn) 2125static bool check_raw_mode_ok(const struct bpf_func_proto *fn)
1648{ 2126{
1649 int count = 0; 2127 int count = 0;
1650 2128
@@ -1659,15 +2137,52 @@ static int check_raw_mode(const struct bpf_func_proto *fn)
1659 if (fn->arg5_type == ARG_PTR_TO_UNINIT_MEM) 2137 if (fn->arg5_type == ARG_PTR_TO_UNINIT_MEM)
1660 count++; 2138 count++;
1661 2139
1662 return count > 1 ? -EINVAL : 0; 2140 /* We only support one arg being in raw mode at the moment,
2141 * which is sufficient for the helper functions we have
2142 * right now.
2143 */
2144 return count <= 1;
2145}
2146
2147static bool check_args_pair_invalid(enum bpf_arg_type arg_curr,
2148 enum bpf_arg_type arg_next)
2149{
2150 return (arg_type_is_mem_ptr(arg_curr) &&
2151 !arg_type_is_mem_size(arg_next)) ||
2152 (!arg_type_is_mem_ptr(arg_curr) &&
2153 arg_type_is_mem_size(arg_next));
2154}
2155
2156static bool check_arg_pair_ok(const struct bpf_func_proto *fn)
2157{
2158 /* bpf_xxx(..., buf, len) call will access 'len'
2159 * bytes from memory 'buf'. Both arg types need
2160 * to be paired, so make sure there's no buggy
2161 * helper function specification.
2162 */
2163 if (arg_type_is_mem_size(fn->arg1_type) ||
2164 arg_type_is_mem_ptr(fn->arg5_type) ||
2165 check_args_pair_invalid(fn->arg1_type, fn->arg2_type) ||
2166 check_args_pair_invalid(fn->arg2_type, fn->arg3_type) ||
2167 check_args_pair_invalid(fn->arg3_type, fn->arg4_type) ||
2168 check_args_pair_invalid(fn->arg4_type, fn->arg5_type))
2169 return false;
2170
2171 return true;
2172}
2173
2174static int check_func_proto(const struct bpf_func_proto *fn)
2175{
2176 return check_raw_mode_ok(fn) &&
2177 check_arg_pair_ok(fn) ? 0 : -EINVAL;
1663} 2178}
1664 2179
1665/* Packet data might have moved, any old PTR_TO_PACKET[_META,_END] 2180/* Packet data might have moved, any old PTR_TO_PACKET[_META,_END]
1666 * are now invalid, so turn them into unknown SCALAR_VALUE. 2181 * are now invalid, so turn them into unknown SCALAR_VALUE.
1667 */ 2182 */
1668static void clear_all_pkt_pointers(struct bpf_verifier_env *env) 2183static void __clear_all_pkt_pointers(struct bpf_verifier_env *env,
2184 struct bpf_func_state *state)
1669{ 2185{
1670 struct bpf_verifier_state *state = env->cur_state;
1671 struct bpf_reg_state *regs = state->regs, *reg; 2186 struct bpf_reg_state *regs = state->regs, *reg;
1672 int i; 2187 int i;
1673 2188
@@ -1684,7 +2199,121 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
1684 } 2199 }
1685} 2200}
1686 2201
1687static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) 2202static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
2203{
2204 struct bpf_verifier_state *vstate = env->cur_state;
2205 int i;
2206
2207 for (i = 0; i <= vstate->curframe; i++)
2208 __clear_all_pkt_pointers(env, vstate->frame[i]);
2209}
2210
2211static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
2212 int *insn_idx)
2213{
2214 struct bpf_verifier_state *state = env->cur_state;
2215 struct bpf_func_state *caller, *callee;
2216 int i, subprog, target_insn;
2217
2218 if (state->curframe + 1 >= MAX_CALL_FRAMES) {
2219 verbose(env, "the call stack of %d frames is too deep\n",
2220 state->curframe + 2);
2221 return -E2BIG;
2222 }
2223
2224 target_insn = *insn_idx + insn->imm;
2225 subprog = find_subprog(env, target_insn + 1);
2226 if (subprog < 0) {
2227 verbose(env, "verifier bug. No program starts at insn %d\n",
2228 target_insn + 1);
2229 return -EFAULT;
2230 }
2231
2232 caller = state->frame[state->curframe];
2233 if (state->frame[state->curframe + 1]) {
2234 verbose(env, "verifier bug. Frame %d already allocated\n",
2235 state->curframe + 1);
2236 return -EFAULT;
2237 }
2238
2239 callee = kzalloc(sizeof(*callee), GFP_KERNEL);
2240 if (!callee)
2241 return -ENOMEM;
2242 state->frame[state->curframe + 1] = callee;
2243
2244 /* callee cannot access r0, r6 - r9 for reading and has to write
2245 * into its own stack before reading from it.
2246 * callee can read/write into caller's stack
2247 */
2248 init_func_state(env, callee,
2249 /* remember the callsite, it will be used by bpf_exit */
2250 *insn_idx /* callsite */,
2251 state->curframe + 1 /* frameno within this callchain */,
2252 subprog + 1 /* subprog number within this prog */);
2253
2254 /* copy r1 - r5 args that callee can access */
2255 for (i = BPF_REG_1; i <= BPF_REG_5; i++)
2256 callee->regs[i] = caller->regs[i];
2257
 2258	/* after the call registers r0 - r5 were scratched */
2259 for (i = 0; i < CALLER_SAVED_REGS; i++) {
2260 mark_reg_not_init(env, caller->regs, caller_saved[i]);
2261 check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
2262 }
2263
2264 /* only increment it after check_reg_arg() finished */
2265 state->curframe++;
2266
2267 /* and go analyze first insn of the callee */
2268 *insn_idx = target_insn;
2269
2270 if (env->log.level) {
2271 verbose(env, "caller:\n");
2272 print_verifier_state(env, caller);
2273 verbose(env, "callee:\n");
2274 print_verifier_state(env, callee);
2275 }
2276 return 0;
2277}
2278
2279static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
2280{
2281 struct bpf_verifier_state *state = env->cur_state;
2282 struct bpf_func_state *caller, *callee;
2283 struct bpf_reg_state *r0;
2284
2285 callee = state->frame[state->curframe];
2286 r0 = &callee->regs[BPF_REG_0];
2287 if (r0->type == PTR_TO_STACK) {
2288 /* technically it's ok to return caller's stack pointer
2289 * (or caller's caller's pointer) back to the caller,
2290 * since these pointers are valid. Only current stack
2291 * pointer will be invalid as soon as function exits,
2292 * but let's be conservative
2293 */
2294 verbose(env, "cannot return stack pointer to the caller\n");
2295 return -EINVAL;
2296 }
2297
2298 state->curframe--;
2299 caller = state->frame[state->curframe];
2300 /* return to the caller whatever r0 had in the callee */
2301 caller->regs[BPF_REG_0] = *r0;
2302
2303 *insn_idx = callee->callsite + 1;
2304 if (env->log.level) {
2305 verbose(env, "returning from callee:\n");
2306 print_verifier_state(env, callee);
2307 verbose(env, "to caller at %d:\n", *insn_idx);
2308 print_verifier_state(env, caller);
2309 }
2310 /* clear everything in the callee */
2311 free_func_state(callee);
2312 state->frame[state->curframe + 1] = NULL;
2313 return 0;
2314}
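
Taken together, check_func_call() and prepare_func_exit() implement a simple calling convention at verification time: on a call only r1 - r5 flow into a fresh callee frame and the caller-saved registers are scratched, and on exit only r0 flows back to the caller. A minimal standalone model of that data flow, using plain longs instead of bpf_reg_state:

#include <stdio.h>
#include <string.h>

#define NR_REGS 11      /* r0 - r10 */

struct frame {
        long regs[NR_REGS];
};

/* model of a bpf-to-bpf call: the callee sees only r1 - r5 */
static void push_call(const struct frame *caller, struct frame *callee)
{
        int i;

        memset(callee, 0, sizeof(*callee));
        for (i = 1; i <= 5; i++)
                callee->regs[i] = caller->regs[i];
}

/* model of bpf_exit from a callee: only r0 flows back to the caller */
static void pop_call(struct frame *caller, const struct frame *callee)
{
        caller->regs[0] = callee->regs[0];
}

int main(void)
{
        struct frame caller = { .regs = { 0, 7, 8, 9, 10, 11, 42 } };
        struct frame callee;

        push_call(&caller, &callee);
        callee.regs[0] = callee.regs[1] + callee.regs[2];       /* callee computes r0 */
        pop_call(&caller, &callee);

        printf("caller r0 = %ld, caller r6 untouched = %ld\n",
               caller.regs[0], caller.regs[6]);
        return 0;
}
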
2315
2316static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
1688{ 2317{
1689 const struct bpf_func_proto *fn = NULL; 2318 const struct bpf_func_proto *fn = NULL;
1690 struct bpf_reg_state *regs; 2319 struct bpf_reg_state *regs;
@@ -1701,7 +2330,6 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
1701 2330
1702 if (env->ops->get_func_proto) 2331 if (env->ops->get_func_proto)
1703 fn = env->ops->get_func_proto(func_id); 2332 fn = env->ops->get_func_proto(func_id);
1704
1705 if (!fn) { 2333 if (!fn) {
1706 verbose(env, "unknown func %s#%d\n", func_id_name(func_id), 2334 verbose(env, "unknown func %s#%d\n", func_id_name(func_id),
1707 func_id); 2335 func_id);
@@ -1725,10 +2353,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
1725 memset(&meta, 0, sizeof(meta)); 2353 memset(&meta, 0, sizeof(meta));
1726 meta.pkt_access = fn->pkt_access; 2354 meta.pkt_access = fn->pkt_access;
1727 2355
1728 /* We only support one arg being in raw mode at the moment, which 2356 err = check_func_proto(fn);
1729 * is sufficient for the helper functions we have right now.
1730 */
1731 err = check_raw_mode(fn);
1732 if (err) { 2357 if (err) {
1733 verbose(env, "kernel subsystem misconfigured func %s#%d\n", 2358 verbose(env, "kernel subsystem misconfigured func %s#%d\n",
1734 func_id_name(func_id), func_id); 2359 func_id_name(func_id), func_id);
@@ -1884,7 +2509,9 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
1884 const struct bpf_reg_state *ptr_reg, 2509 const struct bpf_reg_state *ptr_reg,
1885 const struct bpf_reg_state *off_reg) 2510 const struct bpf_reg_state *off_reg)
1886{ 2511{
1887 struct bpf_reg_state *regs = cur_regs(env), *dst_reg; 2512 struct bpf_verifier_state *vstate = env->cur_state;
2513 struct bpf_func_state *state = vstate->frame[vstate->curframe];
2514 struct bpf_reg_state *regs = state->regs, *dst_reg;
1888 bool known = tnum_is_const(off_reg->var_off); 2515 bool known = tnum_is_const(off_reg->var_off);
1889 s64 smin_val = off_reg->smin_value, smax_val = off_reg->smax_value, 2516 s64 smin_val = off_reg->smin_value, smax_val = off_reg->smax_value,
1890 smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value; 2517 smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value;
@@ -2319,7 +2946,9 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
2319static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, 2946static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
2320 struct bpf_insn *insn) 2947 struct bpf_insn *insn)
2321{ 2948{
2322 struct bpf_reg_state *regs = cur_regs(env), *dst_reg, *src_reg; 2949 struct bpf_verifier_state *vstate = env->cur_state;
2950 struct bpf_func_state *state = vstate->frame[vstate->curframe];
2951 struct bpf_reg_state *regs = state->regs, *dst_reg, *src_reg;
2323 struct bpf_reg_state *ptr_reg = NULL, off_reg = {0}; 2952 struct bpf_reg_state *ptr_reg = NULL, off_reg = {0};
2324 u8 opcode = BPF_OP(insn->code); 2953 u8 opcode = BPF_OP(insn->code);
2325 2954
@@ -2370,12 +2999,12 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
2370 2999
2371 /* Got here implies adding two SCALAR_VALUEs */ 3000 /* Got here implies adding two SCALAR_VALUEs */
2372 if (WARN_ON_ONCE(ptr_reg)) { 3001 if (WARN_ON_ONCE(ptr_reg)) {
2373 print_verifier_state(env, env->cur_state); 3002 print_verifier_state(env, state);
2374 verbose(env, "verifier internal error: unexpected ptr_reg\n"); 3003 verbose(env, "verifier internal error: unexpected ptr_reg\n");
2375 return -EINVAL; 3004 return -EINVAL;
2376 } 3005 }
2377 if (WARN_ON(!src_reg)) { 3006 if (WARN_ON(!src_reg)) {
2378 print_verifier_state(env, env->cur_state); 3007 print_verifier_state(env, state);
2379 verbose(env, "verifier internal error: no src_reg\n"); 3008 verbose(env, "verifier internal error: no src_reg\n");
2380 return -EINVAL; 3009 return -EINVAL;
2381 } 3010 }
@@ -2537,14 +3166,15 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
2537 return 0; 3166 return 0;
2538} 3167}
2539 3168
2540static void find_good_pkt_pointers(struct bpf_verifier_state *state, 3169static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
2541 struct bpf_reg_state *dst_reg, 3170 struct bpf_reg_state *dst_reg,
2542 enum bpf_reg_type type, 3171 enum bpf_reg_type type,
2543 bool range_right_open) 3172 bool range_right_open)
2544{ 3173{
3174 struct bpf_func_state *state = vstate->frame[vstate->curframe];
2545 struct bpf_reg_state *regs = state->regs, *reg; 3175 struct bpf_reg_state *regs = state->regs, *reg;
2546 u16 new_range; 3176 u16 new_range;
2547 int i; 3177 int i, j;
2548 3178
2549 if (dst_reg->off < 0 || 3179 if (dst_reg->off < 0 ||
2550 (dst_reg->off == 0 && range_right_open)) 3180 (dst_reg->off == 0 && range_right_open))
@@ -2614,12 +3244,15 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
2614 /* keep the maximum range already checked */ 3244 /* keep the maximum range already checked */
2615 regs[i].range = max(regs[i].range, new_range); 3245 regs[i].range = max(regs[i].range, new_range);
2616 3246
2617 for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { 3247 for (j = 0; j <= vstate->curframe; j++) {
2618 if (state->stack[i].slot_type[0] != STACK_SPILL) 3248 state = vstate->frame[j];
2619 continue; 3249 for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
2620 reg = &state->stack[i].spilled_ptr; 3250 if (state->stack[i].slot_type[0] != STACK_SPILL)
2621 if (reg->type == type && reg->id == dst_reg->id) 3251 continue;
2622 reg->range = max(reg->range, new_range); 3252 reg = &state->stack[i].spilled_ptr;
3253 if (reg->type == type && reg->id == dst_reg->id)
3254 reg->range = max(reg->range, new_range);
3255 }
2623 } 3256 }
2624} 3257}
2625 3258
@@ -2857,20 +3490,24 @@ static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id,
2857/* The logic is similar to find_good_pkt_pointers(), both could eventually 3490/* The logic is similar to find_good_pkt_pointers(), both could eventually
2858 * be folded together at some point. 3491 * be folded together at some point.
2859 */ 3492 */
2860static void mark_map_regs(struct bpf_verifier_state *state, u32 regno, 3493static void mark_map_regs(struct bpf_verifier_state *vstate, u32 regno,
2861 bool is_null) 3494 bool is_null)
2862{ 3495{
3496 struct bpf_func_state *state = vstate->frame[vstate->curframe];
2863 struct bpf_reg_state *regs = state->regs; 3497 struct bpf_reg_state *regs = state->regs;
2864 u32 id = regs[regno].id; 3498 u32 id = regs[regno].id;
2865 int i; 3499 int i, j;
2866 3500
2867 for (i = 0; i < MAX_BPF_REG; i++) 3501 for (i = 0; i < MAX_BPF_REG; i++)
2868 mark_map_reg(regs, i, id, is_null); 3502 mark_map_reg(regs, i, id, is_null);
2869 3503
2870 for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { 3504 for (j = 0; j <= vstate->curframe; j++) {
2871 if (state->stack[i].slot_type[0] != STACK_SPILL) 3505 state = vstate->frame[j];
2872 continue; 3506 for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
2873 mark_map_reg(&state->stack[i].spilled_ptr, 0, id, is_null); 3507 if (state->stack[i].slot_type[0] != STACK_SPILL)
3508 continue;
3509 mark_map_reg(&state->stack[i].spilled_ptr, 0, id, is_null);
3510 }
2874 } 3511 }
2875} 3512}
2876 3513
@@ -2970,8 +3607,10 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn,
2970static int check_cond_jmp_op(struct bpf_verifier_env *env, 3607static int check_cond_jmp_op(struct bpf_verifier_env *env,
2971 struct bpf_insn *insn, int *insn_idx) 3608 struct bpf_insn *insn, int *insn_idx)
2972{ 3609{
2973 struct bpf_verifier_state *other_branch, *this_branch = env->cur_state; 3610 struct bpf_verifier_state *this_branch = env->cur_state;
2974 struct bpf_reg_state *regs = this_branch->regs, *dst_reg; 3611 struct bpf_verifier_state *other_branch;
3612 struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs;
3613 struct bpf_reg_state *dst_reg, *other_branch_regs;
2975 u8 opcode = BPF_OP(insn->code); 3614 u8 opcode = BPF_OP(insn->code);
2976 int err; 3615 int err;
2977 3616
@@ -3014,8 +3653,9 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
3014 if (BPF_SRC(insn->code) == BPF_K && 3653 if (BPF_SRC(insn->code) == BPF_K &&
3015 (opcode == BPF_JEQ || opcode == BPF_JNE) && 3654 (opcode == BPF_JEQ || opcode == BPF_JNE) &&
3016 dst_reg->type == SCALAR_VALUE && 3655 dst_reg->type == SCALAR_VALUE &&
3017 tnum_equals_const(dst_reg->var_off, insn->imm)) { 3656 tnum_is_const(dst_reg->var_off)) {
3018 if (opcode == BPF_JEQ) { 3657 if ((opcode == BPF_JEQ && dst_reg->var_off.value == insn->imm) ||
3658 (opcode == BPF_JNE && dst_reg->var_off.value != insn->imm)) {
3019 /* if (imm == imm) goto pc+off; 3659 /* if (imm == imm) goto pc+off;
3020 * only follow the goto, ignore fall-through 3660 * only follow the goto, ignore fall-through
3021 */ 3661 */
@@ -3033,6 +3673,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
3033 other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx); 3673 other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx);
3034 if (!other_branch) 3674 if (!other_branch)
3035 return -EFAULT; 3675 return -EFAULT;
3676 other_branch_regs = other_branch->frame[other_branch->curframe]->regs;
3036 3677
3037 /* detect if we are comparing against a constant value so we can adjust 3678 /* detect if we are comparing against a constant value so we can adjust
3038 * our min/max values for our dst register. 3679 * our min/max values for our dst register.
@@ -3045,22 +3686,22 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
3045 if (dst_reg->type == SCALAR_VALUE && 3686 if (dst_reg->type == SCALAR_VALUE &&
3046 regs[insn->src_reg].type == SCALAR_VALUE) { 3687 regs[insn->src_reg].type == SCALAR_VALUE) {
3047 if (tnum_is_const(regs[insn->src_reg].var_off)) 3688 if (tnum_is_const(regs[insn->src_reg].var_off))
3048 reg_set_min_max(&other_branch->regs[insn->dst_reg], 3689 reg_set_min_max(&other_branch_regs[insn->dst_reg],
3049 dst_reg, regs[insn->src_reg].var_off.value, 3690 dst_reg, regs[insn->src_reg].var_off.value,
3050 opcode); 3691 opcode);
3051 else if (tnum_is_const(dst_reg->var_off)) 3692 else if (tnum_is_const(dst_reg->var_off))
3052 reg_set_min_max_inv(&other_branch->regs[insn->src_reg], 3693 reg_set_min_max_inv(&other_branch_regs[insn->src_reg],
3053 &regs[insn->src_reg], 3694 &regs[insn->src_reg],
3054 dst_reg->var_off.value, opcode); 3695 dst_reg->var_off.value, opcode);
3055 else if (opcode == BPF_JEQ || opcode == BPF_JNE) 3696 else if (opcode == BPF_JEQ || opcode == BPF_JNE)
3056 /* Comparing for equality, we can combine knowledge */ 3697 /* Comparing for equality, we can combine knowledge */
3057 reg_combine_min_max(&other_branch->regs[insn->src_reg], 3698 reg_combine_min_max(&other_branch_regs[insn->src_reg],
3058 &other_branch->regs[insn->dst_reg], 3699 &other_branch_regs[insn->dst_reg],
3059 &regs[insn->src_reg], 3700 &regs[insn->src_reg],
3060 &regs[insn->dst_reg], opcode); 3701 &regs[insn->dst_reg], opcode);
3061 } 3702 }
3062 } else if (dst_reg->type == SCALAR_VALUE) { 3703 } else if (dst_reg->type == SCALAR_VALUE) {
3063 reg_set_min_max(&other_branch->regs[insn->dst_reg], 3704 reg_set_min_max(&other_branch_regs[insn->dst_reg],
3064 dst_reg, insn->imm, opcode); 3705 dst_reg, insn->imm, opcode);
3065 } 3706 }
3066 3707
@@ -3081,7 +3722,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
3081 return -EACCES; 3722 return -EACCES;
3082 } 3723 }
3083 if (env->log.level) 3724 if (env->log.level)
3084 print_verifier_state(env, this_branch); 3725 print_verifier_state(env, this_branch->frame[this_branch->curframe]);
3085 return 0; 3726 return 0;
3086} 3727}
3087 3728
@@ -3166,6 +3807,18 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
3166 return -EINVAL; 3807 return -EINVAL;
3167 } 3808 }
3168 3809
3810 if (env->subprog_cnt) {
3811 /* when program has LD_ABS insn JITs and interpreter assume
3812 * that r1 == ctx == skb which is not the case for callees
3813 * that can have arbitrary arguments. It's problematic
3814 * for main prog as well since JITs would need to analyze
3815 * all functions in order to make proper register save/restore
3816 * decisions in the main prog. Hence disallow LD_ABS with calls
3817 */
3818 verbose(env, "BPF_LD_[ABS|IND] instructions cannot be mixed with bpf-to-bpf calls\n");
3819 return -EINVAL;
3820 }
3821
3169 if (insn->dst_reg != BPF_REG_0 || insn->off != 0 || 3822 if (insn->dst_reg != BPF_REG_0 || insn->off != 0 ||
3170 BPF_SIZE(insn->code) == BPF_DW || 3823 BPF_SIZE(insn->code) == BPF_DW ||
3171 (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) { 3824 (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) {
@@ -3342,6 +3995,10 @@ static int check_cfg(struct bpf_verifier_env *env)
3342 int ret = 0; 3995 int ret = 0;
3343 int i, t; 3996 int i, t;
3344 3997
3998 ret = check_subprogs(env);
3999 if (ret < 0)
4000 return ret;
4001
3345 insn_state = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL); 4002 insn_state = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
3346 if (!insn_state) 4003 if (!insn_state)
3347 return -ENOMEM; 4004 return -ENOMEM;
@@ -3374,6 +4031,14 @@ peek_stack:
3374 goto err_free; 4031 goto err_free;
3375 if (t + 1 < insn_cnt) 4032 if (t + 1 < insn_cnt)
3376 env->explored_states[t + 1] = STATE_LIST_MARK; 4033 env->explored_states[t + 1] = STATE_LIST_MARK;
4034 if (insns[t].src_reg == BPF_PSEUDO_CALL) {
4035 env->explored_states[t] = STATE_LIST_MARK;
4036 ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env);
4037 if (ret == 1)
4038 goto peek_stack;
4039 else if (ret < 0)
4040 goto err_free;
4041 }
3377 } else if (opcode == BPF_JA) { 4042 } else if (opcode == BPF_JA) {
3378 if (BPF_SRC(insns[t].code) != BPF_K) { 4043 if (BPF_SRC(insns[t].code) != BPF_K) {
3379 ret = -EINVAL; 4044 ret = -EINVAL;
@@ -3492,11 +4157,21 @@ static bool check_ids(u32 old_id, u32 cur_id, struct idpair *idmap)
3492static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, 4157static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
3493 struct idpair *idmap) 4158 struct idpair *idmap)
3494{ 4159{
4160 bool equal;
4161
3495 if (!(rold->live & REG_LIVE_READ)) 4162 if (!(rold->live & REG_LIVE_READ))
3496 /* explored state didn't use this */ 4163 /* explored state didn't use this */
3497 return true; 4164 return true;
3498 4165
3499 if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, live)) == 0) 4166 equal = memcmp(rold, rcur, offsetof(struct bpf_reg_state, frameno)) == 0;
4167
4168 if (rold->type == PTR_TO_STACK)
4169 /* two stack pointers are equal only if they're pointing to
4170 * the same stack frame, since fp-8 in foo != fp-8 in bar
4171 */
4172 return equal && rold->frameno == rcur->frameno;
4173
4174 if (equal)
3500 return true; 4175 return true;
3501 4176
3502 if (rold->type == NOT_INIT) 4177 if (rold->type == NOT_INIT)
@@ -3568,7 +4243,6 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
3568 tnum_in(rold->var_off, rcur->var_off); 4243 tnum_in(rold->var_off, rcur->var_off);
3569 case PTR_TO_CTX: 4244 case PTR_TO_CTX:
3570 case CONST_PTR_TO_MAP: 4245 case CONST_PTR_TO_MAP:
3571 case PTR_TO_STACK:
3572 case PTR_TO_PACKET_END: 4246 case PTR_TO_PACKET_END:
3573 /* Only valid matches are exact, which memcmp() above 4247 /* Only valid matches are exact, which memcmp() above
3574 * would have accepted 4248 * would have accepted
@@ -3583,8 +4257,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
3583 return false; 4257 return false;
3584} 4258}
3585 4259
3586static bool stacksafe(struct bpf_verifier_state *old, 4260static bool stacksafe(struct bpf_func_state *old,
3587 struct bpf_verifier_state *cur, 4261 struct bpf_func_state *cur,
3588 struct idpair *idmap) 4262 struct idpair *idmap)
3589{ 4263{
3590 int i, spi; 4264 int i, spi;
@@ -3602,8 +4276,19 @@ static bool stacksafe(struct bpf_verifier_state *old,
3602 for (i = 0; i < old->allocated_stack; i++) { 4276 for (i = 0; i < old->allocated_stack; i++) {
3603 spi = i / BPF_REG_SIZE; 4277 spi = i / BPF_REG_SIZE;
3604 4278
4279 if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ))
4280 /* explored state didn't use this */
4281 continue;
4282
3605 if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID) 4283 if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID)
3606 continue; 4284 continue;
4285 /* if old state was safe with misc data in the stack
4286 * it will be safe with zero-initialized stack.
4287 * The opposite is not true
4288 */
4289 if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_MISC &&
4290 cur->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_ZERO)
4291 continue;
3607 if (old->stack[spi].slot_type[i % BPF_REG_SIZE] != 4292 if (old->stack[spi].slot_type[i % BPF_REG_SIZE] !=
3608 cur->stack[spi].slot_type[i % BPF_REG_SIZE]) 4293 cur->stack[spi].slot_type[i % BPF_REG_SIZE])
3609 /* Ex: old explored (safe) state has STACK_SPILL in 4294 /* Ex: old explored (safe) state has STACK_SPILL in
@@ -3660,9 +4345,8 @@ static bool stacksafe(struct bpf_verifier_state *old,
3660 * whereas register type in current state is meaningful, it means that 4345 * whereas register type in current state is meaningful, it means that
3661 * the current state will reach 'bpf_exit' instruction safely 4346 * the current state will reach 'bpf_exit' instruction safely
3662 */ 4347 */
3663static bool states_equal(struct bpf_verifier_env *env, 4348static bool func_states_equal(struct bpf_func_state *old,
3664 struct bpf_verifier_state *old, 4349 struct bpf_func_state *cur)
3665 struct bpf_verifier_state *cur)
3666{ 4350{
3667 struct idpair *idmap; 4351 struct idpair *idmap;
3668 bool ret = false; 4352 bool ret = false;
@@ -3686,71 +4370,72 @@ out_free:
3686 return ret; 4370 return ret;
3687} 4371}
3688 4372
4373static bool states_equal(struct bpf_verifier_env *env,
4374 struct bpf_verifier_state *old,
4375 struct bpf_verifier_state *cur)
4376{
4377 int i;
4378
4379 if (old->curframe != cur->curframe)
4380 return false;
4381
4382 /* for states to be equal callsites have to be the same
4383 * and all frame states need to be equivalent
4384 */
4385 for (i = 0; i <= old->curframe; i++) {
4386 if (old->frame[i]->callsite != cur->frame[i]->callsite)
4387 return false;
4388 if (!func_states_equal(old->frame[i], cur->frame[i]))
4389 return false;
4390 }
4391 return true;
4392}
4393
3689/* A write screens off any subsequent reads; but write marks come from the 4394/* A write screens off any subsequent reads; but write marks come from the
3690 * straight-line code between a state and its parent. When we arrive at a 4395 * straight-line code between a state and its parent. When we arrive at an
3691 * jump target (in the first iteration of the propagate_liveness() loop), 4396 * equivalent state (jump target or such) we didn't arrive by the straight-line
3692 * we didn't arrive by the straight-line code, so read marks in state must 4397 * code, so read marks in the state must propagate to the parent regardless
3693 * propagate to parent regardless of state's write marks. 4398 * of the state's write marks. That's what 'parent == state->parent' comparison
4399 * in mark_reg_read() and mark_stack_slot_read() is for.
3694 */ 4400 */
3695static bool do_propagate_liveness(const struct bpf_verifier_state *state, 4401static int propagate_liveness(struct bpf_verifier_env *env,
3696 struct bpf_verifier_state *parent) 4402 const struct bpf_verifier_state *vstate,
4403 struct bpf_verifier_state *vparent)
3697{ 4404{
3698 bool writes = parent == state->parent; /* Observe write marks */ 4405 int i, frame, err = 0;
3699 bool touched = false; /* any changes made? */ 4406 struct bpf_func_state *state, *parent;
3700 int i;
3701 4407
3702 if (!parent) 4408 if (vparent->curframe != vstate->curframe) {
3703 return touched; 4409 WARN(1, "propagate_live: parent frame %d current frame %d\n",
4410 vparent->curframe, vstate->curframe);
4411 return -EFAULT;
4412 }
3704 /* Propagate read liveness of registers... */ 4413 /* Propagate read liveness of registers... */
3705 BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG); 4414 BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG);
3706 /* We don't need to worry about FP liveness because it's read-only */ 4415 /* We don't need to worry about FP liveness because it's read-only */
3707 for (i = 0; i < BPF_REG_FP; i++) { 4416 for (i = 0; i < BPF_REG_FP; i++) {
3708 if (parent->regs[i].live & REG_LIVE_READ) 4417 if (vparent->frame[vparent->curframe]->regs[i].live & REG_LIVE_READ)
3709 continue; 4418 continue;
3710 if (writes && (state->regs[i].live & REG_LIVE_WRITTEN)) 4419 if (vstate->frame[vstate->curframe]->regs[i].live & REG_LIVE_READ) {
3711 continue; 4420 err = mark_reg_read(env, vstate, vparent, i);
3712 if (state->regs[i].live & REG_LIVE_READ) { 4421 if (err)
3713 parent->regs[i].live |= REG_LIVE_READ; 4422 return err;
3714 touched = true;
3715 } 4423 }
3716 } 4424 }
4425
3717 /* ... and stack slots */ 4426 /* ... and stack slots */
3718 for (i = 0; i < state->allocated_stack / BPF_REG_SIZE && 4427 for (frame = 0; frame <= vstate->curframe; frame++) {
3719 i < parent->allocated_stack / BPF_REG_SIZE; i++) { 4428 state = vstate->frame[frame];
3720 if (parent->stack[i].slot_type[0] != STACK_SPILL) 4429 parent = vparent->frame[frame];
3721 continue; 4430 for (i = 0; i < state->allocated_stack / BPF_REG_SIZE &&
3722 if (state->stack[i].slot_type[0] != STACK_SPILL) 4431 i < parent->allocated_stack / BPF_REG_SIZE; i++) {
3723 continue; 4432 if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ)
3724 if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ) 4433 continue;
3725 continue; 4434 if (state->stack[i].spilled_ptr.live & REG_LIVE_READ)
3726 if (writes && 4435 mark_stack_slot_read(env, vstate, vparent, i, frame);
3727 (state->stack[i].spilled_ptr.live & REG_LIVE_WRITTEN))
3728 continue;
3729 if (state->stack[i].spilled_ptr.live & REG_LIVE_READ) {
3730 parent->stack[i].spilled_ptr.live |= REG_LIVE_READ;
3731 touched = true;
3732 } 4436 }
3733 } 4437 }
3734 return touched; 4438 return err;
3735}
3736
3737/* "parent" is "a state from which we reach the current state", but initially
3738 * it is not the state->parent (i.e. "the state whose straight-line code leads
3739 * to the current state"), instead it is the state that happened to arrive at
3740 * a (prunable) equivalent of the current state. See comment above
3741 * do_propagate_liveness() for consequences of this.
3742 * This function is just a more efficient way of calling mark_reg_read() or
3743 * mark_stack_slot_read() on each reg in "parent" that is read in "state",
3744 * though it requires that parent != state->parent in the call arguments.
3745 */
3746static void propagate_liveness(const struct bpf_verifier_state *state,
3747 struct bpf_verifier_state *parent)
3748{
3749 while (do_propagate_liveness(state, parent)) {
3750 /* Something changed, so we need to feed those changes onward */
3751 state = parent;
3752 parent = state->parent;
3753 }
3754} 4439}
3755 4440
3756static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) 4441static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
@@ -3758,7 +4443,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
3758 struct bpf_verifier_state_list *new_sl; 4443 struct bpf_verifier_state_list *new_sl;
3759 struct bpf_verifier_state_list *sl; 4444 struct bpf_verifier_state_list *sl;
3760 struct bpf_verifier_state *cur = env->cur_state; 4445 struct bpf_verifier_state *cur = env->cur_state;
3761 int i, err; 4446 int i, j, err;
3762 4447
3763 sl = env->explored_states[insn_idx]; 4448 sl = env->explored_states[insn_idx];
3764 if (!sl) 4449 if (!sl)
@@ -3779,7 +4464,9 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
3779 * they'll be immediately forgotten as we're pruning 4464 * they'll be immediately forgotten as we're pruning
3780 * this state and will pop a new one. 4465 * this state and will pop a new one.
3781 */ 4466 */
3782 propagate_liveness(&sl->state, cur); 4467 err = propagate_liveness(env, &sl->state, cur);
4468 if (err)
4469 return err;
3783 return 1; 4470 return 1;
3784 } 4471 }
3785 sl = sl->next; 4472 sl = sl->next;
@@ -3787,9 +4474,10 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
3787 4474
3788 /* there were no equivalent states, remember current one. 4475 /* there were no equivalent states, remember current one.
3789 * technically the current state is not proven to be safe yet, 4476 * technically the current state is not proven to be safe yet,
3790 * but it will either reach bpf_exit (which means it's safe) or 4477 * but it will either reach the outermost bpf_exit (which means it's safe)
3791 * it will be rejected. Since there are no loops, we won't be 4478 * or it will be rejected. Since there are no loops, we won't be
3792 * seeing this 'insn_idx' instruction again on the way to bpf_exit 4479 * seeing this tuple (frame[0].callsite, frame[1].callsite, .. insn_idx)
4480 * again on the way to bpf_exit
3793 */ 4481 */
3794 new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL); 4482 new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL);
3795 if (!new_sl) 4483 if (!new_sl)
@@ -3813,19 +4501,15 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
3813 * explored_states can get read marks.) 4501 * explored_states can get read marks.)
3814 */ 4502 */
3815 for (i = 0; i < BPF_REG_FP; i++) 4503 for (i = 0; i < BPF_REG_FP; i++)
3816 cur->regs[i].live = REG_LIVE_NONE; 4504 cur->frame[cur->curframe]->regs[i].live = REG_LIVE_NONE;
3817 for (i = 0; i < cur->allocated_stack / BPF_REG_SIZE; i++)
3818 if (cur->stack[i].slot_type[0] == STACK_SPILL)
3819 cur->stack[i].spilled_ptr.live = REG_LIVE_NONE;
3820 return 0;
3821}
3822 4505
3823static int ext_analyzer_insn_hook(struct bpf_verifier_env *env, 4506 /* all stack frames are accessible from callee, clear them all */
3824 int insn_idx, int prev_insn_idx) 4507 for (j = 0; j <= cur->curframe; j++) {
3825{ 4508 struct bpf_func_state *frame = cur->frame[j];
3826 if (env->dev_ops && env->dev_ops->insn_hook)
3827 return env->dev_ops->insn_hook(env, insn_idx, prev_insn_idx);
3828 4509
4510 for (i = 0; i < frame->allocated_stack / BPF_REG_SIZE; i++)
4511 frame->stack[i].spilled_ptr.live = REG_LIVE_NONE;
4512 }
3829 return 0; 4513 return 0;
3830} 4514}
3831 4515
@@ -3834,7 +4518,7 @@ static int do_check(struct bpf_verifier_env *env)
3834 struct bpf_verifier_state *state; 4518 struct bpf_verifier_state *state;
3835 struct bpf_insn *insns = env->prog->insnsi; 4519 struct bpf_insn *insns = env->prog->insnsi;
3836 struct bpf_reg_state *regs; 4520 struct bpf_reg_state *regs;
3837 int insn_cnt = env->prog->len; 4521 int insn_cnt = env->prog->len, i;
3838 int insn_idx, prev_insn_idx = 0; 4522 int insn_idx, prev_insn_idx = 0;
3839 int insn_processed = 0; 4523 int insn_processed = 0;
3840 bool do_print_state = false; 4524 bool do_print_state = false;
@@ -3842,9 +4526,18 @@ static int do_check(struct bpf_verifier_env *env)
3842 state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL); 4526 state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL);
3843 if (!state) 4527 if (!state)
3844 return -ENOMEM; 4528 return -ENOMEM;
3845 env->cur_state = state; 4529 state->curframe = 0;
3846 init_reg_state(env, state->regs);
3847 state->parent = NULL; 4530 state->parent = NULL;
4531 state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL);
4532 if (!state->frame[0]) {
4533 kfree(state);
4534 return -ENOMEM;
4535 }
4536 env->cur_state = state;
4537 init_func_state(env, state->frame[0],
4538 BPF_MAIN_FUNC /* callsite */,
4539 0 /* frameno */,
4540 0 /* subprogno, zero == main subprog */);
3848 insn_idx = 0; 4541 insn_idx = 0;
3849 for (;;) { 4542 for (;;) {
3850 struct bpf_insn *insn; 4543 struct bpf_insn *insn;
@@ -3891,19 +4584,25 @@ static int do_check(struct bpf_verifier_env *env)
3891 else 4584 else
3892 verbose(env, "\nfrom %d to %d:", 4585 verbose(env, "\nfrom %d to %d:",
3893 prev_insn_idx, insn_idx); 4586 prev_insn_idx, insn_idx);
3894 print_verifier_state(env, state); 4587 print_verifier_state(env, state->frame[state->curframe]);
3895 do_print_state = false; 4588 do_print_state = false;
3896 } 4589 }
3897 4590
3898 if (env->log.level) { 4591 if (env->log.level) {
4592 const struct bpf_insn_cbs cbs = {
4593 .cb_print = verbose,
4594 };
4595
3899 verbose(env, "%d: ", insn_idx); 4596 verbose(env, "%d: ", insn_idx);
3900 print_bpf_insn(verbose, env, insn, 4597 print_bpf_insn(&cbs, env, insn, env->allow_ptr_leaks);
3901 env->allow_ptr_leaks);
3902 } 4598 }
3903 4599
3904 err = ext_analyzer_insn_hook(env, insn_idx, prev_insn_idx); 4600 if (bpf_prog_is_dev_bound(env->prog->aux)) {
3905 if (err) 4601 err = bpf_prog_offload_verify_insn(env, insn_idx,
3906 return err; 4602 prev_insn_idx);
4603 if (err)
4604 return err;
4605 }
3907 4606
3908 regs = cur_regs(env); 4607 regs = cur_regs(env);
3909 env->insn_aux_data[insn_idx].seen = true; 4608 env->insn_aux_data[insn_idx].seen = true;
@@ -4030,13 +4729,17 @@ static int do_check(struct bpf_verifier_env *env)
4030 if (opcode == BPF_CALL) { 4729 if (opcode == BPF_CALL) {
4031 if (BPF_SRC(insn->code) != BPF_K || 4730 if (BPF_SRC(insn->code) != BPF_K ||
4032 insn->off != 0 || 4731 insn->off != 0 ||
4033 insn->src_reg != BPF_REG_0 || 4732 (insn->src_reg != BPF_REG_0 &&
4733 insn->src_reg != BPF_PSEUDO_CALL) ||
4034 insn->dst_reg != BPF_REG_0) { 4734 insn->dst_reg != BPF_REG_0) {
4035 verbose(env, "BPF_CALL uses reserved fields\n"); 4735 verbose(env, "BPF_CALL uses reserved fields\n");
4036 return -EINVAL; 4736 return -EINVAL;
4037 } 4737 }
4038 4738
4039 err = check_call(env, insn->imm, insn_idx); 4739 if (insn->src_reg == BPF_PSEUDO_CALL)
4740 err = check_func_call(env, insn, &insn_idx);
4741 else
4742 err = check_helper_call(env, insn->imm, insn_idx);
4040 if (err) 4743 if (err)
4041 return err; 4744 return err;
4042 4745
@@ -4061,6 +4764,16 @@ static int do_check(struct bpf_verifier_env *env)
4061 return -EINVAL; 4764 return -EINVAL;
4062 } 4765 }
4063 4766
4767 if (state->curframe) {
4768 /* exit from nested function */
4769 prev_insn_idx = insn_idx;
4770 err = prepare_func_exit(env, &insn_idx);
4771 if (err)
4772 return err;
4773 do_print_state = true;
4774 continue;
4775 }
4776
4064 /* eBPF calling convention is such that R0 is used 4777
4065 * to return the value from eBPF program. 4778 * to return the value from eBPF program.
4066 * Make sure that it's readable at this time 4779 * Make sure that it's readable at this time
@@ -4121,8 +4834,17 @@ process_bpf_exit:
4121 insn_idx++; 4834 insn_idx++;
4122 } 4835 }
4123 4836
4124 verbose(env, "processed %d insns, stack depth %d\n", insn_processed, 4837 verbose(env, "processed %d insns (limit %d), stack depth ",
4125 env->prog->aux->stack_depth); 4838 insn_processed, BPF_COMPLEXITY_LIMIT_INSNS);
4839 for (i = 0; i < env->subprog_cnt + 1; i++) {
4840 u32 depth = env->subprog_stack_depth[i];
4841
4842 verbose(env, "%d", depth);
4843 if (i + 1 < env->subprog_cnt + 1)
4844 verbose(env, "+");
4845 }
4846 verbose(env, "\n");
4847 env->prog->aux->stack_depth = env->subprog_stack_depth[0];
4126 return 0; 4848 return 0;
4127} 4849}
4128 4850
@@ -4155,6 +4877,13 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
4155 return -EINVAL; 4877 return -EINVAL;
4156 } 4878 }
4157 } 4879 }
4880
4881 if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) &&
4882 !bpf_offload_dev_match(prog, map)) {
4883 verbose(env, "offload device mismatch between prog and map\n");
4884 return -EINVAL;
4885 }
4886
4158 return 0; 4887 return 0;
4159} 4888}
4160 4889
@@ -4252,6 +4981,13 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
4252next_insn: 4981next_insn:
4253 insn++; 4982 insn++;
4254 i++; 4983 i++;
4984 continue;
4985 }
4986
4987 /* Basic sanity check before we invest more work here. */
4988 if (!bpf_opcode_in_insntable(insn->code)) {
4989 verbose(env, "unknown opcode %02x\n", insn->code);
4990 return -EINVAL;
4255 } 4991 }
4256 } 4992 }
4257 4993
@@ -4308,6 +5044,19 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len,
4308 return 0; 5044 return 0;
4309} 5045}
4310 5046
5047static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len)
5048{
5049 int i;
5050
5051 if (len == 1)
5052 return;
5053 for (i = 0; i < env->subprog_cnt; i++) {
5054 if (env->subprog_starts[i] < off)
5055 continue;
5056 env->subprog_starts[i] += len - 1;
5057 }
5058}
5059
4311static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off, 5060static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off,
4312 const struct bpf_insn *patch, u32 len) 5061 const struct bpf_insn *patch, u32 len)
4313{ 5062{
@@ -4318,17 +5067,25 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
4318 return NULL; 5067 return NULL;
4319 if (adjust_insn_aux_data(env, new_prog->len, off, len)) 5068 if (adjust_insn_aux_data(env, new_prog->len, off, len))
4320 return NULL; 5069 return NULL;
5070 adjust_subprog_starts(env, off, len);
4321 return new_prog; 5071 return new_prog;
4322} 5072}
4323 5073
4324/* The verifier does more data flow analysis than llvm and will not explore 5074/* The verifier does more data flow analysis than llvm and will not
4325 * branches that are dead at run time. Malicious programs can have dead code 5075 * explore branches that are dead at run time. Malicious programs can
4326 * too. Therefore replace all dead at-run-time code with nops. 5076 * have dead code too. Therefore replace all dead at-run-time code
5077 * with 'ja -1'.
5078 *
5079 * Just nops are not optimal, e.g. if they would sit at the end of the
5080 * program and through another bug we would manage to jump there, then
5081 * we'd execute beyond program memory. Returning exception
5082 * code also wouldn't work since we can have subprogs where the dead
5083 * code could be located.
4327 */ 5084 */
4328static void sanitize_dead_code(struct bpf_verifier_env *env) 5085static void sanitize_dead_code(struct bpf_verifier_env *env)
4329{ 5086{
4330 struct bpf_insn_aux_data *aux_data = env->insn_aux_data; 5087 struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
4331 struct bpf_insn nop = BPF_MOV64_REG(BPF_REG_0, BPF_REG_0); 5088 struct bpf_insn trap = BPF_JMP_IMM(BPF_JA, 0, 0, -1);
4332 struct bpf_insn *insn = env->prog->insnsi; 5089 struct bpf_insn *insn = env->prog->insnsi;
4333 const int insn_cnt = env->prog->len; 5090 const int insn_cnt = env->prog->len;
4334 int i; 5091 int i;
@@ -4336,7 +5093,7 @@ static void sanitize_dead_code(struct bpf_verifier_env *env)
4336 for (i = 0; i < insn_cnt; i++) { 5093 for (i = 0; i < insn_cnt; i++) {
4337 if (aux_data[i].seen) 5094 if (aux_data[i].seen)
4338 continue; 5095 continue;
4339 memcpy(insn + i, &nop, sizeof(nop)); 5096 memcpy(insn + i, &trap, sizeof(trap));
4340 } 5097 }
4341} 5098}
4342 5099
@@ -4452,6 +5209,180 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
4452 return 0; 5209 return 0;
4453} 5210}
4454 5211
5212static int jit_subprogs(struct bpf_verifier_env *env)
5213{
5214 struct bpf_prog *prog = env->prog, **func, *tmp;
5215 int i, j, subprog_start, subprog_end = 0, len, subprog;
5216 struct bpf_insn *insn;
5217 void *old_bpf_func;
5218 int err = -ENOMEM;
5219
5220 if (env->subprog_cnt == 0)
5221 return 0;
5222
5223 for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
5224 if (insn->code != (BPF_JMP | BPF_CALL) ||
5225 insn->src_reg != BPF_PSEUDO_CALL)
5226 continue;
5227 subprog = find_subprog(env, i + insn->imm + 1);
5228 if (subprog < 0) {
5229 WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
5230 i + insn->imm + 1);
5231 return -EFAULT;
5232 }
5233 /* temporarily remember subprog id inside insn instead of
5234 * aux_data, since next loop will split up all insns into funcs
5235 */
5236 insn->off = subprog + 1;
5237 /* remember original imm in case JIT fails and fallback
5238 * to interpreter will be needed
5239 */
5240 env->insn_aux_data[i].call_imm = insn->imm;
5241 /* point imm to __bpf_call_base+1 from JITs point of view */
5242 insn->imm = 1;
5243 }
5244
5245 func = kzalloc(sizeof(prog) * (env->subprog_cnt + 1), GFP_KERNEL);
5246 if (!func)
5247 return -ENOMEM;
5248
5249 for (i = 0; i <= env->subprog_cnt; i++) {
5250 subprog_start = subprog_end;
5251 if (env->subprog_cnt == i)
5252 subprog_end = prog->len;
5253 else
5254 subprog_end = env->subprog_starts[i];
5255
5256 len = subprog_end - subprog_start;
5257 func[i] = bpf_prog_alloc(bpf_prog_size(len), GFP_USER);
5258 if (!func[i])
5259 goto out_free;
5260 memcpy(func[i]->insnsi, &prog->insnsi[subprog_start],
5261 len * sizeof(struct bpf_insn));
5262 func[i]->type = prog->type;
5263 func[i]->len = len;
5264 if (bpf_prog_calc_tag(func[i]))
5265 goto out_free;
5266 func[i]->is_func = 1;
5267 /* Use bpf_prog_F_tag to indicate functions in stack traces.
5268 * Long term would need debug info to populate names
5269 */
5270 func[i]->aux->name[0] = 'F';
5271 func[i]->aux->stack_depth = env->subprog_stack_depth[i];
5272 func[i]->jit_requested = 1;
5273 func[i] = bpf_int_jit_compile(func[i]);
5274 if (!func[i]->jited) {
5275 err = -ENOTSUPP;
5276 goto out_free;
5277 }
5278 cond_resched();
5279 }
5280 /* at this point all bpf functions were successfully JITed
5281 * now populate all bpf_calls with correct addresses and
5282 * run last pass of JIT
5283 */
5284 for (i = 0; i <= env->subprog_cnt; i++) {
5285 insn = func[i]->insnsi;
5286 for (j = 0; j < func[i]->len; j++, insn++) {
5287 if (insn->code != (BPF_JMP | BPF_CALL) ||
5288 insn->src_reg != BPF_PSEUDO_CALL)
5289 continue;
5290 subprog = insn->off;
5291 insn->off = 0;
5292 insn->imm = (u64 (*)(u64, u64, u64, u64, u64))
5293 func[subprog]->bpf_func -
5294 __bpf_call_base;
5295 }
5296 }
5297 for (i = 0; i <= env->subprog_cnt; i++) {
5298 old_bpf_func = func[i]->bpf_func;
5299 tmp = bpf_int_jit_compile(func[i]);
5300 if (tmp != func[i] || func[i]->bpf_func != old_bpf_func) {
5301 verbose(env, "JIT doesn't support bpf-to-bpf calls\n");
5302 err = -EFAULT;
5303 goto out_free;
5304 }
5305 cond_resched();
5306 }
5307
5308 /* finally lock prog and jit images for all functions and
5309 * populate kallsyms
5310 */
5311 for (i = 0; i <= env->subprog_cnt; i++) {
5312 bpf_prog_lock_ro(func[i]);
5313 bpf_prog_kallsyms_add(func[i]);
5314 }
5315
5316 /* Last step: make now unused interpreter insns from main
5317 * prog consistent for later dump requests, so they can
5318 * later look the same as if they were interpreted only.
5319 */
5320 for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
5321 unsigned long addr;
5322
5323 if (insn->code != (BPF_JMP | BPF_CALL) ||
5324 insn->src_reg != BPF_PSEUDO_CALL)
5325 continue;
5326 insn->off = env->insn_aux_data[i].call_imm;
5327 subprog = find_subprog(env, i + insn->off + 1);
5328 addr = (unsigned long)func[subprog + 1]->bpf_func;
5329 addr &= PAGE_MASK;
5330 insn->imm = (u64 (*)(u64, u64, u64, u64, u64))
5331 addr - __bpf_call_base;
5332 }
5333
5334 prog->jited = 1;
5335 prog->bpf_func = func[0]->bpf_func;
5336 prog->aux->func = func;
5337 prog->aux->func_cnt = env->subprog_cnt + 1;
5338 return 0;
5339out_free:
5340 for (i = 0; i <= env->subprog_cnt; i++)
5341 if (func[i])
5342 bpf_jit_free(func[i]);
5343 kfree(func);
5344 /* cleanup main prog to be interpreted */
5345 prog->jit_requested = 0;
5346 for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
5347 if (insn->code != (BPF_JMP | BPF_CALL) ||
5348 insn->src_reg != BPF_PSEUDO_CALL)
5349 continue;
5350 insn->off = 0;
5351 insn->imm = env->insn_aux_data[i].call_imm;
5352 }
5353 return err;
5354}
5355
5356static int fixup_call_args(struct bpf_verifier_env *env)
5357{
5358#ifndef CONFIG_BPF_JIT_ALWAYS_ON
5359 struct bpf_prog *prog = env->prog;
5360 struct bpf_insn *insn = prog->insnsi;
5361 int i, depth;
5362#endif
5363 int err;
5364
5365 err = 0;
5366 if (env->prog->jit_requested) {
5367 err = jit_subprogs(env);
5368 if (err == 0)
5369 return 0;
5370 }
5371#ifndef CONFIG_BPF_JIT_ALWAYS_ON
5372 for (i = 0; i < prog->len; i++, insn++) {
5373 if (insn->code != (BPF_JMP | BPF_CALL) ||
5374 insn->src_reg != BPF_PSEUDO_CALL)
5375 continue;
5376 depth = get_callee_stack_depth(env, insn, i);
5377 if (depth < 0)
5378 return depth;
5379 bpf_patch_call_args(insn, depth);
5380 }
5381 err = 0;
5382#endif
5383 return err;
5384}
5385
4455/* fixup insn->imm field of bpf_call instructions 5386/* fixup insn->imm field of bpf_call instructions
4456 * and inline eligible helpers as explicit sequence of BPF instructions 5387 * and inline eligible helpers as explicit sequence of BPF instructions
4457 * 5388 *
@@ -4469,15 +5400,37 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
4469 int i, cnt, delta = 0; 5400 int i, cnt, delta = 0;
4470 5401
4471 for (i = 0; i < insn_cnt; i++, insn++) { 5402 for (i = 0; i < insn_cnt; i++, insn++) {
4472 if (insn->code == (BPF_ALU | BPF_MOD | BPF_X) || 5403 if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) ||
5404 insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) ||
5405 insn->code == (BPF_ALU | BPF_MOD | BPF_X) ||
4473 insn->code == (BPF_ALU | BPF_DIV | BPF_X)) { 5406 insn->code == (BPF_ALU | BPF_DIV | BPF_X)) {
4474 /* due to JIT bugs clear upper 32-bits of src register 5407 bool is64 = BPF_CLASS(insn->code) == BPF_ALU64;
4475 * before div/mod operation 5408 struct bpf_insn mask_and_div[] = {
4476 */ 5409 BPF_MOV32_REG(insn->src_reg, insn->src_reg),
4477 insn_buf[0] = BPF_MOV32_REG(insn->src_reg, insn->src_reg); 5410 /* Rx div 0 -> 0 */
4478 insn_buf[1] = *insn; 5411 BPF_JMP_IMM(BPF_JNE, insn->src_reg, 0, 2),
4479 cnt = 2; 5412 BPF_ALU32_REG(BPF_XOR, insn->dst_reg, insn->dst_reg),
4480 new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); 5413 BPF_JMP_IMM(BPF_JA, 0, 0, 1),
5414 *insn,
5415 };
5416 struct bpf_insn mask_and_mod[] = {
5417 BPF_MOV32_REG(insn->src_reg, insn->src_reg),
5418 /* Rx mod 0 -> Rx */
5419 BPF_JMP_IMM(BPF_JEQ, insn->src_reg, 0, 1),
5420 *insn,
5421 };
5422 struct bpf_insn *patchlet;
5423
5424 if (insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) ||
5425 insn->code == (BPF_ALU | BPF_DIV | BPF_X)) {
5426 patchlet = mask_and_div + (is64 ? 1 : 0);
5427 cnt = ARRAY_SIZE(mask_and_div) - (is64 ? 1 : 0);
5428 } else {
5429 patchlet = mask_and_mod + (is64 ? 1 : 0);
5430 cnt = ARRAY_SIZE(mask_and_mod) - (is64 ? 1 : 0);
5431 }
5432
5433 new_prog = bpf_patch_insn_data(env, i + delta, patchlet, cnt);
4481 if (!new_prog) 5434 if (!new_prog)
4482 return -ENOMEM; 5435 return -ENOMEM;
4483 5436
@@ -4489,11 +5442,15 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
4489 5442
4490 if (insn->code != (BPF_JMP | BPF_CALL)) 5443 if (insn->code != (BPF_JMP | BPF_CALL))
4491 continue; 5444 continue;
5445 if (insn->src_reg == BPF_PSEUDO_CALL)
5446 continue;
4492 5447
4493 if (insn->imm == BPF_FUNC_get_route_realm) 5448 if (insn->imm == BPF_FUNC_get_route_realm)
4494 prog->dst_needed = 1; 5449 prog->dst_needed = 1;
4495 if (insn->imm == BPF_FUNC_get_prandom_u32) 5450 if (insn->imm == BPF_FUNC_get_prandom_u32)
4496 bpf_user_rnd_init_once(); 5451 bpf_user_rnd_init_once();
5452 if (insn->imm == BPF_FUNC_override_return)
5453 prog->kprobe_override = 1;
4497 if (insn->imm == BPF_FUNC_tail_call) { 5454 if (insn->imm == BPF_FUNC_tail_call) {
4498 /* If we tail call into other programs, we 5455 /* If we tail call into other programs, we
4499 * cannot make any assumptions since they can 5456 * cannot make any assumptions since they can
@@ -4545,7 +5502,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
4545 /* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup 5502 /* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup
4546 * handlers are currently limited to 64 bit only. 5503 * handlers are currently limited to 64 bit only.
4547 */ 5504 */
4548 if (ebpf_jit_enabled() && BITS_PER_LONG == 64 && 5505 if (prog->jit_requested && BITS_PER_LONG == 64 &&
4549 insn->imm == BPF_FUNC_map_lookup_elem) { 5506 insn->imm == BPF_FUNC_map_lookup_elem) {
4550 map_ptr = env->insn_aux_data[i + delta].map_ptr; 5507 map_ptr = env->insn_aux_data[i + delta].map_ptr;
4551 if (map_ptr == BPF_MAP_PTR_POISON || 5508 if (map_ptr == BPF_MAP_PTR_POISON ||
@@ -4680,7 +5637,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
4680 if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) 5637 if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
4681 env->strict_alignment = true; 5638 env->strict_alignment = true;
4682 5639
4683 if (env->prog->aux->offload) { 5640 if (bpf_prog_is_dev_bound(env->prog->aux)) {
4684 ret = bpf_prog_offload_verifier_prep(env); 5641 ret = bpf_prog_offload_verifier_prep(env);
4685 if (ret) 5642 if (ret)
4686 goto err_unlock; 5643 goto err_unlock;
@@ -4697,12 +5654,12 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
4697 if (!env->explored_states) 5654 if (!env->explored_states)
4698 goto skip_full_check; 5655 goto skip_full_check;
4699 5656
5657 env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
5658
4700 ret = check_cfg(env); 5659 ret = check_cfg(env);
4701 if (ret < 0) 5660 if (ret < 0)
4702 goto skip_full_check; 5661 goto skip_full_check;
4703 5662
4704 env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
4705
4706 ret = do_check(env); 5663 ret = do_check(env);
4707 if (env->cur_state) { 5664 if (env->cur_state) {
4708 free_verifier_state(env->cur_state, true); 5665 free_verifier_state(env->cur_state, true);
@@ -4717,12 +5674,18 @@ skip_full_check:
4717 sanitize_dead_code(env); 5674 sanitize_dead_code(env);
4718 5675
4719 if (ret == 0) 5676 if (ret == 0)
5677 ret = check_max_stack_depth(env);
5678
5679 if (ret == 0)
4720 /* program is valid, convert *(u32*)(ctx + off) accesses */ 5680 /* program is valid, convert *(u32*)(ctx + off) accesses */
4721 ret = convert_ctx_accesses(env); 5681 ret = convert_ctx_accesses(env);
4722 5682
4723 if (ret == 0) 5683 if (ret == 0)
4724 ret = fixup_bpf_calls(env); 5684 ret = fixup_bpf_calls(env);
4725 5685
5686 if (ret == 0)
5687 ret = fixup_call_args(env);
5688
4726 if (log->level && bpf_verifier_log_full(log)) 5689 if (log->level && bpf_verifier_log_full(log))
4727 ret = -ENOSPC; 5690 ret = -ENOSPC;
4728 if (log->level && !log->ubuf) { 5691 if (log->level && !log->ubuf) {
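
The mask_and_div / mask_and_mod patchlets above take divide-by-zero behaviour out of the JITs' hands: a register divisor of zero now yields 0 for BPF_DIV and leaves the destination untouched for BPF_MOD, with the ALU32 variants first zero-extending the source via the leading BPF_MOV32_REG. A minimal user-space C sketch of the resulting semantics (helper names are invented for illustration, not kernel APIs):

#include <stdint.h>

/* semantics enforced by the rewritten BPF_DIV with a register divisor */
static uint64_t bpf_div_x(uint64_t dst, uint64_t src, int is64)
{
	if (!is64)
		src = (uint32_t)src;		/* leading MOV32 zero-extends src */
	if (!src)
		return 0;			/* Rx div 0 -> 0 (dst is xor'ed with itself) */
	return is64 ? dst / src : (uint32_t)dst / (uint32_t)src;
}

/* semantics enforced by the rewritten BPF_MOD with a register divisor */
static uint64_t bpf_mod_x(uint64_t dst, uint64_t src, int is64)
{
	if (!is64)
		src = (uint32_t)src;
	if (!src)
		return dst;			/* Rx mod 0 -> Rx (the mod insn is skipped) */
	return is64 ? dst % src : (uint32_t)dst % (uint32_t)src;
}
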
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 7e4c44538119..8cda3bc3ae22 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1397,7 +1397,7 @@ static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
1397 cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name, 1397 cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
1398 cft->name); 1398 cft->name);
1399 else 1399 else
1400 strlcpy(buf, cft->name, CGROUP_FILE_NAME_MAX); 1400 strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
1401 return buf; 1401 return buf;
1402} 1402}
1403 1403
@@ -1864,9 +1864,9 @@ void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)
1864 1864
1865 root->flags = opts->flags; 1865 root->flags = opts->flags;
1866 if (opts->release_agent) 1866 if (opts->release_agent)
1867 strlcpy(root->release_agent_path, opts->release_agent, PATH_MAX); 1867 strscpy(root->release_agent_path, opts->release_agent, PATH_MAX);
1868 if (opts->name) 1868 if (opts->name)
1869 strlcpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN); 1869 strscpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN);
1870 if (opts->cpuset_clone_children) 1870 if (opts->cpuset_clone_children)
1871 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); 1871 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
1872} 1872}
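
The strlcpy() -> strscpy() conversions above tighten the truncation contract: strlcpy() returns strlen(src) and therefore always walks the entire source string, while strscpy() is bounded by the destination size and reports truncation as -E2BIG. A small in-kernel sketch of the calling pattern (the wrapper and its names are illustrative, not from this patch):

#include <linux/printk.h>
#include <linux/string.h>
#include <linux/types.h>

/* copy a name into a fixed-size buffer and warn if it did not fit */
static void example_copy_name(char *dst, size_t dst_size, const char *src)
{
	if (strscpy(dst, src, dst_size) < 0)	/* -E2BIG: truncated, dst still NUL-terminated */
		pr_warn("name '%s' truncated to %zu bytes\n", src, dst_size - 1);
}
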
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index c8146d53ca67..dbb0781a0533 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -2441,7 +2441,6 @@ static int kdb_kill(int argc, const char **argv)
2441 long sig, pid; 2441 long sig, pid;
2442 char *endp; 2442 char *endp;
2443 struct task_struct *p; 2443 struct task_struct *p;
2444 struct siginfo info;
2445 2444
2446 if (argc != 2) 2445 if (argc != 2)
2447 return KDB_ARGCOUNT; 2446 return KDB_ARGCOUNT;
@@ -2449,7 +2448,7 @@ static int kdb_kill(int argc, const char **argv)
2449 sig = simple_strtol(argv[1], &endp, 0); 2448 sig = simple_strtol(argv[1], &endp, 0);
2450 if (*endp) 2449 if (*endp)
2451 return KDB_BADINT; 2450 return KDB_BADINT;
2452 if (sig >= 0) { 2451 if ((sig >= 0) || !valid_signal(-sig)) {
2453 kdb_printf("Invalid signal parameter.<-signal>\n"); 2452 kdb_printf("Invalid signal parameter.<-signal>\n");
2454 return 0; 2453 return 0;
2455 } 2454 }
@@ -2470,12 +2469,7 @@ static int kdb_kill(int argc, const char **argv)
2470 return 0; 2469 return 0;
2471 } 2470 }
2472 p = p->group_leader; 2471 p = p->group_leader;
2473 info.si_signo = sig; 2472 kdb_send_sig(p, sig);
2474 info.si_errno = 0;
2475 info.si_code = SI_USER;
2476 info.si_pid = pid; /* same capabilities as process being signalled */
2477 info.si_uid = 0; /* kdb has root authority */
2478 kdb_send_sig_info(p, &info);
2479 return 0; 2473 return 0;
2480} 2474}
2481 2475
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index fc224fbcf954..1e5a502ba4a7 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -208,7 +208,7 @@ extern unsigned long kdb_task_state(const struct task_struct *p,
208extern void kdb_ps_suppressed(void); 208extern void kdb_ps_suppressed(void);
209extern void kdb_ps1(const struct task_struct *p); 209extern void kdb_ps1(const struct task_struct *p);
210extern void kdb_print_nameval(const char *name, unsigned long val); 210extern void kdb_print_nameval(const char *name, unsigned long val);
211extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info); 211extern void kdb_send_sig(struct task_struct *p, int sig);
212extern void kdb_meminfo_proc_show(void); 212extern void kdb_meminfo_proc_show(void);
213extern char *kdb_getstr(char *, size_t, const char *); 213extern char *kdb_getstr(char *, size_t, const char *);
214extern void kdb_gdb_state_pass(char *buf); 214extern void kdb_gdb_state_pass(char *buf);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index d99fe3fdec8a..f0549e79978b 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4520,11 +4520,11 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
4520 return ret; 4520 return ret;
4521} 4521}
4522 4522
4523static unsigned int perf_poll(struct file *file, poll_table *wait) 4523static __poll_t perf_poll(struct file *file, poll_table *wait)
4524{ 4524{
4525 struct perf_event *event = file->private_data; 4525 struct perf_event *event = file->private_data;
4526 struct ring_buffer *rb; 4526 struct ring_buffer *rb;
4527 unsigned int events = POLLHUP; 4527 __poll_t events = POLLHUP;
4528 4528
4529 poll_wait(file, &event->waitq, wait); 4529 poll_wait(file, &event->waitq, wait);
4530 4530
@@ -4732,6 +4732,9 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
4732 rcu_read_unlock(); 4732 rcu_read_unlock();
4733 return 0; 4733 return 0;
4734 } 4734 }
4735
4736 case PERF_EVENT_IOC_QUERY_BPF:
4737 return perf_event_query_prog_array(event, (void __user *)arg);
4735 default: 4738 default:
4736 return -ENOTTY; 4739 return -ENOTTY;
4737 } 4740 }
@@ -4913,6 +4916,7 @@ void perf_event_update_userpage(struct perf_event *event)
4913unlock: 4916unlock:
4914 rcu_read_unlock(); 4917 rcu_read_unlock();
4915} 4918}
4919EXPORT_SYMBOL_GPL(perf_event_update_userpage);
4916 4920
4917static int perf_mmap_fault(struct vm_fault *vmf) 4921static int perf_mmap_fault(struct vm_fault *vmf)
4918{ 4922{
@@ -8099,6 +8103,13 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
8099 return -EINVAL; 8103 return -EINVAL;
8100 } 8104 }
8101 8105
8106 /* Kprobe override only works for kprobes, not uprobes. */
8107 if (prog->kprobe_override &&
8108 !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) {
8109 bpf_prog_put(prog);
8110 return -EINVAL;
8111 }
8112
8102 if (is_tracepoint || is_syscall_tp) { 8113 if (is_tracepoint || is_syscall_tp) {
8103 int off = trace_event_get_offsets(event->tp_event); 8114 int off = trace_event_get_offsets(event->tp_event);
8104 8115
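
The new PERF_EVENT_IOC_QUERY_BPF case above lets user space list the BPF programs attached to a kprobe/tracepoint perf event. A hedged user-space sketch; the struct perf_event_query_bpf layout (ids_len, prog_cnt, ids[]) is recalled from the matching uapi change and should be treated as an assumption, as should the event-fd setup that is omitted:

#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/perf_event.h>	/* assumed to provide PERF_EVENT_IOC_QUERY_BPF */

/* fd: perf event file descriptor for a kprobe/tracepoint event (setup omitted) */
static void example_query_bpf(int fd)
{
	const unsigned int max_ids = 16;
	struct perf_event_query_bpf *query;
	unsigned int i;

	query = calloc(1, sizeof(*query) + max_ids * sizeof(query->ids[0]));
	if (!query)
		return;
	query->ids_len = max_ids;
	if (ioctl(fd, PERF_EVENT_IOC_QUERY_BPF, query) == 0) {
		for (i = 0; i < query->prog_cnt && i < max_ids; i++)
			printf("attached bpf prog id %u\n", query->ids[i]);
	}
	free(query);
}
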
diff --git a/kernel/fail_function.c b/kernel/fail_function.c
new file mode 100644
index 000000000000..21b0122cb39c
--- /dev/null
+++ b/kernel/fail_function.c
@@ -0,0 +1,349 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * fail_function.c: Function-based error injection
4 */
5#include <linux/error-injection.h>
6#include <linux/debugfs.h>
7#include <linux/fault-inject.h>
8#include <linux/kallsyms.h>
9#include <linux/kprobes.h>
10#include <linux/module.h>
11#include <linux/mutex.h>
12#include <linux/slab.h>
13#include <linux/uaccess.h>
14
15static int fei_kprobe_handler(struct kprobe *kp, struct pt_regs *regs);
16
17struct fei_attr {
18 struct list_head list;
19 struct kprobe kp;
20 unsigned long retval;
21};
22static DEFINE_MUTEX(fei_lock);
23static LIST_HEAD(fei_attr_list);
24static DECLARE_FAULT_ATTR(fei_fault_attr);
25static struct dentry *fei_debugfs_dir;
26
27static unsigned long adjust_error_retval(unsigned long addr, unsigned long retv)
28{
29 switch (get_injectable_error_type(addr)) {
30 case EI_ETYPE_NULL:
31 if (retv != 0)
32 return 0;
33 break;
34 case EI_ETYPE_ERRNO:
35 if (retv < (unsigned long)-MAX_ERRNO)
36 return (unsigned long)-EINVAL;
37 break;
38 case EI_ETYPE_ERRNO_NULL:
39 if (retv != 0 && retv < (unsigned long)-MAX_ERRNO)
40 return (unsigned long)-EINVAL;
41 break;
42 }
43
44 return retv;
45}
46
47static struct fei_attr *fei_attr_new(const char *sym, unsigned long addr)
48{
49 struct fei_attr *attr;
50
51 attr = kzalloc(sizeof(*attr), GFP_KERNEL);
52 if (attr) {
53 attr->kp.symbol_name = kstrdup(sym, GFP_KERNEL);
54 if (!attr->kp.symbol_name) {
55 kfree(attr);
56 return NULL;
57 }
58 attr->kp.pre_handler = fei_kprobe_handler;
59 attr->retval = adjust_error_retval(addr, 0);
60 INIT_LIST_HEAD(&attr->list);
61 }
62 return attr;
63}
64
65static void fei_attr_free(struct fei_attr *attr)
66{
67 if (attr) {
68 kfree(attr->kp.symbol_name);
69 kfree(attr);
70 }
71}
72
73static struct fei_attr *fei_attr_lookup(const char *sym)
74{
75 struct fei_attr *attr;
76
77 list_for_each_entry(attr, &fei_attr_list, list) {
78 if (!strcmp(attr->kp.symbol_name, sym))
79 return attr;
80 }
81
82 return NULL;
83}
84
85static bool fei_attr_is_valid(struct fei_attr *_attr)
86{
87 struct fei_attr *attr;
88
89 list_for_each_entry(attr, &fei_attr_list, list) {
90 if (attr == _attr)
91 return true;
92 }
93
94 return false;
95}
96
97static int fei_retval_set(void *data, u64 val)
98{
99 struct fei_attr *attr = data;
100 unsigned long retv = (unsigned long)val;
101 int err = 0;
102
103 mutex_lock(&fei_lock);
104 /*
105 * Since this operation can be done after retval file is removed,
106 * It is safer to check the attr is still valid before accessing
107 * its member.
108 */
109 if (!fei_attr_is_valid(attr)) {
110 err = -ENOENT;
111 goto out;
112 }
113
114 if (attr->kp.addr) {
115 if (adjust_error_retval((unsigned long)attr->kp.addr,
116 val) != retv)
117 err = -EINVAL;
118 }
119 if (!err)
120 attr->retval = val;
121out:
122 mutex_unlock(&fei_lock);
123
124 return err;
125}
126
127static int fei_retval_get(void *data, u64 *val)
128{
129 struct fei_attr *attr = data;
130 int err = 0;
131
132 mutex_lock(&fei_lock);
133 /* Here we also validate @attr to ensure it still exists. */
134 if (!fei_attr_is_valid(attr))
135 err = -ENOENT;
136 else
137 *val = attr->retval;
138 mutex_unlock(&fei_lock);
139
140 return err;
141}
142DEFINE_DEBUGFS_ATTRIBUTE(fei_retval_ops, fei_retval_get, fei_retval_set,
143 "%llx\n");
144
145static int fei_debugfs_add_attr(struct fei_attr *attr)
146{
147 struct dentry *dir;
148
149 dir = debugfs_create_dir(attr->kp.symbol_name, fei_debugfs_dir);
150 if (!dir)
151 return -ENOMEM;
152
153 if (!debugfs_create_file("retval", 0600, dir, attr, &fei_retval_ops)) {
154 debugfs_remove_recursive(dir);
155 return -ENOMEM;
156 }
157
158 return 0;
159}
160
161static void fei_debugfs_remove_attr(struct fei_attr *attr)
162{
163 struct dentry *dir;
164
165 dir = debugfs_lookup(attr->kp.symbol_name, fei_debugfs_dir);
166 if (dir)
167 debugfs_remove_recursive(dir);
168}
169
170static int fei_kprobe_handler(struct kprobe *kp, struct pt_regs *regs)
171{
172 struct fei_attr *attr = container_of(kp, struct fei_attr, kp);
173
174 if (should_fail(&fei_fault_attr, 1)) {
175 regs_set_return_value(regs, attr->retval);
176 override_function_with_return(regs);
177 /* Kprobe specific fixup */
178 reset_current_kprobe();
179 preempt_enable_no_resched();
180 return 1;
181 }
182
183 return 0;
184}
185NOKPROBE_SYMBOL(fei_kprobe_handler)
186
187static void *fei_seq_start(struct seq_file *m, loff_t *pos)
188{
189 mutex_lock(&fei_lock);
190 return seq_list_start(&fei_attr_list, *pos);
191}
192
193static void fei_seq_stop(struct seq_file *m, void *v)
194{
195 mutex_unlock(&fei_lock);
196}
197
198static void *fei_seq_next(struct seq_file *m, void *v, loff_t *pos)
199{
200 return seq_list_next(v, &fei_attr_list, pos);
201}
202
203static int fei_seq_show(struct seq_file *m, void *v)
204{
205 struct fei_attr *attr = list_entry(v, struct fei_attr, list);
206
207 seq_printf(m, "%pf\n", attr->kp.addr);
208 return 0;
209}
210
211static const struct seq_operations fei_seq_ops = {
212 .start = fei_seq_start,
213 .next = fei_seq_next,
214 .stop = fei_seq_stop,
215 .show = fei_seq_show,
216};
217
218static int fei_open(struct inode *inode, struct file *file)
219{
220 return seq_open(file, &fei_seq_ops);
221}
222
223static void fei_attr_remove(struct fei_attr *attr)
224{
225 fei_debugfs_remove_attr(attr);
226 unregister_kprobe(&attr->kp);
227 list_del(&attr->list);
228 fei_attr_free(attr);
229}
230
231static void fei_attr_remove_all(void)
232{
233 struct fei_attr *attr, *n;
234
235 list_for_each_entry_safe(attr, n, &fei_attr_list, list) {
236 fei_attr_remove(attr);
237 }
238}
239
240static ssize_t fei_write(struct file *file, const char __user *buffer,
241 size_t count, loff_t *ppos)
242{
243 struct fei_attr *attr;
244 unsigned long addr;
245 char *buf, *sym;
246 int ret;
247
248 /* cut off if it is too long */
249 if (count > KSYM_NAME_LEN)
250 count = KSYM_NAME_LEN;
251 buf = kmalloc(sizeof(char) * (count + 1), GFP_KERNEL);
252 if (!buf)
253 return -ENOMEM;
254
255 if (copy_from_user(buf, buffer, count)) {
256 ret = -EFAULT;
257 goto out;
258 }
259 buf[count] = '\0';
260 sym = strstrip(buf);
261
262 mutex_lock(&fei_lock);
263
264 /* Writing just spaces will remove all injection points */
265 if (sym[0] == '\0') {
266 fei_attr_remove_all();
267 ret = count;
268 goto out;
269 }
270 /* Writing !function will remove one injection point */
271 if (sym[0] == '!') {
272 attr = fei_attr_lookup(sym + 1);
273 if (!attr) {
274 ret = -ENOENT;
275 goto out;
276 }
277 fei_attr_remove(attr);
278 ret = count;
279 goto out;
280 }
281
282 addr = kallsyms_lookup_name(sym);
283 if (!addr) {
284 ret = -EINVAL;
285 goto out;
286 }
287 if (!within_error_injection_list(addr)) {
288 ret = -ERANGE;
289 goto out;
290 }
291 if (fei_attr_lookup(sym)) {
292 ret = -EBUSY;
293 goto out;
294 }
295 attr = fei_attr_new(sym, addr);
296 if (!attr) {
297 ret = -ENOMEM;
298 goto out;
299 }
300
301 ret = register_kprobe(&attr->kp);
302 if (!ret)
303 ret = fei_debugfs_add_attr(attr);
304 if (ret < 0)
305 fei_attr_remove(attr);
306 else {
307 list_add_tail(&attr->list, &fei_attr_list);
308 ret = count;
309 }
310out:
311 kfree(buf);
312 mutex_unlock(&fei_lock);
313 return ret;
314}
315
316static const struct file_operations fei_ops = {
317 .open = fei_open,
318 .read = seq_read,
319 .write = fei_write,
320 .llseek = seq_lseek,
321 .release = seq_release,
322};
323
324static int __init fei_debugfs_init(void)
325{
326 struct dentry *dir;
327
328 dir = fault_create_debugfs_attr("fail_function", NULL,
329 &fei_fault_attr);
330 if (IS_ERR(dir))
331 return PTR_ERR(dir);
332
333 /* the injectable attribute is just a symlink to error_injection/list */
334 if (!debugfs_create_symlink("injectable", dir,
335 "../error_injection/list"))
336 goto error;
337
338 if (!debugfs_create_file("inject", 0600, dir, NULL, &fei_ops))
339 goto error;
340
341 fei_debugfs_dir = dir;
342
343 return 0;
344error:
345 debugfs_remove_recursive(dir);
346 return -ENOMEM;
347}
348
349late_initcall(fei_debugfs_init);
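
fei_write() above only accepts symbols that pass within_error_injection_list(), so a function has to opt in before it can be failed. A hedged sketch of such an opt-in, assuming the ALLOW_ERROR_INJECTION() marker and header from the companion error-injection series (the target function is hypothetical):

#include <linux/errno.h>
#include <linux/error-injection.h>	/* assumed home of ALLOW_ERROR_INJECTION() */

/* hypothetical target: returns 0 or a negative errno, so the ERRNO class fits */
static int example_setup_hw(void)
{
	return 0;
}
ALLOW_ERROR_INJECTION(example_setup_hw, ERRNO);

With that marker in place, writing "example_setup_hw" to the inject file created by fei_debugfs_init() arms the kprobe, and adjust_error_retval() clamps any value written through the per-symbol retval file to a sane -errno.
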
diff --git a/kernel/fork.c b/kernel/fork.c
index 2295fc69717f..c7c112391d79 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -77,6 +77,7 @@
77#include <linux/blkdev.h> 77#include <linux/blkdev.h>
78#include <linux/fs_struct.h> 78#include <linux/fs_struct.h>
79#include <linux/magic.h> 79#include <linux/magic.h>
80#include <linux/sched/mm.h>
80#include <linux/perf_event.h> 81#include <linux/perf_event.h>
81#include <linux/posix-timers.h> 82#include <linux/posix-timers.h>
82#include <linux/user-return-notifier.h> 83#include <linux/user-return-notifier.h>
@@ -282,8 +283,9 @@ static void free_thread_stack(struct task_struct *tsk)
282 283
283void thread_stack_cache_init(void) 284void thread_stack_cache_init(void)
284{ 285{
285 thread_stack_cache = kmem_cache_create("thread_stack", THREAD_SIZE, 286 thread_stack_cache = kmem_cache_create_usercopy("thread_stack",
286 THREAD_SIZE, 0, NULL); 287 THREAD_SIZE, THREAD_SIZE, 0, 0,
288 THREAD_SIZE, NULL);
287 BUG_ON(thread_stack_cache == NULL); 289 BUG_ON(thread_stack_cache == NULL);
288} 290}
289# endif 291# endif
@@ -390,6 +392,246 @@ void free_task(struct task_struct *tsk)
390} 392}
391EXPORT_SYMBOL(free_task); 393EXPORT_SYMBOL(free_task);
392 394
395#ifdef CONFIG_MMU
396static __latent_entropy int dup_mmap(struct mm_struct *mm,
397 struct mm_struct *oldmm)
398{
399 struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
400 struct rb_node **rb_link, *rb_parent;
401 int retval;
402 unsigned long charge;
403 LIST_HEAD(uf);
404
405 uprobe_start_dup_mmap();
406 if (down_write_killable(&oldmm->mmap_sem)) {
407 retval = -EINTR;
408 goto fail_uprobe_end;
409 }
410 flush_cache_dup_mm(oldmm);
411 uprobe_dup_mmap(oldmm, mm);
412 /*
413 * Not linked in yet - no deadlock potential:
414 */
415 down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
416
417 /* No ordering required: file already has been exposed. */
418 RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
419
420 mm->total_vm = oldmm->total_vm;
421 mm->data_vm = oldmm->data_vm;
422 mm->exec_vm = oldmm->exec_vm;
423 mm->stack_vm = oldmm->stack_vm;
424
425 rb_link = &mm->mm_rb.rb_node;
426 rb_parent = NULL;
427 pprev = &mm->mmap;
428 retval = ksm_fork(mm, oldmm);
429 if (retval)
430 goto out;
431 retval = khugepaged_fork(mm, oldmm);
432 if (retval)
433 goto out;
434
435 prev = NULL;
436 for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
437 struct file *file;
438
439 if (mpnt->vm_flags & VM_DONTCOPY) {
440 vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
441 continue;
442 }
443 charge = 0;
444 if (mpnt->vm_flags & VM_ACCOUNT) {
445 unsigned long len = vma_pages(mpnt);
446
447 if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
448 goto fail_nomem;
449 charge = len;
450 }
451 tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
452 if (!tmp)
453 goto fail_nomem;
454 *tmp = *mpnt;
455 INIT_LIST_HEAD(&tmp->anon_vma_chain);
456 retval = vma_dup_policy(mpnt, tmp);
457 if (retval)
458 goto fail_nomem_policy;
459 tmp->vm_mm = mm;
460 retval = dup_userfaultfd(tmp, &uf);
461 if (retval)
462 goto fail_nomem_anon_vma_fork;
463 if (tmp->vm_flags & VM_WIPEONFORK) {
464 /* VM_WIPEONFORK gets a clean slate in the child. */
465 tmp->anon_vma = NULL;
466 if (anon_vma_prepare(tmp))
467 goto fail_nomem_anon_vma_fork;
468 } else if (anon_vma_fork(tmp, mpnt))
469 goto fail_nomem_anon_vma_fork;
470 tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
471 tmp->vm_next = tmp->vm_prev = NULL;
472 file = tmp->vm_file;
473 if (file) {
474 struct inode *inode = file_inode(file);
475 struct address_space *mapping = file->f_mapping;
476
477 get_file(file);
478 if (tmp->vm_flags & VM_DENYWRITE)
479 atomic_dec(&inode->i_writecount);
480 i_mmap_lock_write(mapping);
481 if (tmp->vm_flags & VM_SHARED)
482 atomic_inc(&mapping->i_mmap_writable);
483 flush_dcache_mmap_lock(mapping);
484 /* insert tmp into the share list, just after mpnt */
485 vma_interval_tree_insert_after(tmp, mpnt,
486 &mapping->i_mmap);
487 flush_dcache_mmap_unlock(mapping);
488 i_mmap_unlock_write(mapping);
489 }
490
491 /*
492 * Clear hugetlb-related page reserves for children. This only
493 * affects MAP_PRIVATE mappings. Faults generated by the child
494 * are not guaranteed to succeed, even if read-only
495 */
496 if (is_vm_hugetlb_page(tmp))
497 reset_vma_resv_huge_pages(tmp);
498
499 /*
500 * Link in the new vma and copy the page table entries.
501 */
502 *pprev = tmp;
503 pprev = &tmp->vm_next;
504 tmp->vm_prev = prev;
505 prev = tmp;
506
507 __vma_link_rb(mm, tmp, rb_link, rb_parent);
508 rb_link = &tmp->vm_rb.rb_right;
509 rb_parent = &tmp->vm_rb;
510
511 mm->map_count++;
512 if (!(tmp->vm_flags & VM_WIPEONFORK))
513 retval = copy_page_range(mm, oldmm, mpnt);
514
515 if (tmp->vm_ops && tmp->vm_ops->open)
516 tmp->vm_ops->open(tmp);
517
518 if (retval)
519 goto out;
520 }
521 /* a new mm has just been created */
522 arch_dup_mmap(oldmm, mm);
523 retval = 0;
524out:
525 up_write(&mm->mmap_sem);
526 flush_tlb_mm(oldmm);
527 up_write(&oldmm->mmap_sem);
528 dup_userfaultfd_complete(&uf);
529fail_uprobe_end:
530 uprobe_end_dup_mmap();
531 return retval;
532fail_nomem_anon_vma_fork:
533 mpol_put(vma_policy(tmp));
534fail_nomem_policy:
535 kmem_cache_free(vm_area_cachep, tmp);
536fail_nomem:
537 retval = -ENOMEM;
538 vm_unacct_memory(charge);
539 goto out;
540}
541
542static inline int mm_alloc_pgd(struct mm_struct *mm)
543{
544 mm->pgd = pgd_alloc(mm);
545 if (unlikely(!mm->pgd))
546 return -ENOMEM;
547 return 0;
548}
549
550static inline void mm_free_pgd(struct mm_struct *mm)
551{
552 pgd_free(mm, mm->pgd);
553}
554#else
555static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
556{
557 down_write(&oldmm->mmap_sem);
558 RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
559 up_write(&oldmm->mmap_sem);
560 return 0;
561}
562#define mm_alloc_pgd(mm) (0)
563#define mm_free_pgd(mm)
564#endif /* CONFIG_MMU */
565
566static void check_mm(struct mm_struct *mm)
567{
568 int i;
569
570 for (i = 0; i < NR_MM_COUNTERS; i++) {
571 long x = atomic_long_read(&mm->rss_stat.count[i]);
572
573 if (unlikely(x))
574 printk(KERN_ALERT "BUG: Bad rss-counter state "
575 "mm:%p idx:%d val:%ld\n", mm, i, x);
576 }
577
578 if (mm_pgtables_bytes(mm))
579 pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
580 mm_pgtables_bytes(mm));
581
582#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
583 VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
584#endif
585}
586
587#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
588#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm)))
589
590/*
591 * Called when the last reference to the mm
592 * is dropped: either by a lazy thread or by
593 * mmput. Free the page directory and the mm.
594 */
595static void __mmdrop(struct mm_struct *mm)
596{
597 BUG_ON(mm == &init_mm);
598 mm_free_pgd(mm);
599 destroy_context(mm);
600 hmm_mm_destroy(mm);
601 mmu_notifier_mm_destroy(mm);
602 check_mm(mm);
603 put_user_ns(mm->user_ns);
604 free_mm(mm);
605}
606
607void mmdrop(struct mm_struct *mm)
608{
609 /*
610 * The implicit full barrier implied by atomic_dec_and_test() is
611 * required by the membarrier system call before returning to
612 * user-space, after storing to rq->curr.
613 */
614 if (unlikely(atomic_dec_and_test(&mm->mm_count)))
615 __mmdrop(mm);
616}
617EXPORT_SYMBOL_GPL(mmdrop);
618
619static void mmdrop_async_fn(struct work_struct *work)
620{
621 struct mm_struct *mm;
622
623 mm = container_of(work, struct mm_struct, async_put_work);
624 __mmdrop(mm);
625}
626
627static void mmdrop_async(struct mm_struct *mm)
628{
629 if (unlikely(atomic_dec_and_test(&mm->mm_count))) {
630 INIT_WORK(&mm->async_put_work, mmdrop_async_fn);
631 schedule_work(&mm->async_put_work);
632 }
633}
634
393static inline void free_signal_struct(struct signal_struct *sig) 635static inline void free_signal_struct(struct signal_struct *sig)
394{ 636{
395 taskstats_tgid_free(sig); 637 taskstats_tgid_free(sig);
@@ -457,6 +699,21 @@ static void set_max_threads(unsigned int max_threads_suggested)
457int arch_task_struct_size __read_mostly; 699int arch_task_struct_size __read_mostly;
458#endif 700#endif
459 701
702static void task_struct_whitelist(unsigned long *offset, unsigned long *size)
703{
704 /* Fetch thread_struct whitelist for the architecture. */
705 arch_thread_struct_whitelist(offset, size);
706
707 /*
708 * Handle zero-sized whitelist or empty thread_struct, otherwise
709 * adjust offset to position of thread_struct in task_struct.
710 */
711 if (unlikely(*size == 0))
712 *offset = 0;
713 else
714 *offset += offsetof(struct task_struct, thread);
715}
716
460void __init fork_init(void) 717void __init fork_init(void)
461{ 718{
462 int i; 719 int i;
@@ -465,11 +722,14 @@ void __init fork_init(void)
465#define ARCH_MIN_TASKALIGN 0 722#define ARCH_MIN_TASKALIGN 0
466#endif 723#endif
467 int align = max_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN); 724 int align = max_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN);
725 unsigned long useroffset, usersize;
468 726
469 /* create a slab on which task_structs can be allocated */ 727 /* create a slab on which task_structs can be allocated */
470 task_struct_cachep = kmem_cache_create("task_struct", 728 task_struct_whitelist(&useroffset, &usersize);
729 task_struct_cachep = kmem_cache_create_usercopy("task_struct",
471 arch_task_struct_size, align, 730 arch_task_struct_size, align,
472 SLAB_PANIC|SLAB_ACCOUNT, NULL); 731 SLAB_PANIC|SLAB_ACCOUNT,
732 useroffset, usersize, NULL);
473#endif 733#endif
474 734
475 /* do the arch specific task caches init */ 735 /* do the arch specific task caches init */
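
fork_init() above switches the task_struct cache to kmem_cache_create_usercopy(), whitelisting only the thread_struct region for user copies. The same pattern for an invented cache, as a hedged sketch (the struct and its fields are hypothetical; FIELD_SIZEOF() is assumed available in this tree):

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/stddef.h>

struct example_obj {
	spinlock_t lock;	/* kernel-only state, never copied to user space */
	char user_blob[64];	/* the only region copy_{to,from}_user() may touch */
};

static struct kmem_cache *example_cachep;

static int __init example_cache_init(void)
{
	example_cachep = kmem_cache_create_usercopy("example_obj",
			sizeof(struct example_obj), 0,
			SLAB_HWCACHE_ALIGN | SLAB_PANIC,
			offsetof(struct example_obj, user_blob),
			FIELD_SIZEOF(struct example_obj, user_blob),
			NULL);
	return 0;
}
core_initcall(example_cache_init);

Hardened usercopy will then reject any user copy into this cache's objects that strays outside user_blob, which is what the task_struct whitelist does for the thread_struct member.
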
@@ -594,181 +854,8 @@ free_tsk:
594 return NULL; 854 return NULL;
595} 855}
596 856
597#ifdef CONFIG_MMU
598static __latent_entropy int dup_mmap(struct mm_struct *mm,
599 struct mm_struct *oldmm)
600{
601 struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
602 struct rb_node **rb_link, *rb_parent;
603 int retval;
604 unsigned long charge;
605 LIST_HEAD(uf);
606
607 uprobe_start_dup_mmap();
608 if (down_write_killable(&oldmm->mmap_sem)) {
609 retval = -EINTR;
610 goto fail_uprobe_end;
611 }
612 flush_cache_dup_mm(oldmm);
613 uprobe_dup_mmap(oldmm, mm);
614 /*
615 * Not linked in yet - no deadlock potential:
616 */
617 down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
618
619 /* No ordering required: file already has been exposed. */
620 RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
621
622 mm->total_vm = oldmm->total_vm;
623 mm->data_vm = oldmm->data_vm;
624 mm->exec_vm = oldmm->exec_vm;
625 mm->stack_vm = oldmm->stack_vm;
626
627 rb_link = &mm->mm_rb.rb_node;
628 rb_parent = NULL;
629 pprev = &mm->mmap;
630 retval = ksm_fork(mm, oldmm);
631 if (retval)
632 goto out;
633 retval = khugepaged_fork(mm, oldmm);
634 if (retval)
635 goto out;
636
637 prev = NULL;
638 for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
639 struct file *file;
640
641 if (mpnt->vm_flags & VM_DONTCOPY) {
642 vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
643 continue;
644 }
645 charge = 0;
646 if (mpnt->vm_flags & VM_ACCOUNT) {
647 unsigned long len = vma_pages(mpnt);
648
649 if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
650 goto fail_nomem;
651 charge = len;
652 }
653 tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
654 if (!tmp)
655 goto fail_nomem;
656 *tmp = *mpnt;
657 INIT_LIST_HEAD(&tmp->anon_vma_chain);
658 retval = vma_dup_policy(mpnt, tmp);
659 if (retval)
660 goto fail_nomem_policy;
661 tmp->vm_mm = mm;
662 retval = dup_userfaultfd(tmp, &uf);
663 if (retval)
664 goto fail_nomem_anon_vma_fork;
665 if (tmp->vm_flags & VM_WIPEONFORK) {
666 /* VM_WIPEONFORK gets a clean slate in the child. */
667 tmp->anon_vma = NULL;
668 if (anon_vma_prepare(tmp))
669 goto fail_nomem_anon_vma_fork;
670 } else if (anon_vma_fork(tmp, mpnt))
671 goto fail_nomem_anon_vma_fork;
672 tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
673 tmp->vm_next = tmp->vm_prev = NULL;
674 file = tmp->vm_file;
675 if (file) {
676 struct inode *inode = file_inode(file);
677 struct address_space *mapping = file->f_mapping;
678
679 get_file(file);
680 if (tmp->vm_flags & VM_DENYWRITE)
681 atomic_dec(&inode->i_writecount);
682 i_mmap_lock_write(mapping);
683 if (tmp->vm_flags & VM_SHARED)
684 atomic_inc(&mapping->i_mmap_writable);
685 flush_dcache_mmap_lock(mapping);
686 /* insert tmp into the share list, just after mpnt */
687 vma_interval_tree_insert_after(tmp, mpnt,
688 &mapping->i_mmap);
689 flush_dcache_mmap_unlock(mapping);
690 i_mmap_unlock_write(mapping);
691 }
692
693 /*
694 * Clear hugetlb-related page reserves for children. This only
695 * affects MAP_PRIVATE mappings. Faults generated by the child
696 * are not guaranteed to succeed, even if read-only
697 */
698 if (is_vm_hugetlb_page(tmp))
699 reset_vma_resv_huge_pages(tmp);
700
701 /*
702 * Link in the new vma and copy the page table entries.
703 */
704 *pprev = tmp;
705 pprev = &tmp->vm_next;
706 tmp->vm_prev = prev;
707 prev = tmp;
708
709 __vma_link_rb(mm, tmp, rb_link, rb_parent);
710 rb_link = &tmp->vm_rb.rb_right;
711 rb_parent = &tmp->vm_rb;
712
713 mm->map_count++;
714 if (!(tmp->vm_flags & VM_WIPEONFORK))
715 retval = copy_page_range(mm, oldmm, mpnt);
716
717 if (tmp->vm_ops && tmp->vm_ops->open)
718 tmp->vm_ops->open(tmp);
719
720 if (retval)
721 goto out;
722 }
723 /* a new mm has just been created */
724 retval = arch_dup_mmap(oldmm, mm);
725out:
726 up_write(&mm->mmap_sem);
727 flush_tlb_mm(oldmm);
728 up_write(&oldmm->mmap_sem);
729 dup_userfaultfd_complete(&uf);
730fail_uprobe_end:
731 uprobe_end_dup_mmap();
732 return retval;
733fail_nomem_anon_vma_fork:
734 mpol_put(vma_policy(tmp));
735fail_nomem_policy:
736 kmem_cache_free(vm_area_cachep, tmp);
737fail_nomem:
738 retval = -ENOMEM;
739 vm_unacct_memory(charge);
740 goto out;
741}
742
743static inline int mm_alloc_pgd(struct mm_struct *mm)
744{
745 mm->pgd = pgd_alloc(mm);
746 if (unlikely(!mm->pgd))
747 return -ENOMEM;
748 return 0;
749}
750
751static inline void mm_free_pgd(struct mm_struct *mm)
752{
753 pgd_free(mm, mm->pgd);
754}
755#else
756static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
757{
758 down_write(&oldmm->mmap_sem);
759 RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
760 up_write(&oldmm->mmap_sem);
761 return 0;
762}
763#define mm_alloc_pgd(mm) (0)
764#define mm_free_pgd(mm)
765#endif /* CONFIG_MMU */
766
767__cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock); 857__cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
768 858
769#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
770#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm)))
771
772static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT; 859static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT;
773 860
774static int __init coredump_filter_setup(char *s) 861static int __init coredump_filter_setup(char *s)
@@ -858,27 +945,6 @@ fail_nopgd:
858 return NULL; 945 return NULL;
859} 946}
860 947
861static void check_mm(struct mm_struct *mm)
862{
863 int i;
864
865 for (i = 0; i < NR_MM_COUNTERS; i++) {
866 long x = atomic_long_read(&mm->rss_stat.count[i]);
867
868 if (unlikely(x))
869 printk(KERN_ALERT "BUG: Bad rss-counter state "
870 "mm:%p idx:%d val:%ld\n", mm, i, x);
871 }
872
873 if (mm_pgtables_bytes(mm))
874 pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
875 mm_pgtables_bytes(mm));
876
877#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
878 VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
879#endif
880}
881
882/* 948/*
883 * Allocate and initialize an mm_struct. 949 * Allocate and initialize an mm_struct.
884 */ 950 */
@@ -894,24 +960,6 @@ struct mm_struct *mm_alloc(void)
894 return mm_init(mm, current, current_user_ns()); 960 return mm_init(mm, current, current_user_ns());
895} 961}
896 962
897/*
898 * Called when the last reference to the mm
899 * is dropped: either by a lazy thread or by
900 * mmput. Free the page directory and the mm.
901 */
902void __mmdrop(struct mm_struct *mm)
903{
904 BUG_ON(mm == &init_mm);
905 mm_free_pgd(mm);
906 destroy_context(mm);
907 hmm_mm_destroy(mm);
908 mmu_notifier_mm_destroy(mm);
909 check_mm(mm);
910 put_user_ns(mm->user_ns);
911 free_mm(mm);
912}
913EXPORT_SYMBOL_GPL(__mmdrop);
914
915static inline void __mmput(struct mm_struct *mm) 963static inline void __mmput(struct mm_struct *mm)
916{ 964{
917 VM_BUG_ON(atomic_read(&mm->mm_users)); 965 VM_BUG_ON(atomic_read(&mm->mm_users));
@@ -2224,9 +2272,11 @@ void __init proc_caches_init(void)
2224 * maximum number of CPU's we can ever have. The cpumask_allocation 2272 * maximum number of CPU's we can ever have. The cpumask_allocation
2225 * is at the end of the structure, exactly for that reason. 2273 * is at the end of the structure, exactly for that reason.
2226 */ 2274 */
2227 mm_cachep = kmem_cache_create("mm_struct", 2275 mm_cachep = kmem_cache_create_usercopy("mm_struct",
2228 sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, 2276 sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
2229 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, 2277 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
2278 offsetof(struct mm_struct, saved_auxv),
2279 sizeof_field(struct mm_struct, saved_auxv),
2230 NULL); 2280 NULL);
2231 vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT); 2281 vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
2232 mmap_init(); 2282 mmap_init();
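
The mm_struct cache above is switched to kmem_cache_create_usercopy(), whose two new arguments in this hunk (offsetof/sizeof_field of saved_auxv) bound the only region of the structure that hardened usercopy will allow to be copied to or from user space. A minimal userspace sketch of that offset/size windowing follows; struct demo_mm and copy_allowed() are hypothetical stand-ins for illustration, not the kernel layout or API:

#include <stdio.h>
#include <stddef.h>

/* Hypothetical stand-in for mm_struct; only saved_auxv is user-copyable. */
struct demo_mm {
	unsigned long total_vm;
	unsigned long data_vm;
	unsigned long saved_auxv[46];
	unsigned long exec_vm;
};

#define sizeof_field(T, f)	sizeof(((T *)0)->f)

/* Allow a copy only if it is fully contained in the whitelisted window. */
static int copy_allowed(size_t off, size_t len)
{
	size_t wl_off = offsetof(struct demo_mm, saved_auxv);
	size_t wl_len = sizeof_field(struct demo_mm, saved_auxv);

	return off >= wl_off && len <= wl_len && off - wl_off <= wl_len - len;
}

int main(void)
{
	printf("whole saved_auxv ok: %d\n",
	       copy_allowed(offsetof(struct demo_mm, saved_auxv),
			    sizeof_field(struct demo_mm, saved_auxv)));
	printf("total_vm ok: %d\n",
	       copy_allowed(offsetof(struct demo_mm, total_vm),
			    sizeof(unsigned long)));
	return 0;
}

The (useroffset, usersize) pair recorded by kmem_cache_create_usercopy() is exactly this window; everything else in mm_struct stays off limits to usercopy.
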
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 4e8089b319ae..8c82ea26e837 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -71,7 +71,7 @@ unsigned long probe_irq_on(void)
71 raw_spin_lock_irq(&desc->lock); 71 raw_spin_lock_irq(&desc->lock);
72 if (!desc->action && irq_settings_can_probe(desc)) { 72 if (!desc->action && irq_settings_can_probe(desc)) {
73 desc->istate |= IRQS_AUTODETECT | IRQS_WAITING; 73 desc->istate |= IRQS_AUTODETECT | IRQS_WAITING;
74 if (irq_startup(desc, IRQ_NORESEND, IRQ_START_FORCE)) 74 if (irq_activate_and_startup(desc, IRQ_NORESEND))
75 desc->istate |= IRQS_PENDING; 75 desc->istate |= IRQS_PENDING;
76 } 76 }
77 raw_spin_unlock_irq(&desc->lock); 77 raw_spin_unlock_irq(&desc->lock);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 043bfc35b353..c69357a43849 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -294,11 +294,11 @@ int irq_activate(struct irq_desc *desc)
294 return 0; 294 return 0;
295} 295}
296 296
297void irq_activate_and_startup(struct irq_desc *desc, bool resend) 297int irq_activate_and_startup(struct irq_desc *desc, bool resend)
298{ 298{
299 if (WARN_ON(irq_activate(desc))) 299 if (WARN_ON(irq_activate(desc)))
300 return; 300 return 0;
301 irq_startup(desc, resend, IRQ_START_FORCE); 301 return irq_startup(desc, resend, IRQ_START_FORCE);
302} 302}
303 303
304static void __irq_disable(struct irq_desc *desc, bool mask); 304static void __irq_disable(struct irq_desc *desc, bool mask);
diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h
index e4d3819a91cc..8ccb326d2977 100644
--- a/kernel/irq/debug.h
+++ b/kernel/irq/debug.h
@@ -3,8 +3,6 @@
3 * Debugging printout: 3 * Debugging printout:
4 */ 4 */
5 5
6#include <linux/kallsyms.h>
7
8#define ___P(f) if (desc->status_use_accessors & f) printk("%14s set\n", #f) 6#define ___P(f) if (desc->status_use_accessors & f) printk("%14s set\n", #f)
9#define ___PS(f) if (desc->istate & f) printk("%14s set\n", #f) 7#define ___PS(f) if (desc->istate & f) printk("%14s set\n", #f)
10/* FIXME */ 8/* FIXME */
@@ -19,14 +17,14 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
19 17
20 printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n", 18 printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n",
21 irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled); 19 irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled);
22 printk("->handle_irq(): %p, ", desc->handle_irq); 20 printk("->handle_irq(): %p, %pS\n",
23 print_symbol("%s\n", (unsigned long)desc->handle_irq); 21 desc->handle_irq, desc->handle_irq);
24 printk("->irq_data.chip(): %p, ", desc->irq_data.chip); 22 printk("->irq_data.chip(): %p, %pS\n",
25 print_symbol("%s\n", (unsigned long)desc->irq_data.chip); 23 desc->irq_data.chip, desc->irq_data.chip);
26 printk("->action(): %p\n", desc->action); 24 printk("->action(): %p\n", desc->action);
27 if (desc->action) { 25 if (desc->action) {
28 printk("->action->handler(): %p, ", desc->action->handler); 26 printk("->action->handler(): %p, %pS\n",
29 print_symbol("%s\n", (unsigned long)desc->action->handler); 27 desc->action->handler, desc->action->handler);
30 } 28 }
31 29
32 ___P(IRQ_LEVEL); 30 ___P(IRQ_LEVEL);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index ab19371eab9b..ca6afa267070 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -76,7 +76,7 @@ extern void __enable_irq(struct irq_desc *desc);
76#define IRQ_START_COND false 76#define IRQ_START_COND false
77 77
78extern int irq_activate(struct irq_desc *desc); 78extern int irq_activate(struct irq_desc *desc);
79extern void irq_activate_and_startup(struct irq_desc *desc, bool resend); 79extern int irq_activate_and_startup(struct irq_desc *desc, bool resend);
80extern int irq_startup(struct irq_desc *desc, bool resend, bool force); 80extern int irq_startup(struct irq_desc *desc, bool resend, bool force);
81 81
82extern void irq_shutdown(struct irq_desc *desc); 82extern void irq_shutdown(struct irq_desc *desc);
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index d5fa4116688a..a23e21ada81b 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -12,7 +12,6 @@
12 * compression (see scripts/kallsyms.c for a more complete description) 12 * compression (see scripts/kallsyms.c for a more complete description)
13 */ 13 */
14#include <linux/kallsyms.h> 14#include <linux/kallsyms.h>
15#include <linux/module.h>
16#include <linux/init.h> 15#include <linux/init.h>
17#include <linux/seq_file.h> 16#include <linux/seq_file.h>
18#include <linux/fs.h> 17#include <linux/fs.h>
@@ -20,15 +19,12 @@
20#include <linux/err.h> 19#include <linux/err.h>
21#include <linux/proc_fs.h> 20#include <linux/proc_fs.h>
22#include <linux/sched.h> /* for cond_resched */ 21#include <linux/sched.h> /* for cond_resched */
23#include <linux/mm.h>
24#include <linux/ctype.h> 22#include <linux/ctype.h>
25#include <linux/slab.h> 23#include <linux/slab.h>
26#include <linux/filter.h> 24#include <linux/filter.h>
27#include <linux/ftrace.h> 25#include <linux/ftrace.h>
28#include <linux/compiler.h> 26#include <linux/compiler.h>
29 27
30#include <asm/sections.h>
31
32/* 28/*
33 * These will be re-linked against their real values 29 * These will be re-linked against their real values
34 * during the second link stage. 30 * during the second link stage.
@@ -52,37 +48,6 @@ extern const u16 kallsyms_token_index[] __weak;
52 48
53extern const unsigned long kallsyms_markers[] __weak; 49extern const unsigned long kallsyms_markers[] __weak;
54 50
55static inline int is_kernel_inittext(unsigned long addr)
56{
57 if (addr >= (unsigned long)_sinittext
58 && addr <= (unsigned long)_einittext)
59 return 1;
60 return 0;
61}
62
63static inline int is_kernel_text(unsigned long addr)
64{
65 if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) ||
66 arch_is_kernel_text(addr))
67 return 1;
68 return in_gate_area_no_mm(addr);
69}
70
71static inline int is_kernel(unsigned long addr)
72{
73 if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end)
74 return 1;
75 return in_gate_area_no_mm(addr);
76}
77
78static int is_ksym_addr(unsigned long addr)
79{
80 if (IS_ENABLED(CONFIG_KALLSYMS_ALL))
81 return is_kernel(addr);
82
83 return is_kernel_text(addr) || is_kernel_inittext(addr);
84}
85
86/* 51/*
87 * Expand a compressed symbol data into the resulting uncompressed string, 52 * Expand a compressed symbol data into the resulting uncompressed string,
88 * if uncompressed string is too long (>= maxlen), it will be truncated, 53 * if uncompressed string is too long (>= maxlen), it will be truncated,
@@ -464,17 +429,6 @@ int sprint_backtrace(char *buffer, unsigned long address)
464 return __sprint_symbol(buffer, address, -1, 1); 429 return __sprint_symbol(buffer, address, -1, 1);
465} 430}
466 431
467/* Look up a kernel symbol and print it to the kernel messages. */
468void __print_symbol(const char *fmt, unsigned long address)
469{
470 char buffer[KSYM_SYMBOL_LEN];
471
472 sprint_symbol(buffer, address);
473
474 printk(fmt, buffer);
475}
476EXPORT_SYMBOL(__print_symbol);
477
478/* To avoid using get_symbol_offset for every symbol, we carry prefix along. */ 432/* To avoid using get_symbol_offset for every symbol, we carry prefix along. */
479struct kallsym_iter { 433struct kallsym_iter {
480 loff_t pos; 434 loff_t pos;
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index de9e45dca70f..3a4656fb7047 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -366,11 +366,6 @@ static int __klp_enable_patch(struct klp_patch *patch)
366 /* 366 /*
367 * A reference is taken on the patch module to prevent it from being 367 * A reference is taken on the patch module to prevent it from being
368 * unloaded. 368 * unloaded.
369 *
370 * Note: For immediate (no consistency model) patches we don't allow
371 * patch modules to unload since there is no safe/sane method to
372 * determine if a thread is still running in the patched code contained
373 * in the patch module once the ftrace registration is successful.
374 */ 369 */
375 if (!try_module_get(patch->mod)) 370 if (!try_module_get(patch->mod))
376 return -ENODEV; 371 return -ENODEV;
@@ -454,6 +449,8 @@ EXPORT_SYMBOL_GPL(klp_enable_patch);
454 * /sys/kernel/livepatch/<patch> 449 * /sys/kernel/livepatch/<patch>
455 * /sys/kernel/livepatch/<patch>/enabled 450 * /sys/kernel/livepatch/<patch>/enabled
456 * /sys/kernel/livepatch/<patch>/transition 451 * /sys/kernel/livepatch/<patch>/transition
452 * /sys/kernel/livepatch/<patch>/signal
453 * /sys/kernel/livepatch/<patch>/force
457 * /sys/kernel/livepatch/<patch>/<object> 454 * /sys/kernel/livepatch/<patch>/<object>
458 * /sys/kernel/livepatch/<patch>/<object>/<function,sympos> 455 * /sys/kernel/livepatch/<patch>/<object>/<function,sympos>
459 */ 456 */
@@ -528,11 +525,73 @@ static ssize_t transition_show(struct kobject *kobj,
528 patch == klp_transition_patch); 525 patch == klp_transition_patch);
529} 526}
530 527
528static ssize_t signal_store(struct kobject *kobj, struct kobj_attribute *attr,
529 const char *buf, size_t count)
530{
531 struct klp_patch *patch;
532 int ret;
533 bool val;
534
535 ret = kstrtobool(buf, &val);
536 if (ret)
537 return ret;
538
539 if (!val)
540 return count;
541
542 mutex_lock(&klp_mutex);
543
544 patch = container_of(kobj, struct klp_patch, kobj);
545 if (patch != klp_transition_patch) {
546 mutex_unlock(&klp_mutex);
547 return -EINVAL;
548 }
549
550 klp_send_signals();
551
552 mutex_unlock(&klp_mutex);
553
554 return count;
555}
556
557static ssize_t force_store(struct kobject *kobj, struct kobj_attribute *attr,
558 const char *buf, size_t count)
559{
560 struct klp_patch *patch;
561 int ret;
562 bool val;
563
564 ret = kstrtobool(buf, &val);
565 if (ret)
566 return ret;
567
568 if (!val)
569 return count;
570
571 mutex_lock(&klp_mutex);
572
573 patch = container_of(kobj, struct klp_patch, kobj);
574 if (patch != klp_transition_patch) {
575 mutex_unlock(&klp_mutex);
576 return -EINVAL;
577 }
578
579 klp_force_transition();
580
581 mutex_unlock(&klp_mutex);
582
583 return count;
584}
585
531static struct kobj_attribute enabled_kobj_attr = __ATTR_RW(enabled); 586static struct kobj_attribute enabled_kobj_attr = __ATTR_RW(enabled);
532static struct kobj_attribute transition_kobj_attr = __ATTR_RO(transition); 587static struct kobj_attribute transition_kobj_attr = __ATTR_RO(transition);
588static struct kobj_attribute signal_kobj_attr = __ATTR_WO(signal);
589static struct kobj_attribute force_kobj_attr = __ATTR_WO(force);
533static struct attribute *klp_patch_attrs[] = { 590static struct attribute *klp_patch_attrs[] = {
534 &enabled_kobj_attr.attr, 591 &enabled_kobj_attr.attr,
535 &transition_kobj_attr.attr, 592 &transition_kobj_attr.attr,
593 &signal_kobj_attr.attr,
594 &force_kobj_attr.attr,
536 NULL 595 NULL
537}; 596};
538 597
@@ -830,12 +889,7 @@ int klp_register_patch(struct klp_patch *patch)
830 if (!klp_initialized()) 889 if (!klp_initialized())
831 return -ENODEV; 890 return -ENODEV;
832 891
833 /* 892 if (!klp_have_reliable_stack()) {
834 * Architectures without reliable stack traces have to set
835 * patch->immediate because there's currently no way to patch kthreads
836 * with the consistency model.
837 */
838 if (!klp_have_reliable_stack() && !patch->immediate) {
839 pr_err("This architecture doesn't have support for the livepatch consistency model.\n"); 893 pr_err("This architecture doesn't have support for the livepatch consistency model.\n");
840 return -ENOSYS; 894 return -ENOSYS;
841 } 895 }
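
For reference, the two new write-only attributes are driven from user space like any other sysfs boolean: writing a kstrtobool()-true value while the named patch is the one in transition triggers klp_send_signals() or klp_force_transition(); otherwise the write fails with -EINVAL. A minimal usage sketch, assuming a loaded patch module called demo_patch (a hypothetical name):

#include <stdio.h>
#include <errno.h>

/* "demo_patch" is a placeholder for the name of a loaded livepatch. */
static int write_klp_attr(const char *attr, const char *val)
{
	char path[128];
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/kernel/livepatch/demo_patch/%s", attr);
	f = fopen(path, "w");
	if (!f)
		return -errno;
	fputs(val, f);		/* kstrtobool() takes "1", "y", "on", ... */
	if (fclose(f))
		return -errno;
	return 0;
}

int main(void)
{
	/* Nudge straggler tasks first; "force" is the last-resort hammer. */
	if (write_klp_attr("signal", "1"))
		perror("signal");
	if (write_klp_attr("force", "1"))
		perror("force");
	return 0;
}
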
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
index 56add6327736..7c6631e693bc 100644
--- a/kernel/livepatch/transition.c
+++ b/kernel/livepatch/transition.c
@@ -33,6 +33,8 @@ struct klp_patch *klp_transition_patch;
33 33
34static int klp_target_state = KLP_UNDEFINED; 34static int klp_target_state = KLP_UNDEFINED;
35 35
36static bool klp_forced = false;
37
36/* 38/*
37 * This work can be performed periodically to finish patching or unpatching any 39 * This work can be performed periodically to finish patching or unpatching any
38 * "straggler" tasks which failed to transition in the first attempt. 40 * "straggler" tasks which failed to transition in the first attempt.
@@ -80,7 +82,6 @@ static void klp_complete_transition(void)
80 struct klp_func *func; 82 struct klp_func *func;
81 struct task_struct *g, *task; 83 struct task_struct *g, *task;
82 unsigned int cpu; 84 unsigned int cpu;
83 bool immediate_func = false;
84 85
85 pr_debug("'%s': completing %s transition\n", 86 pr_debug("'%s': completing %s transition\n",
86 klp_transition_patch->mod->name, 87 klp_transition_patch->mod->name,
@@ -102,16 +103,9 @@ static void klp_complete_transition(void)
102 klp_synchronize_transition(); 103 klp_synchronize_transition();
103 } 104 }
104 105
105 if (klp_transition_patch->immediate) 106 klp_for_each_object(klp_transition_patch, obj)
106 goto done; 107 klp_for_each_func(obj, func)
107
108 klp_for_each_object(klp_transition_patch, obj) {
109 klp_for_each_func(obj, func) {
110 func->transition = false; 108 func->transition = false;
111 if (func->immediate)
112 immediate_func = true;
113 }
114 }
115 109
116 /* Prevent klp_ftrace_handler() from seeing KLP_UNDEFINED state */ 110 /* Prevent klp_ftrace_handler() from seeing KLP_UNDEFINED state */
117 if (klp_target_state == KLP_PATCHED) 111 if (klp_target_state == KLP_PATCHED)
@@ -130,7 +124,6 @@ static void klp_complete_transition(void)
130 task->patch_state = KLP_UNDEFINED; 124 task->patch_state = KLP_UNDEFINED;
131 } 125 }
132 126
133done:
134 klp_for_each_object(klp_transition_patch, obj) { 127 klp_for_each_object(klp_transition_patch, obj) {
135 if (!klp_is_object_loaded(obj)) 128 if (!klp_is_object_loaded(obj))
136 continue; 129 continue;
@@ -144,13 +137,11 @@ done:
144 klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); 137 klp_target_state == KLP_PATCHED ? "patching" : "unpatching");
145 138
146 /* 139 /*
147 * See complementary comment in __klp_enable_patch() for why we 140 * klp_forced set implies unbounded increase of module's ref count if
148 * keep the module reference for immediate patches. 141 * the module is disabled/enabled in a loop.
149 */ 142 */
150 if (!klp_transition_patch->immediate && !immediate_func && 143 if (!klp_forced && klp_target_state == KLP_UNPATCHED)
151 klp_target_state == KLP_UNPATCHED) {
152 module_put(klp_transition_patch->mod); 144 module_put(klp_transition_patch->mod);
153 }
154 145
155 klp_target_state = KLP_UNDEFINED; 146 klp_target_state = KLP_UNDEFINED;
156 klp_transition_patch = NULL; 147 klp_transition_patch = NULL;
@@ -218,9 +209,6 @@ static int klp_check_stack_func(struct klp_func *func,
218 struct klp_ops *ops; 209 struct klp_ops *ops;
219 int i; 210 int i;
220 211
221 if (func->immediate)
222 return 0;
223
224 for (i = 0; i < trace->nr_entries; i++) { 212 for (i = 0; i < trace->nr_entries; i++) {
225 address = trace->entries[i]; 213 address = trace->entries[i];
226 214
@@ -383,13 +371,6 @@ void klp_try_complete_transition(void)
383 WARN_ON_ONCE(klp_target_state == KLP_UNDEFINED); 371 WARN_ON_ONCE(klp_target_state == KLP_UNDEFINED);
384 372
385 /* 373 /*
386 * If the patch can be applied or reverted immediately, skip the
387 * per-task transitions.
388 */
389 if (klp_transition_patch->immediate)
390 goto success;
391
392 /*
393 * Try to switch the tasks to the target patch state by walking their 374 * Try to switch the tasks to the target patch state by walking their
394 * stacks and looking for any to-be-patched or to-be-unpatched 375 * stacks and looking for any to-be-patched or to-be-unpatched
395 * functions. If such functions are found on a stack, or if the stack 376 * functions. If such functions are found on a stack, or if the stack
@@ -432,7 +413,6 @@ void klp_try_complete_transition(void)
432 return; 413 return;
433 } 414 }
434 415
435success:
436 /* we're done, now cleanup the data structures */ 416 /* we're done, now cleanup the data structures */
437 klp_complete_transition(); 417 klp_complete_transition();
438} 418}
@@ -453,13 +433,6 @@ void klp_start_transition(void)
453 klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); 433 klp_target_state == KLP_PATCHED ? "patching" : "unpatching");
454 434
455 /* 435 /*
456 * If the patch can be applied or reverted immediately, skip the
457 * per-task transitions.
458 */
459 if (klp_transition_patch->immediate)
460 return;
461
462 /*
463 * Mark all normal tasks as needing a patch state update. They'll 436 * Mark all normal tasks as needing a patch state update. They'll
464 * switch either in klp_try_complete_transition() or as they exit the 437 * switch either in klp_try_complete_transition() or as they exit the
465 * kernel. 438 * kernel.
@@ -509,13 +482,6 @@ void klp_init_transition(struct klp_patch *patch, int state)
509 klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); 482 klp_target_state == KLP_PATCHED ? "patching" : "unpatching");
510 483
511 /* 484 /*
512 * If the patch can be applied or reverted immediately, skip the
513 * per-task transitions.
514 */
515 if (patch->immediate)
516 return;
517
518 /*
519 * Initialize all tasks to the initial patch state to prepare them for 485 * Initialize all tasks to the initial patch state to prepare them for
520 * switching to the target state. 486 * switching to the target state.
521 */ 487 */
@@ -608,3 +574,71 @@ void klp_copy_process(struct task_struct *child)
608 574
609 /* TIF_PATCH_PENDING gets copied in setup_thread_stack() */ 575 /* TIF_PATCH_PENDING gets copied in setup_thread_stack() */
610} 576}
577
578/*
579 * Sends a fake signal to all non-kthread tasks with TIF_PATCH_PENDING set.
580 * Kthreads with TIF_PATCH_PENDING set are woken up. Only admin can request this
581 * action currently.
582 */
583void klp_send_signals(void)
584{
585 struct task_struct *g, *task;
586
587 pr_notice("signaling remaining tasks\n");
588
589 read_lock(&tasklist_lock);
590 for_each_process_thread(g, task) {
591 if (!klp_patch_pending(task))
592 continue;
593
594 /*
595 * There is a small race here. We could see TIF_PATCH_PENDING
596 * set and decide to wake up a kthread or send a fake signal.
597 * Meanwhile the task could migrate itself and the action
598 * would be meaningless. It is not serious though.
599 */
600 if (task->flags & PF_KTHREAD) {
601 /*
602 * Wake up a kthread which sleeps interruptibly and
603 * still has not been migrated.
604 */
605 wake_up_state(task, TASK_INTERRUPTIBLE);
606 } else {
607 /*
608 * Send fake signal to all non-kthread tasks which are
609 * still not migrated.
610 */
611 spin_lock_irq(&task->sighand->siglock);
612 signal_wake_up(task, 0);
613 spin_unlock_irq(&task->sighand->siglock);
614 }
615 }
616 read_unlock(&tasklist_lock);
617}
618
619/*
620 * Drop TIF_PATCH_PENDING of all tasks on admin's request. This forces an
621 * existing transition to finish.
622 *
623 * NOTE: klp_update_patch_state(task) requires the task to be inactive or
624 * 'current'. This is not the case here and the consistency model could be
625 * broken. Administrator, who is the only one to execute the
626 * klp_force_transitions(), has to be aware of this.
627 */
628void klp_force_transition(void)
629{
630 struct task_struct *g, *task;
631 unsigned int cpu;
632
633 pr_warn("forcing remaining tasks to the patched state\n");
634
635 read_lock(&tasklist_lock);
636 for_each_process_thread(g, task)
637 klp_update_patch_state(task);
638 read_unlock(&tasklist_lock);
639
640 for_each_possible_cpu(cpu)
641 klp_update_patch_state(idle_task(cpu));
642
643 klp_forced = true;
644}
diff --git a/kernel/livepatch/transition.h b/kernel/livepatch/transition.h
index 0f6e27c481f9..f9d0bc016067 100644
--- a/kernel/livepatch/transition.h
+++ b/kernel/livepatch/transition.h
@@ -11,5 +11,7 @@ void klp_cancel_transition(void);
11void klp_start_transition(void); 11void klp_start_transition(void);
12void klp_try_complete_transition(void); 12void klp_try_complete_transition(void);
13void klp_reverse_transition(void); 13void klp_reverse_transition(void);
14void klp_send_signals(void);
15void klp_force_transition(void);
14 16
15#endif /* _LIVEPATCH_TRANSITION_H */ 17#endif /* _LIVEPATCH_TRANSITION_H */
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 403ab9cdb949..4849be5f9b3c 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -188,13 +188,6 @@ static RADIX_TREE(pgmap_radix, GFP_KERNEL);
188#define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1) 188#define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1)
189#define SECTION_SIZE (1UL << PA_SECTION_SHIFT) 189#define SECTION_SIZE (1UL << PA_SECTION_SHIFT)
190 190
191struct page_map {
192 struct resource res;
193 struct percpu_ref *ref;
194 struct dev_pagemap pgmap;
195 struct vmem_altmap altmap;
196};
197
198static unsigned long order_at(struct resource *res, unsigned long pgoff) 191static unsigned long order_at(struct resource *res, unsigned long pgoff)
199{ 192{
200 unsigned long phys_pgoff = PHYS_PFN(res->start) + pgoff; 193 unsigned long phys_pgoff = PHYS_PFN(res->start) + pgoff;
@@ -248,34 +241,36 @@ int device_private_entry_fault(struct vm_area_struct *vma,
248EXPORT_SYMBOL(device_private_entry_fault); 241EXPORT_SYMBOL(device_private_entry_fault);
249#endif /* CONFIG_DEVICE_PRIVATE */ 242#endif /* CONFIG_DEVICE_PRIVATE */
250 243
251static void pgmap_radix_release(struct resource *res) 244static void pgmap_radix_release(struct resource *res, unsigned long end_pgoff)
252{ 245{
253 unsigned long pgoff, order; 246 unsigned long pgoff, order;
254 247
255 mutex_lock(&pgmap_lock); 248 mutex_lock(&pgmap_lock);
256 foreach_order_pgoff(res, order, pgoff) 249 foreach_order_pgoff(res, order, pgoff) {
250 if (pgoff >= end_pgoff)
251 break;
257 radix_tree_delete(&pgmap_radix, PHYS_PFN(res->start) + pgoff); 252 radix_tree_delete(&pgmap_radix, PHYS_PFN(res->start) + pgoff);
253 }
258 mutex_unlock(&pgmap_lock); 254 mutex_unlock(&pgmap_lock);
259 255
260 synchronize_rcu(); 256 synchronize_rcu();
261} 257}
262 258
263static unsigned long pfn_first(struct page_map *page_map) 259static unsigned long pfn_first(struct dev_pagemap *pgmap)
264{ 260{
265 struct dev_pagemap *pgmap = &page_map->pgmap; 261 const struct resource *res = &pgmap->res;
266 const struct resource *res = &page_map->res; 262 struct vmem_altmap *altmap = &pgmap->altmap;
267 struct vmem_altmap *altmap = pgmap->altmap;
268 unsigned long pfn; 263 unsigned long pfn;
269 264
270 pfn = res->start >> PAGE_SHIFT; 265 pfn = res->start >> PAGE_SHIFT;
271 if (altmap) 266 if (pgmap->altmap_valid)
272 pfn += vmem_altmap_offset(altmap); 267 pfn += vmem_altmap_offset(altmap);
273 return pfn; 268 return pfn;
274} 269}
275 270
276static unsigned long pfn_end(struct page_map *page_map) 271static unsigned long pfn_end(struct dev_pagemap *pgmap)
277{ 272{
278 const struct resource *res = &page_map->res; 273 const struct resource *res = &pgmap->res;
279 274
280 return (res->start + resource_size(res)) >> PAGE_SHIFT; 275 return (res->start + resource_size(res)) >> PAGE_SHIFT;
281} 276}
@@ -283,15 +278,15 @@ static unsigned long pfn_end(struct page_map *page_map)
283#define for_each_device_pfn(pfn, map) \ 278#define for_each_device_pfn(pfn, map) \
284 for (pfn = pfn_first(map); pfn < pfn_end(map); pfn++) 279 for (pfn = pfn_first(map); pfn < pfn_end(map); pfn++)
285 280
286static void devm_memremap_pages_release(struct device *dev, void *data) 281static void devm_memremap_pages_release(void *data)
287{ 282{
288 struct page_map *page_map = data; 283 struct dev_pagemap *pgmap = data;
289 struct resource *res = &page_map->res; 284 struct device *dev = pgmap->dev;
285 struct resource *res = &pgmap->res;
290 resource_size_t align_start, align_size; 286 resource_size_t align_start, align_size;
291 struct dev_pagemap *pgmap = &page_map->pgmap;
292 unsigned long pfn; 287 unsigned long pfn;
293 288
294 for_each_device_pfn(pfn, page_map) 289 for_each_device_pfn(pfn, pgmap)
295 put_page(pfn_to_page(pfn)); 290 put_page(pfn_to_page(pfn));
296 291
297 if (percpu_ref_tryget_live(pgmap->ref)) { 292 if (percpu_ref_tryget_live(pgmap->ref)) {
@@ -301,56 +296,51 @@ static void devm_memremap_pages_release(struct device *dev, void *data)
301 296
302 /* pages are dead and unused, undo the arch mapping */ 297 /* pages are dead and unused, undo the arch mapping */
303 align_start = res->start & ~(SECTION_SIZE - 1); 298 align_start = res->start & ~(SECTION_SIZE - 1);
304 align_size = ALIGN(resource_size(res), SECTION_SIZE); 299 align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
300 - align_start;
305 301
306 mem_hotplug_begin(); 302 mem_hotplug_begin();
307 arch_remove_memory(align_start, align_size); 303 arch_remove_memory(align_start, align_size, pgmap->altmap_valid ?
304 &pgmap->altmap : NULL);
308 mem_hotplug_done(); 305 mem_hotplug_done();
309 306
310 untrack_pfn(NULL, PHYS_PFN(align_start), align_size); 307 untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
311 pgmap_radix_release(res); 308 pgmap_radix_release(res, -1);
312 dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc, 309 dev_WARN_ONCE(dev, pgmap->altmap.alloc,
313 "%s: failed to free all reserved pages\n", __func__); 310 "%s: failed to free all reserved pages\n", __func__);
314}
315
316/* assumes rcu_read_lock() held at entry */
317struct dev_pagemap *find_dev_pagemap(resource_size_t phys)
318{
319 struct page_map *page_map;
320
321 WARN_ON_ONCE(!rcu_read_lock_held());
322
323 page_map = radix_tree_lookup(&pgmap_radix, PHYS_PFN(phys));
324 return page_map ? &page_map->pgmap : NULL;
325} 311}
326 312
327/** 313/**
328 * devm_memremap_pages - remap and provide memmap backing for the given resource 314 * devm_memremap_pages - remap and provide memmap backing for the given resource
329 * @dev: hosting device for @res 315 * @dev: hosting device for @res
330 * @res: "host memory" address range 316 * @pgmap: pointer to a struct dev_pgmap
331 * @ref: a live per-cpu reference count
332 * @altmap: optional descriptor for allocating the memmap from @res
333 * 317 *
334 * Notes: 318 * Notes:
335 * 1/ @ref must be 'live' on entry and 'dead' before devm_memunmap_pages() time 319 * 1/ At a minimum the res, ref and type members of @pgmap must be initialized
336 * (or devm release event). The expected order of events is that @ref has 320 * by the caller before passing it to this function
321 *
322 * 2/ The altmap field may optionally be initialized, in which case altmap_valid
323 * must be set to true
324 *
325 * 3/ pgmap.ref must be 'live' on entry and 'dead' before devm_memunmap_pages()
326 * time (or devm release event). The expected order of events is that ref has
337 * been through percpu_ref_kill() before devm_memremap_pages_release(). The 327 * been through percpu_ref_kill() before devm_memremap_pages_release(). The
338 * wait for the completion of all references being dropped and 328 * wait for the completion of all references being dropped and
339 * percpu_ref_exit() must occur after devm_memremap_pages_release(). 329 * percpu_ref_exit() must occur after devm_memremap_pages_release().
340 * 330 *
341 * 2/ @res is expected to be a host memory range that could feasibly be 331 * 4/ res is expected to be a host memory range that could feasibly be
342 * treated as a "System RAM" range, i.e. not a device mmio range, but 332 * treated as a "System RAM" range, i.e. not a device mmio range, but
343 * this is not enforced. 333 * this is not enforced.
344 */ 334 */
345void *devm_memremap_pages(struct device *dev, struct resource *res, 335void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
346 struct percpu_ref *ref, struct vmem_altmap *altmap)
347{ 336{
348 resource_size_t align_start, align_size, align_end; 337 resource_size_t align_start, align_size, align_end;
338 struct vmem_altmap *altmap = pgmap->altmap_valid ?
339 &pgmap->altmap : NULL;
349 unsigned long pfn, pgoff, order; 340 unsigned long pfn, pgoff, order;
350 pgprot_t pgprot = PAGE_KERNEL; 341 pgprot_t pgprot = PAGE_KERNEL;
351 struct dev_pagemap *pgmap;
352 struct page_map *page_map;
353 int error, nid, is_ram, i = 0; 342 int error, nid, is_ram, i = 0;
343 struct resource *res = &pgmap->res;
354 344
355 align_start = res->start & ~(SECTION_SIZE - 1); 345 align_start = res->start & ~(SECTION_SIZE - 1);
356 align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) 346 align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
@@ -367,47 +357,18 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
367 if (is_ram == REGION_INTERSECTS) 357 if (is_ram == REGION_INTERSECTS)
368 return __va(res->start); 358 return __va(res->start);
369 359
370 if (!ref) 360 if (!pgmap->ref)
371 return ERR_PTR(-EINVAL); 361 return ERR_PTR(-EINVAL);
372 362
373 page_map = devres_alloc_node(devm_memremap_pages_release,
374 sizeof(*page_map), GFP_KERNEL, dev_to_node(dev));
375 if (!page_map)
376 return ERR_PTR(-ENOMEM);
377 pgmap = &page_map->pgmap;
378
379 memcpy(&page_map->res, res, sizeof(*res));
380
381 pgmap->dev = dev; 363 pgmap->dev = dev;
382 if (altmap) {
383 memcpy(&page_map->altmap, altmap, sizeof(*altmap));
384 pgmap->altmap = &page_map->altmap;
385 }
386 pgmap->ref = ref;
387 pgmap->res = &page_map->res;
388 pgmap->type = MEMORY_DEVICE_HOST;
389 pgmap->page_fault = NULL;
390 pgmap->page_free = NULL;
391 pgmap->data = NULL;
392 364
393 mutex_lock(&pgmap_lock); 365 mutex_lock(&pgmap_lock);
394 error = 0; 366 error = 0;
395 align_end = align_start + align_size - 1; 367 align_end = align_start + align_size - 1;
396 368
397 foreach_order_pgoff(res, order, pgoff) { 369 foreach_order_pgoff(res, order, pgoff) {
398 struct dev_pagemap *dup;
399
400 rcu_read_lock();
401 dup = find_dev_pagemap(res->start + PFN_PHYS(pgoff));
402 rcu_read_unlock();
403 if (dup) {
404 dev_err(dev, "%s: %pr collides with mapping for %s\n",
405 __func__, res, dev_name(dup->dev));
406 error = -EBUSY;
407 break;
408 }
409 error = __radix_tree_insert(&pgmap_radix, 370 error = __radix_tree_insert(&pgmap_radix,
410 PHYS_PFN(res->start) + pgoff, order, page_map); 371 PHYS_PFN(res->start) + pgoff, order, pgmap);
411 if (error) { 372 if (error) {
412 dev_err(dev, "%s: failed: %d\n", __func__, error); 373 dev_err(dev, "%s: failed: %d\n", __func__, error);
413 break; 374 break;
@@ -427,16 +388,16 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
427 goto err_pfn_remap; 388 goto err_pfn_remap;
428 389
429 mem_hotplug_begin(); 390 mem_hotplug_begin();
430 error = arch_add_memory(nid, align_start, align_size, false); 391 error = arch_add_memory(nid, align_start, align_size, altmap, false);
431 if (!error) 392 if (!error)
432 move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], 393 move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
433 align_start >> PAGE_SHIFT, 394 align_start >> PAGE_SHIFT,
434 align_size >> PAGE_SHIFT); 395 align_size >> PAGE_SHIFT, altmap);
435 mem_hotplug_done(); 396 mem_hotplug_done();
436 if (error) 397 if (error)
437 goto err_add_memory; 398 goto err_add_memory;
438 399
439 for_each_device_pfn(pfn, page_map) { 400 for_each_device_pfn(pfn, pgmap) {
440 struct page *page = pfn_to_page(pfn); 401 struct page *page = pfn_to_page(pfn);
441 402
442 /* 403 /*
@@ -447,19 +408,21 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
447 */ 408 */
448 list_del(&page->lru); 409 list_del(&page->lru);
449 page->pgmap = pgmap; 410 page->pgmap = pgmap;
450 percpu_ref_get(ref); 411 percpu_ref_get(pgmap->ref);
451 if (!(++i % 1024)) 412 if (!(++i % 1024))
452 cond_resched(); 413 cond_resched();
453 } 414 }
454 devres_add(dev, page_map); 415
416 devm_add_action(dev, devm_memremap_pages_release, pgmap);
417
455 return __va(res->start); 418 return __va(res->start);
456 419
457 err_add_memory: 420 err_add_memory:
458 untrack_pfn(NULL, PHYS_PFN(align_start), align_size); 421 untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
459 err_pfn_remap: 422 err_pfn_remap:
460 err_radix: 423 err_radix:
461 pgmap_radix_release(res); 424 pgmap_radix_release(res, pgoff);
462 devres_free(page_map); 425 devres_free(pgmap);
463 return ERR_PTR(error); 426 return ERR_PTR(error);
464} 427}
465EXPORT_SYMBOL(devm_memremap_pages); 428EXPORT_SYMBOL(devm_memremap_pages);
@@ -475,34 +438,39 @@ void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns)
475 altmap->alloc -= nr_pfns; 438 altmap->alloc -= nr_pfns;
476} 439}
477 440
478struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) 441/**
442 * get_dev_pagemap() - take a new live reference on the dev_pagemap for @pfn
443 * @pfn: page frame number to lookup page_map
444 * @pgmap: optional known pgmap that already has a reference
445 *
446 * If @pgmap is non-NULL and covers @pfn it will be returned as-is. If @pgmap
447 * is non-NULL but does not cover @pfn the reference to it will be released.
448 */
449struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
450 struct dev_pagemap *pgmap)
479{ 451{
480 /* 452 resource_size_t phys = PFN_PHYS(pfn);
481 * 'memmap_start' is the virtual address for the first "struct
482 * page" in this range of the vmemmap array. In the case of
483 * CONFIG_SPARSEMEM_VMEMMAP a page_to_pfn conversion is simple
484 * pointer arithmetic, so we can perform this to_vmem_altmap()
485 * conversion without concern for the initialization state of
486 * the struct page fields.
487 */
488 struct page *page = (struct page *) memmap_start;
489 struct dev_pagemap *pgmap;
490 453
491 /* 454 /*
492 * Unconditionally retrieve a dev_pagemap associated with the 455 * In the cached case we're already holding a live reference.
493 * given physical address, this is only for use in the
494 * arch_{add|remove}_memory() for setting up and tearing down
495 * the memmap.
496 */ 456 */
457 if (pgmap) {
458 if (phys >= pgmap->res.start && phys <= pgmap->res.end)
459 return pgmap;
460 put_dev_pagemap(pgmap);
461 }
462
463 /* fall back to slow path lookup */
497 rcu_read_lock(); 464 rcu_read_lock();
498 pgmap = find_dev_pagemap(__pfn_to_phys(page_to_pfn(page))); 465 pgmap = radix_tree_lookup(&pgmap_radix, PHYS_PFN(phys));
466 if (pgmap && !percpu_ref_tryget_live(pgmap->ref))
467 pgmap = NULL;
499 rcu_read_unlock(); 468 rcu_read_unlock();
500 469
501 return pgmap ? pgmap->altmap : NULL; 470 return pgmap;
502} 471}
503#endif /* CONFIG_ZONE_DEVICE */ 472#endif /* CONFIG_ZONE_DEVICE */
504 473
505
506#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC) 474#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC)
507void put_zone_device_private_or_public_page(struct page *page) 475void put_zone_device_private_or_public_page(struct page *page)
508{ 476{
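
The reworked devm_memremap_pages() no longer carves out a private page_map: the caller owns the struct dev_pagemap and, per the kernel-doc notes above, must fill in res, ref and type (plus altmap/altmap_valid when an altmap is used) before the call. A hedged caller-side sketch of that convention; the function names, release callback and resource belong to an imaginary driver, not to this patch:

/* Sketch only: an imaginary driver mapping one host-memory range. */
static struct percpu_ref demo_ref;
static struct dev_pagemap demo_pgmap;

static void demo_ref_release(struct percpu_ref *ref)
{
	/* e.g. complete() a completion waited on after percpu_ref_kill() */
}

static void *demo_map_range(struct device *dev, struct resource *res)
{
	if (percpu_ref_init(&demo_ref, demo_ref_release, 0, GFP_KERNEL))
		return ERR_PTR(-ENOMEM);

	demo_pgmap.res = *res;			/* host memory range to map */
	demo_pgmap.ref = &demo_ref;		/* must be live on entry */
	demo_pgmap.type = MEMORY_DEVICE_HOST;	/* caller picks the type now */
	demo_pgmap.altmap_valid = false;	/* no altmap in this sketch */

	return devm_memremap_pages(dev, &demo_pgmap);
}

Teardown stays as note 3/ describes: kill and drain the ref before the devm release action runs, then percpu_ref_exit() afterwards.
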
diff --git a/kernel/module.c b/kernel/module.c
index 09e48eee4d55..ccdf24c4949e 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3129,7 +3129,11 @@ static int find_module_sections(struct module *mod, struct load_info *info)
3129 sizeof(*mod->ftrace_callsites), 3129 sizeof(*mod->ftrace_callsites),
3130 &mod->num_ftrace_callsites); 3130 &mod->num_ftrace_callsites);
3131#endif 3131#endif
3132 3132#ifdef CONFIG_FUNCTION_ERROR_INJECTION
3133 mod->ei_funcs = section_objs(info, "_error_injection_whitelist",
3134 sizeof(*mod->ei_funcs),
3135 &mod->num_ei_funcs);
3136#endif
3133 mod->extable = section_objs(info, "__ex_table", 3137 mod->extable = section_objs(info, "__ex_table",
3134 sizeof(*mod->extable), &mod->num_exentries); 3138 sizeof(*mod->extable), &mod->num_exentries);
3135 3139
@@ -3949,6 +3953,12 @@ static const char *get_ksymbol(struct module *mod,
3949 return symname(kallsyms, best); 3953 return symname(kallsyms, best);
3950} 3954}
3951 3955
3956void * __weak dereference_module_function_descriptor(struct module *mod,
3957 void *ptr)
3958{
3959 return ptr;
3960}
3961
3952/* For kallsyms to ask for address resolution. NULL means not found. Careful 3962/* For kallsyms to ask for address resolution. NULL means not found. Careful
3953 * not to lock to avoid deadlock on oopses, simply disable preemption. */ 3963 * not to lock to avoid deadlock on oopses, simply disable preemption. */
3954const char *module_address_lookup(unsigned long addr, 3964const char *module_address_lookup(unsigned long addr,
diff --git a/kernel/padata.c b/kernel/padata.c
index 57c0074d50cc..d568cc56405f 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0
1/* 2/*
2 * padata.c - generic interface to process data streams in parallel 3 * padata.c - generic interface to process data streams in parallel
3 * 4 *
diff --git a/kernel/power/power.h b/kernel/power/power.h
index f29cd178df90..9e58bdc8a562 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -104,9 +104,6 @@ extern int in_suspend;
104extern dev_t swsusp_resume_device; 104extern dev_t swsusp_resume_device;
105extern sector_t swsusp_resume_block; 105extern sector_t swsusp_resume_block;
106 106
107extern asmlinkage int swsusp_arch_suspend(void);
108extern asmlinkage int swsusp_arch_resume(void);
109
110extern int create_basic_memory_bitmaps(void); 107extern int create_basic_memory_bitmaps(void);
111extern void free_basic_memory_bitmaps(void); 108extern void free_basic_memory_bitmaps(void);
112extern int hibernate_preallocate_memory(void); 109extern int hibernate_preallocate_memory(void);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index b9006617710f..db4b9b8929eb 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -131,13 +131,10 @@ static int __init control_devkmsg(char *str)
131 /* 131 /*
132 * Set sysctl string accordingly: 132 * Set sysctl string accordingly:
133 */ 133 */
134 if (devkmsg_log == DEVKMSG_LOG_MASK_ON) { 134 if (devkmsg_log == DEVKMSG_LOG_MASK_ON)
135 memset(devkmsg_log_str, 0, DEVKMSG_STR_MAX_SIZE); 135 strcpy(devkmsg_log_str, "on");
136 strncpy(devkmsg_log_str, "on", 2); 136 else if (devkmsg_log == DEVKMSG_LOG_MASK_OFF)
137 } else if (devkmsg_log == DEVKMSG_LOG_MASK_OFF) { 137 strcpy(devkmsg_log_str, "off");
138 memset(devkmsg_log_str, 0, DEVKMSG_STR_MAX_SIZE);
139 strncpy(devkmsg_log_str, "off", 3);
140 }
141 /* else "ratelimit" which is set by default. */ 138 /* else "ratelimit" which is set by default. */
142 139
143 /* 140 /*
@@ -277,6 +274,13 @@ EXPORT_SYMBOL(console_set_on_cmdline);
277/* Flag: console code may call schedule() */ 274/* Flag: console code may call schedule() */
278static int console_may_schedule; 275static int console_may_schedule;
279 276
277enum con_msg_format_flags {
278 MSG_FORMAT_DEFAULT = 0,
279 MSG_FORMAT_SYSLOG = (1 << 0),
280};
281
282static int console_msg_format = MSG_FORMAT_DEFAULT;
283
280/* 284/*
281 * The printk log buffer consists of a chain of concatenated variable 285 * The printk log buffer consists of a chain of concatenated variable
282 * length records. Every record starts with a record header, containing 286 * length records. Every record starts with a record header, containing
@@ -920,10 +924,10 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
920 return ret; 924 return ret;
921} 925}
922 926
923static unsigned int devkmsg_poll(struct file *file, poll_table *wait) 927static __poll_t devkmsg_poll(struct file *file, poll_table *wait)
924{ 928{
925 struct devkmsg_user *user = file->private_data; 929 struct devkmsg_user *user = file->private_data;
926 int ret = 0; 930 __poll_t ret = 0;
927 931
928 if (!user) 932 if (!user)
929 return POLLERR|POLLNVAL; 933 return POLLERR|POLLNVAL;
@@ -1544,6 +1548,146 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
1544} 1548}
1545 1549
1546/* 1550/*
1551 * Special console_lock variants that help to reduce the risk of soft-lockups.
1552 * They allow to pass console_lock to another printk() call using a busy wait.
1553 */
1554
1555#ifdef CONFIG_LOCKDEP
1556static struct lockdep_map console_owner_dep_map = {
1557 .name = "console_owner"
1558};
1559#endif
1560
1561static DEFINE_RAW_SPINLOCK(console_owner_lock);
1562static struct task_struct *console_owner;
1563static bool console_waiter;
1564
1565/**
1566 * console_lock_spinning_enable - mark beginning of code where another
1567 * thread might safely busy wait
1568 *
1569 * This basically converts console_lock into a spinlock. This marks
1570 * the section where the console_lock owner can not sleep, because
1571 * there may be a waiter spinning (like a spinlock). Also it must be
1572 * ready to hand over the lock at the end of the section.
1573 */
1574static void console_lock_spinning_enable(void)
1575{
1576 raw_spin_lock(&console_owner_lock);
1577 console_owner = current;
1578 raw_spin_unlock(&console_owner_lock);
1579
1580 /* The waiter may spin on us after setting console_owner */
1581 spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_);
1582}
1583
1584/**
1585 * console_lock_spinning_disable_and_check - mark end of code where another
1586 * thread was able to busy wait and check if there is a waiter
1587 *
1588 * This is called at the end of the section where spinning is allowed.
1589 * It has two functions. First, it is a signal that it is no longer
1590 * safe to start busy waiting for the lock. Second, it checks if
1591 * there is a busy waiter and passes the lock rights to her.
1592 *
1593 * Important: Callers lose the lock if there was a busy waiter.
1594 * They must not touch items synchronized by console_lock
1595 * in this case.
1596 *
1597 * Return: 1 if the lock rights were passed, 0 otherwise.
1598 */
1599static int console_lock_spinning_disable_and_check(void)
1600{
1601 int waiter;
1602
1603 raw_spin_lock(&console_owner_lock);
1604 waiter = READ_ONCE(console_waiter);
1605 console_owner = NULL;
1606 raw_spin_unlock(&console_owner_lock);
1607
1608 if (!waiter) {
1609 spin_release(&console_owner_dep_map, 1, _THIS_IP_);
1610 return 0;
1611 }
1612
1613 /* The waiter is now free to continue */
1614 WRITE_ONCE(console_waiter, false);
1615
1616 spin_release(&console_owner_dep_map, 1, _THIS_IP_);
1617
1618 /*
1619 * Hand off console_lock to waiter. The waiter will perform
1620 * the up(). After this, the waiter is the console_lock owner.
1621 */
1622 mutex_release(&console_lock_dep_map, 1, _THIS_IP_);
1623 return 1;
1624}
1625
1626/**
1627 * console_trylock_spinning - try to get console_lock by busy waiting
1628 *
1629 * This allows busy-waiting for the console_lock when the current
1630 * owner is running in specially marked sections. It means that
1631 * the current owner is running and cannot reschedule until it
1632 * is ready to lose the lock.
1633 *
1634 * Return: 1 if we got the lock, 0 otherwise
1635 */
1636static int console_trylock_spinning(void)
1637{
1638 struct task_struct *owner = NULL;
1639 bool waiter;
1640 bool spin = false;
1641 unsigned long flags;
1642
1643 if (console_trylock())
1644 return 1;
1645
1646 printk_safe_enter_irqsave(flags);
1647
1648 raw_spin_lock(&console_owner_lock);
1649 owner = READ_ONCE(console_owner);
1650 waiter = READ_ONCE(console_waiter);
1651 if (!waiter && owner && owner != current) {
1652 WRITE_ONCE(console_waiter, true);
1653 spin = true;
1654 }
1655 raw_spin_unlock(&console_owner_lock);
1656
1657 /*
1658 * If there is an active printk() writing to the
1659 * consoles, instead of having it write our data too,
1660 * see if we can offload that load from the active
1661 * printer, and do some printing ourselves.
1662 * Go into a spin only if there isn't already a waiter
1663 * spinning, and there is an active printer, and
1664 * that active printer isn't us (recursive printk?).
1665 */
1666 if (!spin) {
1667 printk_safe_exit_irqrestore(flags);
1668 return 0;
1669 }
1670
1671 /* We spin waiting for the owner to release us */
1672 spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_);
1673 /* Owner will clear console_waiter on hand off */
1674 while (READ_ONCE(console_waiter))
1675 cpu_relax();
1676 spin_release(&console_owner_dep_map, 1, _THIS_IP_);
1677
1678 printk_safe_exit_irqrestore(flags);
1679 /*
1680 * The owner passed the console lock to us.
1681 * Since we did not spin on console lock, annotate
1682 * this as a trylock. Otherwise lockdep will
1683 * complain.
1684 */
1685 mutex_acquire(&console_lock_dep_map, 0, 1, _THIS_IP_);
1686
1687 return 1;
1688}
1689
1690/*
1547 * Call the console drivers, asking them to write out 1691 * Call the console drivers, asking them to write out
1548 * log_buf[start] to log_buf[end - 1]. 1692 * log_buf[start] to log_buf[end - 1].
1549 * The console_lock must be held. 1693 * The console_lock must be held.
@@ -1749,12 +1893,19 @@ asmlinkage int vprintk_emit(int facility, int level,
1749 /* If called from the scheduler, we can not call up(). */ 1893 /* If called from the scheduler, we can not call up(). */
1750 if (!in_sched) { 1894 if (!in_sched) {
1751 /* 1895 /*
1896 * Disable preemption to avoid being preempted while holding
1897 * console_sem which would prevent anyone from printing to
1898 * console
1899 */
1900 preempt_disable();
1901 /*
1752 * Try to acquire and then immediately release the console 1902 * Try to acquire and then immediately release the console
1753 * semaphore. The release will print out buffers and wake up 1903 * semaphore. The release will print out buffers and wake up
1754 * /dev/kmsg and syslog() users. 1904 * /dev/kmsg and syslog() users.
1755 */ 1905 */
1756 if (console_trylock()) 1906 if (console_trylock_spinning())
1757 console_unlock(); 1907 console_unlock();
1908 preempt_enable();
1758 } 1909 }
1759 1910
1760 return printed_len; 1911 return printed_len;
@@ -1855,6 +2006,8 @@ static ssize_t msg_print_ext_header(char *buf, size_t size,
1855static ssize_t msg_print_ext_body(char *buf, size_t size, 2006static ssize_t msg_print_ext_body(char *buf, size_t size,
1856 char *dict, size_t dict_len, 2007 char *dict, size_t dict_len,
1857 char *text, size_t text_len) { return 0; } 2008 char *text, size_t text_len) { return 0; }
2009static void console_lock_spinning_enable(void) { }
2010static int console_lock_spinning_disable_and_check(void) { return 0; }
1858static void call_console_drivers(const char *ext_text, size_t ext_len, 2011static void call_console_drivers(const char *ext_text, size_t ext_len,
1859 const char *text, size_t len) {} 2012 const char *text, size_t len) {}
1860static size_t msg_print_text(const struct printk_log *msg, 2013static size_t msg_print_text(const struct printk_log *msg,
@@ -1913,6 +2066,17 @@ static int __add_preferred_console(char *name, int idx, char *options,
1913 c->index = idx; 2066 c->index = idx;
1914 return 0; 2067 return 0;
1915} 2068}
2069
2070static int __init console_msg_format_setup(char *str)
2071{
2072 if (!strcmp(str, "syslog"))
2073 console_msg_format = MSG_FORMAT_SYSLOG;
2074 if (!strcmp(str, "default"))
2075 console_msg_format = MSG_FORMAT_DEFAULT;
2076 return 1;
2077}
2078__setup("console_msg_format=", console_msg_format_setup);
2079
1916/* 2080/*
1917 * Set up a console. Called via do_early_param() in init/main.c 2081 * Set up a console. Called via do_early_param() in init/main.c
1918 * for each "console=" parameter in the boot command line. 2082 * for each "console=" parameter in the boot command line.
@@ -2069,20 +2233,7 @@ int console_trylock(void)
2069 return 0; 2233 return 0;
2070 } 2234 }
2071 console_locked = 1; 2235 console_locked = 1;
2072 /* 2236 console_may_schedule = 0;
2073 * When PREEMPT_COUNT disabled we can't reliably detect if it's
2074 * safe to schedule (e.g. calling printk while holding a spin_lock),
2075 * because preempt_disable()/preempt_enable() are just barriers there
2076 * and preempt_count() is always 0.
2077 *
2078 * RCU read sections have a separate preemption counter when
2079 * PREEMPT_RCU enabled thus we must take extra care and check
2080 * rcu_preempt_depth(), otherwise RCU read sections modify
2081 * preempt_count().
2082 */
2083 console_may_schedule = !oops_in_progress &&
2084 preemptible() &&
2085 !rcu_preempt_depth();
2086 return 1; 2237 return 1;
2087} 2238}
2088EXPORT_SYMBOL(console_trylock); 2239EXPORT_SYMBOL(console_trylock);
@@ -2215,7 +2366,10 @@ skip:
2215 goto skip; 2366 goto skip;
2216 } 2367 }
2217 2368
2218 len += msg_print_text(msg, false, text + len, sizeof(text) - len); 2369 len += msg_print_text(msg,
2370 console_msg_format & MSG_FORMAT_SYSLOG,
2371 text + len,
2372 sizeof(text) - len);
2219 if (nr_ext_console_drivers) { 2373 if (nr_ext_console_drivers) {
2220 ext_len = msg_print_ext_header(ext_text, 2374 ext_len = msg_print_ext_header(ext_text,
2221 sizeof(ext_text), 2375 sizeof(ext_text),
@@ -2229,14 +2383,29 @@ skip:
2229 console_seq++; 2383 console_seq++;
2230 raw_spin_unlock(&logbuf_lock); 2384 raw_spin_unlock(&logbuf_lock);
2231 2385
2386 /*
2387 * While actively printing out messages, if another printk()
2388 * were to occur on another CPU, it may wait for this one to
2389 * finish. This task can not be preempted if there is a
2390 * waiter waiting to take over.
2391 */
2392 console_lock_spinning_enable();
2393
2232 stop_critical_timings(); /* don't trace print latency */ 2394 stop_critical_timings(); /* don't trace print latency */
2233 call_console_drivers(ext_text, ext_len, text, len); 2395 call_console_drivers(ext_text, ext_len, text, len);
2234 start_critical_timings(); 2396 start_critical_timings();
2397
2398 if (console_lock_spinning_disable_and_check()) {
2399 printk_safe_exit_irqrestore(flags);
2400 return;
2401 }
2402
2235 printk_safe_exit_irqrestore(flags); 2403 printk_safe_exit_irqrestore(flags);
2236 2404
2237 if (do_cond_resched) 2405 if (do_cond_resched)
2238 cond_resched(); 2406 cond_resched();
2239 } 2407 }
2408
2240 console_locked = 0; 2409 console_locked = 0;
2241 2410
2242 /* Release the exclusive_console once it is used */ 2411 /* Release the exclusive_console once it is used */
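
The console_owner/console_waiter pair above turns the tail of each console_unlock() iteration into a hand-off point: a second printer registers as the waiter and spins, and the current owner passes console_sem to it instead of releasing and re-taking it. A userspace analogue of that hand-off, built on a semaphore plus a small owner lock; this illustrates the pattern only and is not the kernel code:

#include <pthread.h>
#include <semaphore.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static sem_t console;				/* plays the role of console_sem */
static pthread_mutex_t owner_lock = PTHREAD_MUTEX_INITIALIZER;
static bool owner_active;			/* "console_owner != NULL" */
static atomic_bool waiter;			/* "console_waiter" */

static void *printer(void *arg)
{
	long id = (long)arg;
	bool spin = false;

	if (sem_trywait(&console) != 0) {
		/* Lock is busy: spin only if someone is actively printing. */
		pthread_mutex_lock(&owner_lock);
		if (owner_active && !atomic_load(&waiter)) {
			atomic_store(&waiter, true);
			spin = true;
		}
		pthread_mutex_unlock(&owner_lock);
		if (!spin)
			return NULL;	/* the current holder flushes for us */
		while (atomic_load(&waiter))
			;		/* cpu_relax() in the kernel version */
		/* The owner handed the semaphore to us without posting it. */
	}

	pthread_mutex_lock(&owner_lock);
	owner_active = true;		/* from here on, spinning on us is safe */
	pthread_mutex_unlock(&owner_lock);

	printf("printer %ld: flushing records\n", id);	/* call_console_drivers() */

	pthread_mutex_lock(&owner_lock);
	owner_active = false;
	bool handoff = atomic_exchange(&waiter, false);
	pthread_mutex_unlock(&owner_lock);

	if (!handoff)
		sem_post(&console);	/* nobody waiting: release normally */
	return NULL;			/* otherwise the waiter is the new owner */
}

int main(void)
{
	pthread_t a, b;

	sem_init(&console, 0, 1);
	pthread_create(&a, NULL, printer, (void *)0L);
	pthread_create(&b, NULL, printer, (void *)1L);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	sem_destroy(&console);
	return 0;
}

The owner_lock here mirrors console_owner_lock: the waiter flag is only set while an owner is marked active, and the owner clears its mark and collects the flag under the same lock, so the hand-off can never be missed.
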
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 84b1367935e4..5e1d713c8e61 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -659,7 +659,7 @@ static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info)
659 if (lock_task_sighand(child, &flags)) { 659 if (lock_task_sighand(child, &flags)) {
660 error = -EINVAL; 660 error = -EINVAL;
661 if (likely(child->last_siginfo != NULL)) { 661 if (likely(child->last_siginfo != NULL)) {
662 *info = *child->last_siginfo; 662 copy_siginfo(info, child->last_siginfo);
663 error = 0; 663 error = 0;
664 } 664 }
665 unlock_task_sighand(child, &flags); 665 unlock_task_sighand(child, &flags);
@@ -675,7 +675,7 @@ static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info)
675 if (lock_task_sighand(child, &flags)) { 675 if (lock_task_sighand(child, &flags)) {
676 error = -EINVAL; 676 error = -EINVAL;
677 if (likely(child->last_siginfo != NULL)) { 677 if (likely(child->last_siginfo != NULL)) {
678 *child->last_siginfo = *info; 678 copy_siginfo(child->last_siginfo, info);
679 error = 0; 679 error = 0;
680 } 680 }
681 unlock_task_sighand(child, &flags); 681 unlock_task_sighand(child, &flags);
@@ -1092,6 +1092,10 @@ int ptrace_request(struct task_struct *child, long request,
1092 ret = seccomp_get_filter(child, addr, datavp); 1092 ret = seccomp_get_filter(child, addr, datavp);
1093 break; 1093 break;
1094 1094
1095 case PTRACE_SECCOMP_GET_METADATA:
1096 ret = seccomp_get_metadata(child, addr, datavp);
1097 break;
1098
1095 default: 1099 default:
1096 break; 1100 break;
1097 } 1101 }
@@ -1226,7 +1230,6 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,
1226 break; 1230 break;
1227 1231
1228 case PTRACE_SETSIGINFO: 1232 case PTRACE_SETSIGINFO:
1229 memset(&siginfo, 0, sizeof siginfo);
1230 if (copy_siginfo_from_user32( 1233 if (copy_siginfo_from_user32(
1231 &siginfo, (struct compat_siginfo __user *) datap)) 1234 &siginfo, (struct compat_siginfo __user *) datap))
1232 ret = -EFAULT; 1235 ret = -EFAULT;
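
The new PTRACE_SECCOMP_GET_METADATA request lets a tracer read back the flags of a tracee's attached seccomp filter by index. A heavily hedged userspace sketch follows; the request number, the struct seccomp_metadata layout (filter_off in, flags out) and the "addr carries the struct size" convention are assumptions to verify against this kernel's uapi headers, and the target pid is hypothetical:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <sys/types.h>
#include <sys/ptrace.h>

#ifndef PTRACE_SECCOMP_GET_METADATA
#define PTRACE_SECCOMP_GET_METADATA 0x420d	/* assumed request number */
#endif

/* Assumed uapi layout -- check include/uapi/linux/ptrace.h. */
struct seccomp_metadata {
	uint64_t filter_off;	/* in: filter index (0 = most recent, assumed) */
	uint64_t flags;		/* out: SECCOMP_FILTER_FLAG_* of that filter */
};

int main(int argc, char **argv)
{
	pid_t pid = argc > 1 ? (pid_t)atoi(argv[1]) : -1;
	struct seccomp_metadata md = { .filter_off = 0 };

	/* The tracee must already be ptrace-attached and stopped. */
	if (ptrace(PTRACE_SECCOMP_GET_METADATA, pid, sizeof(md), &md) < 0) {
		perror("PTRACE_SECCOMP_GET_METADATA");
		return 1;
	}
	printf("filter 0 flags: 0x%llx\n", (unsigned long long)md.flags);
	return 0;
}
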
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index fbd56d6e575b..68fa19a5e7bd 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -422,11 +422,13 @@ void init_rcu_head(struct rcu_head *head)
422{ 422{
423 debug_object_init(head, &rcuhead_debug_descr); 423 debug_object_init(head, &rcuhead_debug_descr);
424} 424}
425EXPORT_SYMBOL_GPL(init_rcu_head);
425 426
426void destroy_rcu_head(struct rcu_head *head) 427void destroy_rcu_head(struct rcu_head *head)
427{ 428{
428 debug_object_free(head, &rcuhead_debug_descr); 429 debug_object_free(head, &rcuhead_debug_descr);
429} 430}
431EXPORT_SYMBOL_GPL(destroy_rcu_head);
430 432
431static bool rcuhead_is_static_object(void *addr) 433static bool rcuhead_is_static_object(void *addr)
432{ 434{
diff --git a/kernel/relay.c b/kernel/relay.c
index 39a9dfc69486..41280033a4c5 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -919,9 +919,9 @@ static int relay_file_mmap(struct file *filp, struct vm_area_struct *vma)
919 * 919 *
920 * Poll implementation. 920 * Poll implementation.
921 */ 921 */
922static unsigned int relay_file_poll(struct file *filp, poll_table *wait) 922static __poll_t relay_file_poll(struct file *filp, poll_table *wait)
923{ 923{
924 unsigned int mask = 0; 924 __poll_t mask = 0;
925 struct rchan_buf *buf = filp->private_data; 925 struct rchan_buf *buf = filp->private_data;
926 926
927 if (buf->finalized) 927 if (buf->finalized)
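
This is one of several conversions in this merge from unsigned int to the sparse-checked __poll_t type for poll handlers (posix-clock and the tracing files below get the same treatment). A minimal sketch of the resulting shape of a poll method, with an illustrative device structure:

#include <linux/poll.h>
#include <linux/fs.h>

struct demo_dev {
        wait_queue_head_t wait;
        bool data_ready;
};

static __poll_t demo_poll(struct file *filp, poll_table *wait)
{
        struct demo_dev *dev = filp->private_data;
        __poll_t mask = 0;

        poll_wait(filp, &dev->wait, wait);
        if (dev->data_ready)
                mask |= POLLIN | POLLRDNORM;
        return mask;
}
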
diff --git a/kernel/resource.c b/kernel/resource.c
index 54ba6de3757c..8c527d83ca76 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -1022,6 +1022,7 @@ static void __init __reserve_region_with_split(struct resource *root,
1022 struct resource *conflict; 1022 struct resource *conflict;
1023 struct resource *res = alloc_resource(GFP_ATOMIC); 1023 struct resource *res = alloc_resource(GFP_ATOMIC);
1024 struct resource *next_res = NULL; 1024 struct resource *next_res = NULL;
1025 int type = resource_type(root);
1025 1026
1026 if (!res) 1027 if (!res)
1027 return; 1028 return;
@@ -1029,7 +1030,7 @@ static void __init __reserve_region_with_split(struct resource *root,
1029 res->name = name; 1030 res->name = name;
1030 res->start = start; 1031 res->start = start;
1031 res->end = end; 1032 res->end = end;
1032 res->flags = IORESOURCE_BUSY; 1033 res->flags = type | IORESOURCE_BUSY;
1033 res->desc = IORES_DESC_NONE; 1034 res->desc = IORES_DESC_NONE;
1034 1035
1035 while (1) { 1036 while (1) {
@@ -1064,7 +1065,7 @@ static void __init __reserve_region_with_split(struct resource *root,
1064 next_res->name = name; 1065 next_res->name = name;
1065 next_res->start = conflict->end + 1; 1066 next_res->start = conflict->end + 1;
1066 next_res->end = end; 1067 next_res->end = end;
1067 next_res->flags = IORESOURCE_BUSY; 1068 next_res->flags = type | IORESOURCE_BUSY;
1068 next_res->desc = IORES_DESC_NONE; 1069 next_res->desc = IORES_DESC_NONE;
1069 } 1070 }
1070 } else { 1071 } else {
@@ -1478,7 +1479,7 @@ void __devm_release_region(struct device *dev, struct resource *parent,
1478EXPORT_SYMBOL(__devm_release_region); 1479EXPORT_SYMBOL(__devm_release_region);
1479 1480
1480/* 1481/*
1481 * Called from init/main.c to reserve IO ports. 1482 * Reserve I/O ports or memory based on "reserve=" kernel parameter.
1482 */ 1483 */
1483#define MAXRESERVE 4 1484#define MAXRESERVE 4
1484static int __init reserve_setup(char *str) 1485static int __init reserve_setup(char *str)
@@ -1489,26 +1490,38 @@ static int __init reserve_setup(char *str)
1489 for (;;) { 1490 for (;;) {
1490 unsigned int io_start, io_num; 1491 unsigned int io_start, io_num;
1491 int x = reserved; 1492 int x = reserved;
1493 struct resource *parent;
1492 1494
1493 if (get_option (&str, &io_start) != 2) 1495 if (get_option(&str, &io_start) != 2)
1494 break; 1496 break;
1495 if (get_option (&str, &io_num) == 0) 1497 if (get_option(&str, &io_num) == 0)
1496 break; 1498 break;
1497 if (x < MAXRESERVE) { 1499 if (x < MAXRESERVE) {
1498 struct resource *res = reserve + x; 1500 struct resource *res = reserve + x;
1501
1502 /*
1503 * If the region starts below 0x10000, we assume it's
1504 * I/O port space; otherwise assume it's memory.
1505 */
1506 if (io_start < 0x10000) {
1507 res->flags = IORESOURCE_IO;
1508 parent = &ioport_resource;
1509 } else {
1510 res->flags = IORESOURCE_MEM;
1511 parent = &iomem_resource;
1512 }
1499 res->name = "reserved"; 1513 res->name = "reserved";
1500 res->start = io_start; 1514 res->start = io_start;
1501 res->end = io_start + io_num - 1; 1515 res->end = io_start + io_num - 1;
1502 res->flags = IORESOURCE_BUSY; 1516 res->flags |= IORESOURCE_BUSY;
1503 res->desc = IORES_DESC_NONE; 1517 res->desc = IORES_DESC_NONE;
1504 res->child = NULL; 1518 res->child = NULL;
1505 if (request_resource(res->start >= 0x10000 ? &iomem_resource : &ioport_resource, res) == 0) 1519 if (request_resource(parent, res) == 0)
1506 reserved = x+1; 1520 reserved = x+1;
1507 } 1521 }
1508 } 1522 }
1509 return 1; 1523 return 1;
1510} 1524}
1511
1512__setup("reserve=", reserve_setup); 1525__setup("reserve=", reserve_setup);
1513 1526
1514/* 1527/*
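
With the hunks above, the reserve= boot parameter classifies each region by its start address: regions starting below 0x10000 are registered as I/O port space under ioport_resource, anything higher as memory under iomem_resource, and __reserve_region_with_split() now carries the parent's resource type into the pieces it creates. As an illustrative example (values made up), reserve=0x320,32 would claim I/O ports 0x320 through 0x33f, while reserve=0xfed40000,0x1000 would be treated as a reserved memory region.
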
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
index a43df5193538..bb4b9fe026a1 100644
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -1,13 +1,12 @@
1// SPDX-License-Identifier: GPL-2.0 1// SPDX-License-Identifier: GPL-2.0
2#include "sched.h"
3
4#include <linux/proc_fs.h> 2#include <linux/proc_fs.h>
5#include <linux/seq_file.h> 3#include <linux/seq_file.h>
6#include <linux/kallsyms.h>
7#include <linux/utsname.h> 4#include <linux/utsname.h>
8#include <linux/security.h> 5#include <linux/security.h>
9#include <linux/export.h> 6#include <linux/export.h>
10 7
8#include "sched.h"
9
11unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; 10unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
12static struct autogroup autogroup_default; 11static struct autogroup autogroup_default;
13static atomic_t autogroup_seq_nr; 12static atomic_t autogroup_seq_nr;
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 5f0dfb2abb8d..940fa408a288 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -515,7 +515,7 @@ void put_seccomp_filter(struct task_struct *tsk)
515 515
516static void seccomp_init_siginfo(siginfo_t *info, int syscall, int reason) 516static void seccomp_init_siginfo(siginfo_t *info, int syscall, int reason)
517{ 517{
518 memset(info, 0, sizeof(*info)); 518 clear_siginfo(info);
519 info->si_signo = SIGSYS; 519 info->si_signo = SIGSYS;
520 info->si_code = SYS_SECCOMP; 520 info->si_code = SYS_SECCOMP;
521 info->si_call_addr = (void __user *)KSTK_EIP(current); 521 info->si_call_addr = (void __user *)KSTK_EIP(current);
@@ -978,49 +978,68 @@ long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
978} 978}
979 979
980#if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE) 980#if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE)
981long seccomp_get_filter(struct task_struct *task, unsigned long filter_off, 981static struct seccomp_filter *get_nth_filter(struct task_struct *task,
982 void __user *data) 982 unsigned long filter_off)
983{ 983{
984 struct seccomp_filter *filter; 984 struct seccomp_filter *orig, *filter;
985 struct sock_fprog_kern *fprog; 985 unsigned long count;
986 long ret;
987 unsigned long count = 0;
988
989 if (!capable(CAP_SYS_ADMIN) ||
990 current->seccomp.mode != SECCOMP_MODE_DISABLED) {
991 return -EACCES;
992 }
993 986
987 /*
988 * Note: this is only correct because the caller should be the (ptrace)
989 * tracer of the task, otherwise lock_task_sighand is needed.
990 */
994 spin_lock_irq(&task->sighand->siglock); 991 spin_lock_irq(&task->sighand->siglock);
992
995 if (task->seccomp.mode != SECCOMP_MODE_FILTER) { 993 if (task->seccomp.mode != SECCOMP_MODE_FILTER) {
996 ret = -EINVAL; 994 spin_unlock_irq(&task->sighand->siglock);
997 goto out; 995 return ERR_PTR(-EINVAL);
998 } 996 }
999 997
1000 filter = task->seccomp.filter; 998 orig = task->seccomp.filter;
1001 while (filter) { 999 __get_seccomp_filter(orig);
1002 filter = filter->prev; 1000 spin_unlock_irq(&task->sighand->siglock);
1001
1002 count = 0;
1003 for (filter = orig; filter; filter = filter->prev)
1003 count++; 1004 count++;
1004 }
1005 1005
1006 if (filter_off >= count) { 1006 if (filter_off >= count) {
1007 ret = -ENOENT; 1007 filter = ERR_PTR(-ENOENT);
1008 goto out; 1008 goto out;
1009 } 1009 }
1010 count -= filter_off;
1011 1010
1012 filter = task->seccomp.filter; 1011 count -= filter_off;
1013 while (filter && count > 1) { 1012 for (filter = orig; filter && count > 1; filter = filter->prev)
1014 filter = filter->prev;
1015 count--; 1013 count--;
1016 }
1017 1014
1018 if (WARN_ON(count != 1 || !filter)) { 1015 if (WARN_ON(count != 1 || !filter)) {
1019 /* The filter tree shouldn't shrink while we're using it. */ 1016 filter = ERR_PTR(-ENOENT);
1020 ret = -ENOENT;
1021 goto out; 1017 goto out;
1022 } 1018 }
1023 1019
1020 __get_seccomp_filter(filter);
1021
1022out:
1023 __put_seccomp_filter(orig);
1024 return filter;
1025}
1026
1027long seccomp_get_filter(struct task_struct *task, unsigned long filter_off,
1028 void __user *data)
1029{
1030 struct seccomp_filter *filter;
1031 struct sock_fprog_kern *fprog;
1032 long ret;
1033
1034 if (!capable(CAP_SYS_ADMIN) ||
1035 current->seccomp.mode != SECCOMP_MODE_DISABLED) {
1036 return -EACCES;
1037 }
1038
1039 filter = get_nth_filter(task, filter_off);
1040 if (IS_ERR(filter))
1041 return PTR_ERR(filter);
1042
1024 fprog = filter->prog->orig_prog; 1043 fprog = filter->prog->orig_prog;
1025 if (!fprog) { 1044 if (!fprog) {
1026 /* This must be a new non-cBPF filter, since we save 1045 /* This must be a new non-cBPF filter, since we save
@@ -1035,17 +1054,44 @@ long seccomp_get_filter(struct task_struct *task, unsigned long filter_off,
1035 if (!data) 1054 if (!data)
1036 goto out; 1055 goto out;
1037 1056
1038 __get_seccomp_filter(filter);
1039 spin_unlock_irq(&task->sighand->siglock);
1040
1041 if (copy_to_user(data, fprog->filter, bpf_classic_proglen(fprog))) 1057 if (copy_to_user(data, fprog->filter, bpf_classic_proglen(fprog)))
1042 ret = -EFAULT; 1058 ret = -EFAULT;
1043 1059
1060out:
1044 __put_seccomp_filter(filter); 1061 __put_seccomp_filter(filter);
1045 return ret; 1062 return ret;
1063}
1046 1064
1047out: 1065long seccomp_get_metadata(struct task_struct *task,
1048 spin_unlock_irq(&task->sighand->siglock); 1066 unsigned long size, void __user *data)
1067{
1068 long ret;
1069 struct seccomp_filter *filter;
1070 struct seccomp_metadata kmd = {};
1071
1072 if (!capable(CAP_SYS_ADMIN) ||
1073 current->seccomp.mode != SECCOMP_MODE_DISABLED) {
1074 return -EACCES;
1075 }
1076
1077 size = min_t(unsigned long, size, sizeof(kmd));
1078
1079 if (copy_from_user(&kmd, data, size))
1080 return -EFAULT;
1081
1082 filter = get_nth_filter(task, kmd.filter_off);
1083 if (IS_ERR(filter))
1084 return PTR_ERR(filter);
1085
1086 memset(&kmd, 0, sizeof(kmd));
1087 if (filter->log)
1088 kmd.flags |= SECCOMP_FILTER_FLAG_LOG;
1089
1090 ret = size;
1091 if (copy_to_user(data, &kmd, size))
1092 ret = -EFAULT;
1093
1094 __put_seccomp_filter(filter);
1049 return ret; 1095 return ret;
1050} 1096}
1051#endif 1097#endif
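
The seccomp changes factor the n-th filter lookup into get_nth_filter(), which takes a reference to the newest filter under siglock and walks the ->prev chain with the lock dropped; seccomp_get_filter() and the new seccomp_get_metadata() both build on it, the latter reporting per-filter flags such as SECCOMP_FILTER_FLAG_LOG. For context, a hedged userspace sketch of installing a filter with that flag set, which is exactly what the metadata call later reports back; the headers and the allow-everything program are illustrative, and SECCOMP_FILTER_FLAG_LOG needs 4.14+ headers.

#include <unistd.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <linux/filter.h>
#include <linux/seccomp.h>

static int demo_install_logging_filter(void)
{
        struct sock_filter insns[] = {
                BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
        };
        struct sock_fprog prog = {
                .len = sizeof(insns) / sizeof(insns[0]),
                .filter = insns,
        };

        if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
                return -1;
        /* glibc has no seccomp() wrapper, so go through syscall(2) */
        if (syscall(SYS_seccomp, SECCOMP_SET_MODE_FILTER,
                    SECCOMP_FILTER_FLAG_LOG, &prog))
                return -1;
        return 0;
}
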
diff --git a/kernel/signal.c b/kernel/signal.c
index 9558664bd9ec..c6e4c83dc090 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -40,6 +40,7 @@
40#include <linux/cn_proc.h> 40#include <linux/cn_proc.h>
41#include <linux/compiler.h> 41#include <linux/compiler.h>
42#include <linux/posix-timers.h> 42#include <linux/posix-timers.h>
43#include <linux/livepatch.h>
43 44
44#define CREATE_TRACE_POINTS 45#define CREATE_TRACE_POINTS
45#include <trace/events/signal.h> 46#include <trace/events/signal.h>
@@ -165,7 +166,8 @@ void recalc_sigpending_and_wake(struct task_struct *t)
165 166
166void recalc_sigpending(void) 167void recalc_sigpending(void)
167{ 168{
168 if (!recalc_sigpending_tsk(current) && !freezing(current)) 169 if (!recalc_sigpending_tsk(current) && !freezing(current) &&
170 !klp_patch_pending(current))
169 clear_thread_flag(TIF_SIGPENDING); 171 clear_thread_flag(TIF_SIGPENDING);
170 172
171} 173}
@@ -549,6 +551,7 @@ still_pending:
549 * a fast-pathed signal or we must have been 551 * a fast-pathed signal or we must have been
550 * out of queue space. So zero out the info. 552 * out of queue space. So zero out the info.
551 */ 553 */
554 clear_siginfo(info);
552 info->si_signo = sig; 555 info->si_signo = sig;
553 info->si_errno = 0; 556 info->si_errno = 0;
554 info->si_code = SI_USER; 557 info->si_code = SI_USER;
@@ -642,6 +645,9 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
642 spin_unlock(&tsk->sighand->siglock); 645 spin_unlock(&tsk->sighand->siglock);
643 posixtimer_rearm(info); 646 posixtimer_rearm(info);
644 spin_lock(&tsk->sighand->siglock); 647 spin_lock(&tsk->sighand->siglock);
648
649 /* Don't expose the si_sys_private value to userspace */
650 info->si_sys_private = 0;
645 } 651 }
646#endif 652#endif
647 return signr; 653 return signr;
@@ -1043,6 +1049,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
1043 list_add_tail(&q->list, &pending->list); 1049 list_add_tail(&q->list, &pending->list);
1044 switch ((unsigned long) info) { 1050 switch ((unsigned long) info) {
1045 case (unsigned long) SEND_SIG_NOINFO: 1051 case (unsigned long) SEND_SIG_NOINFO:
1052 clear_siginfo(&q->info);
1046 q->info.si_signo = sig; 1053 q->info.si_signo = sig;
1047 q->info.si_errno = 0; 1054 q->info.si_errno = 0;
1048 q->info.si_code = SI_USER; 1055 q->info.si_code = SI_USER;
@@ -1051,6 +1058,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
1051 q->info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); 1058 q->info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
1052 break; 1059 break;
1053 case (unsigned long) SEND_SIG_PRIV: 1060 case (unsigned long) SEND_SIG_PRIV:
1061 clear_siginfo(&q->info);
1054 q->info.si_signo = sig; 1062 q->info.si_signo = sig;
1055 q->info.si_errno = 0; 1063 q->info.si_errno = 0;
1056 q->info.si_code = SI_KERNEL; 1064 q->info.si_code = SI_KERNEL;
@@ -1485,6 +1493,129 @@ force_sigsegv(int sig, struct task_struct *p)
1485 return 0; 1493 return 0;
1486} 1494}
1487 1495
1496int force_sig_fault(int sig, int code, void __user *addr
1497 ___ARCH_SI_TRAPNO(int trapno)
1498 ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)
1499 , struct task_struct *t)
1500{
1501 struct siginfo info;
1502
1503 clear_siginfo(&info);
1504 info.si_signo = sig;
1505 info.si_errno = 0;
1506 info.si_code = code;
1507 info.si_addr = addr;
1508#ifdef __ARCH_SI_TRAPNO
1509 info.si_trapno = trapno;
1510#endif
1511#ifdef __ia64__
1512 info.si_imm = imm;
1513 info.si_flags = flags;
1514 info.si_isr = isr;
1515#endif
1516 return force_sig_info(info.si_signo, &info, t);
1517}
1518
1519int send_sig_fault(int sig, int code, void __user *addr
1520 ___ARCH_SI_TRAPNO(int trapno)
1521 ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)
1522 , struct task_struct *t)
1523{
1524 struct siginfo info;
1525
1526 clear_siginfo(&info);
1527 info.si_signo = sig;
1528 info.si_errno = 0;
1529 info.si_code = code;
1530 info.si_addr = addr;
1531#ifdef __ARCH_SI_TRAPNO
1532 info.si_trapno = trapno;
1533#endif
1534#ifdef __ia64__
1535 info.si_imm = imm;
1536 info.si_flags = flags;
1537 info.si_isr = isr;
1538#endif
1539 return send_sig_info(info.si_signo, &info, t);
1540}
1541
1542#if defined(BUS_MCEERR_AO) && defined(BUS_MCEERR_AR)
1543int force_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *t)
1544{
1545 struct siginfo info;
1546
1547 WARN_ON((code != BUS_MCEERR_AO) && (code != BUS_MCEERR_AR));
1548 clear_siginfo(&info);
1549 info.si_signo = SIGBUS;
1550 info.si_errno = 0;
1551 info.si_code = code;
1552 info.si_addr = addr;
1553 info.si_addr_lsb = lsb;
1554 return force_sig_info(info.si_signo, &info, t);
1555}
1556
1557int send_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *t)
1558{
1559 struct siginfo info;
1560
1561 WARN_ON((code != BUS_MCEERR_AO) && (code != BUS_MCEERR_AR));
1562 clear_siginfo(&info);
1563 info.si_signo = SIGBUS;
1564 info.si_errno = 0;
1565 info.si_code = code;
1566 info.si_addr = addr;
1567 info.si_addr_lsb = lsb;
1568 return send_sig_info(info.si_signo, &info, t);
1569}
1570EXPORT_SYMBOL(send_sig_mceerr);
1571#endif
1572
1573#ifdef SEGV_BNDERR
1574int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper)
1575{
1576 struct siginfo info;
1577
1578 clear_siginfo(&info);
1579 info.si_signo = SIGSEGV;
1580 info.si_errno = 0;
1581 info.si_code = SEGV_BNDERR;
1582 info.si_addr = addr;
1583 info.si_lower = lower;
1584 info.si_upper = upper;
1585 return force_sig_info(info.si_signo, &info, current);
1586}
1587#endif
1588
1589#ifdef SEGV_PKUERR
1590int force_sig_pkuerr(void __user *addr, u32 pkey)
1591{
1592 struct siginfo info;
1593
1594 clear_siginfo(&info);
1595 info.si_signo = SIGSEGV;
1596 info.si_errno = 0;
1597 info.si_code = SEGV_PKUERR;
1598 info.si_addr = addr;
1599 info.si_pkey = pkey;
1600 return force_sig_info(info.si_signo, &info, current);
1601}
1602#endif
1603
1604/* For the crazy architectures that include trap information in
1605 * the errno field, instead of an actual errno value.
1606 */
1607int force_sig_ptrace_errno_trap(int errno, void __user *addr)
1608{
1609 struct siginfo info;
1610
1611 clear_siginfo(&info);
1612 info.si_signo = SIGTRAP;
1613 info.si_errno = errno;
1614 info.si_code = TRAP_HWBKPT;
1615 info.si_addr = addr;
1616 return force_sig_info(info.si_signo, &info, current);
1617}
1618
1488int kill_pgrp(struct pid *pid, int sig, int priv) 1619int kill_pgrp(struct pid *pid, int sig, int priv)
1489{ 1620{
1490 int ret; 1621 int ret;
@@ -1623,6 +1754,7 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
1623 sig = SIGCHLD; 1754 sig = SIGCHLD;
1624 } 1755 }
1625 1756
1757 clear_siginfo(&info);
1626 info.si_signo = sig; 1758 info.si_signo = sig;
1627 info.si_errno = 0; 1759 info.si_errno = 0;
1628 /* 1760 /*
@@ -1717,6 +1849,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
1717 parent = tsk->real_parent; 1849 parent = tsk->real_parent;
1718 } 1850 }
1719 1851
1852 clear_siginfo(&info);
1720 info.si_signo = SIGCHLD; 1853 info.si_signo = SIGCHLD;
1721 info.si_errno = 0; 1854 info.si_errno = 0;
1722 /* 1855 /*
@@ -1929,7 +2062,7 @@ static void ptrace_do_notify(int signr, int exit_code, int why)
1929{ 2062{
1930 siginfo_t info; 2063 siginfo_t info;
1931 2064
1932 memset(&info, 0, sizeof info); 2065 clear_siginfo(&info);
1933 info.si_signo = signr; 2066 info.si_signo = signr;
1934 info.si_code = exit_code; 2067 info.si_code = exit_code;
1935 info.si_pid = task_pid_vnr(current); 2068 info.si_pid = task_pid_vnr(current);
@@ -2136,6 +2269,7 @@ static int ptrace_signal(int signr, siginfo_t *info)
2136 * have updated *info via PTRACE_SETSIGINFO. 2269 * have updated *info via PTRACE_SETSIGINFO.
2137 */ 2270 */
2138 if (signr != info->si_signo) { 2271 if (signr != info->si_signo) {
2272 clear_siginfo(info);
2139 info->si_signo = signr; 2273 info->si_signo = signr;
2140 info->si_errno = 0; 2274 info->si_errno = 0;
2141 info->si_code = SI_USER; 2275 info->si_code = SI_USER;
@@ -2688,9 +2822,7 @@ enum siginfo_layout siginfo_layout(int sig, int si_code)
2688#endif 2822#endif
2689 [SIGCHLD] = { NSIGCHLD, SIL_CHLD }, 2823 [SIGCHLD] = { NSIGCHLD, SIL_CHLD },
2690 [SIGPOLL] = { NSIGPOLL, SIL_POLL }, 2824 [SIGPOLL] = { NSIGPOLL, SIL_POLL },
2691#ifdef __ARCH_SIGSYS
2692 [SIGSYS] = { NSIGSYS, SIL_SYS }, 2825 [SIGSYS] = { NSIGSYS, SIL_SYS },
2693#endif
2694 }; 2826 };
2695 if ((sig < ARRAY_SIZE(filter)) && (si_code <= filter[sig].limit)) 2827 if ((sig < ARRAY_SIZE(filter)) && (si_code <= filter[sig].limit))
2696 layout = filter[sig].layout; 2828 layout = filter[sig].layout;
@@ -2712,12 +2844,14 @@ enum siginfo_layout siginfo_layout(int sig, int si_code)
2712 if ((sig == SIGFPE) && (si_code == FPE_FIXME)) 2844 if ((sig == SIGFPE) && (si_code == FPE_FIXME))
2713 layout = SIL_FAULT; 2845 layout = SIL_FAULT;
2714#endif 2846#endif
2847#ifdef BUS_FIXME
2848 if ((sig == SIGBUS) && (si_code == BUS_FIXME))
2849 layout = SIL_FAULT;
2850#endif
2715 } 2851 }
2716 return layout; 2852 return layout;
2717} 2853}
2718 2854
2719#ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER
2720
2721int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) 2855int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
2722{ 2856{
2723 int err; 2857 int err;
@@ -2756,13 +2890,21 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
2756#ifdef __ARCH_SI_TRAPNO 2890#ifdef __ARCH_SI_TRAPNO
2757 err |= __put_user(from->si_trapno, &to->si_trapno); 2891 err |= __put_user(from->si_trapno, &to->si_trapno);
2758#endif 2892#endif
2759#ifdef BUS_MCEERR_AO 2893#ifdef __ia64__
2894 err |= __put_user(from->si_imm, &to->si_imm);
2895 err |= __put_user(from->si_flags, &to->si_flags);
2896 err |= __put_user(from->si_isr, &to->si_isr);
2897#endif
2760 /* 2898 /*
2761 * Other callers might not initialize the si_lsb field, 2899 * Other callers might not initialize the si_lsb field,
2762 * so check explicitly for the right codes here. 2900 * so check explicitly for the right codes here.
2763 */ 2901 */
2764 if (from->si_signo == SIGBUS && 2902#ifdef BUS_MCEERR_AR
2765 (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO)) 2903 if (from->si_signo == SIGBUS && from->si_code == BUS_MCEERR_AR)
2904 err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb);
2905#endif
2906#ifdef BUS_MCEERR_AO
2907 if (from->si_signo == SIGBUS && from->si_code == BUS_MCEERR_AO)
2766 err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); 2908 err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb);
2767#endif 2909#endif
2768#ifdef SEGV_BNDERR 2910#ifdef SEGV_BNDERR
@@ -2788,18 +2930,185 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
2788 err |= __put_user(from->si_uid, &to->si_uid); 2930 err |= __put_user(from->si_uid, &to->si_uid);
2789 err |= __put_user(from->si_ptr, &to->si_ptr); 2931 err |= __put_user(from->si_ptr, &to->si_ptr);
2790 break; 2932 break;
2791#ifdef __ARCH_SIGSYS
2792 case SIL_SYS: 2933 case SIL_SYS:
2793 err |= __put_user(from->si_call_addr, &to->si_call_addr); 2934 err |= __put_user(from->si_call_addr, &to->si_call_addr);
2794 err |= __put_user(from->si_syscall, &to->si_syscall); 2935 err |= __put_user(from->si_syscall, &to->si_syscall);
2795 err |= __put_user(from->si_arch, &to->si_arch); 2936 err |= __put_user(from->si_arch, &to->si_arch);
2796 break; 2937 break;
2797#endif
2798 } 2938 }
2799 return err; 2939 return err;
2800} 2940}
2801 2941
2942#ifdef CONFIG_COMPAT
2943int copy_siginfo_to_user32(struct compat_siginfo __user *to,
2944 const struct siginfo *from)
2945#if defined(CONFIG_X86_X32_ABI) || defined(CONFIG_IA32_EMULATION)
2946{
2947 return __copy_siginfo_to_user32(to, from, in_x32_syscall());
2948}
2949int __copy_siginfo_to_user32(struct compat_siginfo __user *to,
2950 const struct siginfo *from, bool x32_ABI)
2951#endif
2952{
2953 struct compat_siginfo new;
2954 memset(&new, 0, sizeof(new));
2955
2956 new.si_signo = from->si_signo;
2957 new.si_errno = from->si_errno;
2958 new.si_code = from->si_code;
2959 switch(siginfo_layout(from->si_signo, from->si_code)) {
2960 case SIL_KILL:
2961 new.si_pid = from->si_pid;
2962 new.si_uid = from->si_uid;
2963 break;
2964 case SIL_TIMER:
2965 new.si_tid = from->si_tid;
2966 new.si_overrun = from->si_overrun;
2967 new.si_int = from->si_int;
2968 break;
2969 case SIL_POLL:
2970 new.si_band = from->si_band;
2971 new.si_fd = from->si_fd;
2972 break;
2973 case SIL_FAULT:
2974 new.si_addr = ptr_to_compat(from->si_addr);
2975#ifdef __ARCH_SI_TRAPNO
2976 new.si_trapno = from->si_trapno;
2977#endif
2978#ifdef BUS_MCEERR_AR
2979 if ((from->si_signo == SIGBUS) && (from->si_code == BUS_MCEERR_AR))
2980 new.si_addr_lsb = from->si_addr_lsb;
2981#endif
2982#ifdef BUS_MCEERR_AO
2983 if ((from->si_signo == SIGBUS) && (from->si_code == BUS_MCEERR_AO))
2984 new.si_addr_lsb = from->si_addr_lsb;
2985#endif
2986#ifdef SEGV_BNDERR
2987 if ((from->si_signo == SIGSEGV) &&
2988 (from->si_code == SEGV_BNDERR)) {
2989 new.si_lower = ptr_to_compat(from->si_lower);
2990 new.si_upper = ptr_to_compat(from->si_upper);
2991 }
2992#endif
2993#ifdef SEGV_PKUERR
2994 if ((from->si_signo == SIGSEGV) &&
2995 (from->si_code == SEGV_PKUERR))
2996 new.si_pkey = from->si_pkey;
2997#endif
2998
2999 break;
3000 case SIL_CHLD:
3001 new.si_pid = from->si_pid;
3002 new.si_uid = from->si_uid;
3003 new.si_status = from->si_status;
3004#ifdef CONFIG_X86_X32_ABI
3005 if (x32_ABI) {
3006 new._sifields._sigchld_x32._utime = from->si_utime;
3007 new._sifields._sigchld_x32._stime = from->si_stime;
3008 } else
3009#endif
3010 {
3011 new.si_utime = from->si_utime;
3012 new.si_stime = from->si_stime;
3013 }
3014 break;
3015 case SIL_RT:
3016 new.si_pid = from->si_pid;
3017 new.si_uid = from->si_uid;
3018 new.si_int = from->si_int;
3019 break;
3020 case SIL_SYS:
3021 new.si_call_addr = ptr_to_compat(from->si_call_addr);
3022 new.si_syscall = from->si_syscall;
3023 new.si_arch = from->si_arch;
3024 break;
3025 }
3026
3027 if (copy_to_user(to, &new, sizeof(struct compat_siginfo)))
3028 return -EFAULT;
3029
3030 return 0;
3031}
3032
3033int copy_siginfo_from_user32(struct siginfo *to,
3034 const struct compat_siginfo __user *ufrom)
3035{
3036 struct compat_siginfo from;
3037
3038 if (copy_from_user(&from, ufrom, sizeof(struct compat_siginfo)))
3039 return -EFAULT;
3040
3041 clear_siginfo(to);
3042 to->si_signo = from.si_signo;
3043 to->si_errno = from.si_errno;
3044 to->si_code = from.si_code;
3045 switch(siginfo_layout(from.si_signo, from.si_code)) {
3046 case SIL_KILL:
3047 to->si_pid = from.si_pid;
3048 to->si_uid = from.si_uid;
3049 break;
3050 case SIL_TIMER:
3051 to->si_tid = from.si_tid;
3052 to->si_overrun = from.si_overrun;
3053 to->si_int = from.si_int;
3054 break;
3055 case SIL_POLL:
3056 to->si_band = from.si_band;
3057 to->si_fd = from.si_fd;
3058 break;
3059 case SIL_FAULT:
3060 to->si_addr = compat_ptr(from.si_addr);
3061#ifdef __ARCH_SI_TRAPNO
3062 to->si_trapno = from.si_trapno;
3063#endif
3064#ifdef BUS_MCEERR_AR
3065 if ((from.si_signo == SIGBUS) && (from.si_code == BUS_MCEERR_AR))
3066 to->si_addr_lsb = from.si_addr_lsb;
3067#endif
3068#ifdef BUS_MCEER_AO
3069 if ((from.si_signo == SIGBUS) && (from.si_code == BUS_MCEERR_AO))
3070 to->si_addr_lsb = from.si_addr_lsb;
3071#endif
3072#ifdef SEGV_BNDERR
3073 if ((from.si_signo == SIGSEGV) && (from.si_code == SEGV_BNDERR)) {
3074 to->si_lower = compat_ptr(from.si_lower);
3075 to->si_upper = compat_ptr(from.si_upper);
3076 }
3077#endif
3078#ifdef SEGV_PKUERR
3079 if ((from.si_signo == SIGSEGV) && (from.si_code == SEGV_PKUERR))
3080 to->si_pkey = from.si_pkey;
3081#endif
3082 break;
3083 case SIL_CHLD:
3084 to->si_pid = from.si_pid;
3085 to->si_uid = from.si_uid;
3086 to->si_status = from.si_status;
3087#ifdef CONFIG_X86_X32_ABI
3088 if (in_x32_syscall()) {
3089 to->si_utime = from._sifields._sigchld_x32._utime;
3090 to->si_stime = from._sifields._sigchld_x32._stime;
3091 } else
2802#endif 3092#endif
3093 {
3094 to->si_utime = from.si_utime;
3095 to->si_stime = from.si_stime;
3096 }
3097 break;
3098 case SIL_RT:
3099 to->si_pid = from.si_pid;
3100 to->si_uid = from.si_uid;
3101 to->si_int = from.si_int;
3102 break;
3103 case SIL_SYS:
3104 to->si_call_addr = compat_ptr(from.si_call_addr);
3105 to->si_syscall = from.si_syscall;
3106 to->si_arch = from.si_arch;
3107 break;
3108 }
3109 return 0;
3110}
3111#endif /* CONFIG_COMPAT */
2803 3112
2804/** 3113/**
2805 * do_sigtimedwait - wait for queued signals specified in @which 3114 * do_sigtimedwait - wait for queued signals specified in @which
@@ -2937,6 +3246,7 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
2937{ 3246{
2938 struct siginfo info; 3247 struct siginfo info;
2939 3248
3249 clear_siginfo(&info);
2940 info.si_signo = sig; 3250 info.si_signo = sig;
2941 info.si_errno = 0; 3251 info.si_errno = 0;
2942 info.si_code = SI_USER; 3252 info.si_code = SI_USER;
@@ -2978,8 +3288,9 @@ do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info)
2978 3288
2979static int do_tkill(pid_t tgid, pid_t pid, int sig) 3289static int do_tkill(pid_t tgid, pid_t pid, int sig)
2980{ 3290{
2981 struct siginfo info = {}; 3291 struct siginfo info;
2982 3292
3293 clear_siginfo(&info);
2983 info.si_signo = sig; 3294 info.si_signo = sig;
2984 info.si_errno = 0; 3295 info.si_errno = 0;
2985 info.si_code = SI_TKILL; 3296 info.si_code = SI_TKILL;
@@ -3060,7 +3371,7 @@ COMPAT_SYSCALL_DEFINE3(rt_sigqueueinfo,
3060 int, sig, 3371 int, sig,
3061 struct compat_siginfo __user *, uinfo) 3372 struct compat_siginfo __user *, uinfo)
3062{ 3373{
3063 siginfo_t info = {}; 3374 siginfo_t info;
3064 int ret = copy_siginfo_from_user32(&info, uinfo); 3375 int ret = copy_siginfo_from_user32(&info, uinfo);
3065 if (unlikely(ret)) 3376 if (unlikely(ret))
3066 return ret; 3377 return ret;
@@ -3104,7 +3415,7 @@ COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo,
3104 int, sig, 3415 int, sig,
3105 struct compat_siginfo __user *, uinfo) 3416 struct compat_siginfo __user *, uinfo)
3106{ 3417{
3107 siginfo_t info = {}; 3418 siginfo_t info;
3108 3419
3109 if (copy_siginfo_from_user32(&info, uinfo)) 3420 if (copy_siginfo_from_user32(&info, uinfo))
3110 return -EFAULT; 3421 return -EFAULT;
@@ -3677,6 +3988,7 @@ void __init signals_init(void)
3677 /* If this check fails, the __ARCH_SI_PREAMBLE_SIZE value is wrong! */ 3988 /* If this check fails, the __ARCH_SI_PREAMBLE_SIZE value is wrong! */
3678 BUILD_BUG_ON(__ARCH_SI_PREAMBLE_SIZE 3989 BUILD_BUG_ON(__ARCH_SI_PREAMBLE_SIZE
3679 != offsetof(struct siginfo, _sifields._pad)); 3990 != offsetof(struct siginfo, _sifields._pad));
3991 BUILD_BUG_ON(sizeof(struct siginfo) != SI_MAX_SIZE);
3680 3992
3681 sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC); 3993 sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC);
3682} 3994}
@@ -3684,26 +3996,25 @@ void __init signals_init(void)
3684#ifdef CONFIG_KGDB_KDB 3996#ifdef CONFIG_KGDB_KDB
3685#include <linux/kdb.h> 3997#include <linux/kdb.h>
3686/* 3998/*
3687 * kdb_send_sig_info - Allows kdb to send signals without exposing 3999 * kdb_send_sig - Allows kdb to send signals without exposing
3688 * signal internals. This function checks if the required locks are 4000 * signal internals. This function checks if the required locks are
3689 * available before calling the main signal code, to avoid kdb 4001 * available before calling the main signal code, to avoid kdb
3690 * deadlocks. 4002 * deadlocks.
3691 */ 4003 */
3692void 4004void kdb_send_sig(struct task_struct *t, int sig)
3693kdb_send_sig_info(struct task_struct *t, struct siginfo *info)
3694{ 4005{
3695 static struct task_struct *kdb_prev_t; 4006 static struct task_struct *kdb_prev_t;
3696 int sig, new_t; 4007 int new_t, ret;
3697 if (!spin_trylock(&t->sighand->siglock)) { 4008 if (!spin_trylock(&t->sighand->siglock)) {
3698 kdb_printf("Can't do kill command now.\n" 4009 kdb_printf("Can't do kill command now.\n"
3699 "The sigmask lock is held somewhere else in " 4010 "The sigmask lock is held somewhere else in "
3700 "kernel, try again later\n"); 4011 "kernel, try again later\n");
3701 return; 4012 return;
3702 } 4013 }
3703 spin_unlock(&t->sighand->siglock);
3704 new_t = kdb_prev_t != t; 4014 new_t = kdb_prev_t != t;
3705 kdb_prev_t = t; 4015 kdb_prev_t = t;
3706 if (t->state != TASK_RUNNING && new_t) { 4016 if (t->state != TASK_RUNNING && new_t) {
4017 spin_unlock(&t->sighand->siglock);
3707 kdb_printf("Process is not RUNNING, sending a signal from " 4018 kdb_printf("Process is not RUNNING, sending a signal from "
3708 "kdb risks deadlock\n" 4019 "kdb risks deadlock\n"
3709 "on the run queue locks. " 4020 "on the run queue locks. "
@@ -3712,8 +4023,9 @@ kdb_send_sig_info(struct task_struct *t, struct siginfo *info)
3712 "the deadlock.\n"); 4023 "the deadlock.\n");
3713 return; 4024 return;
3714 } 4025 }
3715 sig = info->si_signo; 4026 ret = send_signal(sig, SEND_SIG_PRIV, t, false);
3716 if (send_sig_info(sig, info, t)) 4027 spin_unlock(&t->sighand->siglock);
4028 if (ret)
3717 kdb_printf("Fail to deliver Signal %d to process %d.\n", 4029 kdb_printf("Fail to deliver Signal %d to process %d.\n",
3718 sig, t->pid); 4030 sig, t->pid);
3719 else 4031 else
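
Besides the widespread clear_siginfo() conversions, the signal.c hunks add force_sig_fault(), send_sig_fault() and related helpers so architecture fault paths stop hand-rolling siginfo, and they teach the compat copy routines about every siginfo layout. A hedged sketch of how an architecture might call the new helper; the header location and exact argument list are assumptions (the trapno/ia64 parameters compile away on most architectures, and at this point the helper still takes the target task explicitly):

#include <linux/sched/signal.h>
#include <linux/signal.h>

/* illustrative: report a bad user access from a fault handler */
static void demo_report_user_fault(unsigned long address)
{
        force_sig_fault(SIGSEGV, SEGV_MAPERR,
                        (void __user *)address, current);
}
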
diff --git a/kernel/sys.c b/kernel/sys.c
index 83ffd7dccf23..f2289de20e19 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -135,7 +135,7 @@ EXPORT_SYMBOL(overflowgid);
135 */ 135 */
136 136
137int fs_overflowuid = DEFAULT_FS_OVERFLOWUID; 137int fs_overflowuid = DEFAULT_FS_OVERFLOWUID;
138int fs_overflowgid = DEFAULT_FS_OVERFLOWUID; 138int fs_overflowgid = DEFAULT_FS_OVERFLOWGID;
139 139
140EXPORT_SYMBOL(fs_overflowuid); 140EXPORT_SYMBOL(fs_overflowuid);
141EXPORT_SYMBOL(fs_overflowgid); 141EXPORT_SYMBOL(fs_overflowgid);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 557d46728577..2fb4e27c636a 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1374,13 +1374,6 @@ static struct ctl_table vm_table[] = {
1374 .mode = 0644, 1374 .mode = 0644,
1375 .proc_handler = proc_dointvec, 1375 .proc_handler = proc_dointvec,
1376 }, 1376 },
1377 {
1378 .procname = "hugepages_treat_as_movable",
1379 .data = &hugepages_treat_as_movable,
1380 .maxlen = sizeof(int),
1381 .mode = 0644,
1382 .proc_handler = proc_dointvec,
1383 },
1384 { 1377 {
1385 .procname = "nr_overcommit_hugepages", 1378 .procname = "nr_overcommit_hugepages",
1386 .data = NULL, 1379 .data = NULL,
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index cc91d90abd84..94ad46d50b56 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -68,10 +68,10 @@ static ssize_t posix_clock_read(struct file *fp, char __user *buf,
68 return err; 68 return err;
69} 69}
70 70
71static unsigned int posix_clock_poll(struct file *fp, poll_table *wait) 71static __poll_t posix_clock_poll(struct file *fp, poll_table *wait)
72{ 72{
73 struct posix_clock *clk = get_posix_clock(fp); 73 struct posix_clock *clk = get_posix_clock(fp);
74 unsigned int result = 0; 74 __poll_t result = 0;
75 75
76 if (!clk) 76 if (!clk)
77 return POLLERR; 77 return POLLERR;
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index ec999f32c840..75043046914e 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -462,7 +462,7 @@ static struct k_itimer * alloc_posix_timer(void)
462 kmem_cache_free(posix_timers_cache, tmr); 462 kmem_cache_free(posix_timers_cache, tmr);
463 return NULL; 463 return NULL;
464 } 464 }
465 memset(&tmr->sigq->info, 0, sizeof(siginfo_t)); 465 clear_siginfo(&tmr->sigq->info);
466 return tmr; 466 return tmr;
467} 467}
468 468
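
clear_siginfo() keeps appearing throughout this merge because it funnels every siginfo initialization through one helper, which makes it easy to guarantee that no uninitialized padding can later be copied to userspace. At this point in the series it is essentially a memset wrapper; the following is an approximate reconstruction, deliberately given a different name (check include/linux/signal.h for the authoritative definition):

#include <linux/signal.h>
#include <linux/string.h>

/* mirrors what clear_siginfo() does at this point in the series */
static inline void demo_clear_siginfo(struct siginfo *info)
{
        memset(info, 0, sizeof(*info));
}
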
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index f54dc62b599c..0b249e2f0c3c 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -530,6 +530,15 @@ config FUNCTION_PROFILER
530 530
531 If in doubt, say N. 531 If in doubt, say N.
532 532
533config BPF_KPROBE_OVERRIDE
534 bool "Enable BPF programs to override a kprobed function"
535 depends on BPF_EVENTS
536 depends on FUNCTION_ERROR_INJECTION
537 default n
538 help
539 Allows BPF to override the execution of a probed function and
540 set a different return value. This is used for error injection.
541
533config FTRACE_MCOUNT_RECORD 542config FTRACE_MCOUNT_RECORD
534 def_bool y 543 def_bool y
535 depends on DYNAMIC_FTRACE 544 depends on DYNAMIC_FTRACE
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 40207c2a4113..fc2838ac8b78 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -13,6 +13,10 @@
13#include <linux/filter.h> 13#include <linux/filter.h>
14#include <linux/uaccess.h> 14#include <linux/uaccess.h>
15#include <linux/ctype.h> 15#include <linux/ctype.h>
16#include <linux/kprobes.h>
17#include <linux/error-injection.h>
18
19#include "trace_probe.h"
16#include "trace.h" 20#include "trace.h"
17 21
18u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); 22u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
@@ -76,6 +80,23 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
76} 80}
77EXPORT_SYMBOL_GPL(trace_call_bpf); 81EXPORT_SYMBOL_GPL(trace_call_bpf);
78 82
83#ifdef CONFIG_BPF_KPROBE_OVERRIDE
84BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc)
85{
86 regs_set_return_value(regs, rc);
87 override_function_with_return(regs);
88 return 0;
89}
90
91static const struct bpf_func_proto bpf_override_return_proto = {
92 .func = bpf_override_return,
93 .gpl_only = true,
94 .ret_type = RET_INTEGER,
95 .arg1_type = ARG_PTR_TO_CTX,
96 .arg2_type = ARG_ANYTHING,
97};
98#endif
99
79BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr) 100BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr)
80{ 101{
81 int ret; 102 int ret;
@@ -224,7 +245,7 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
224 */ 245 */
225#define __BPF_TP_EMIT() __BPF_ARG3_TP() 246#define __BPF_TP_EMIT() __BPF_ARG3_TP()
226#define __BPF_TP(...) \ 247#define __BPF_TP(...) \
227 __trace_printk(1 /* Fake ip will not be printed. */, \ 248 __trace_printk(0 /* Fake ip */, \
228 fmt, ##__VA_ARGS__) 249 fmt, ##__VA_ARGS__)
229 250
230#define __BPF_ARG1_TP(...) \ 251#define __BPF_ARG1_TP(...) \
@@ -556,6 +577,10 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
556 return &bpf_get_stackid_proto; 577 return &bpf_get_stackid_proto;
557 case BPF_FUNC_perf_event_read_value: 578 case BPF_FUNC_perf_event_read_value:
558 return &bpf_perf_event_read_value_proto; 579 return &bpf_perf_event_read_value_proto;
580#ifdef CONFIG_BPF_KPROBE_OVERRIDE
581 case BPF_FUNC_override_return:
582 return &bpf_override_return_proto;
583#endif
559 default: 584 default:
560 return tracing_func_proto(func_id); 585 return tracing_func_proto(func_id);
561 } 586 }
@@ -773,6 +798,15 @@ int perf_event_attach_bpf_prog(struct perf_event *event,
773 struct bpf_prog_array *new_array; 798 struct bpf_prog_array *new_array;
774 int ret = -EEXIST; 799 int ret = -EEXIST;
775 800
801 /*
802 * Kprobe override only works if they are on the function entry,
803 * and only if they are on the opt-in list.
804 */
805 if (prog->kprobe_override &&
806 (!trace_kprobe_on_func_entry(event->tp_event) ||
807 !trace_kprobe_error_injectable(event->tp_event)))
808 return -EINVAL;
809
776 mutex_lock(&bpf_event_mutex); 810 mutex_lock(&bpf_event_mutex);
777 811
778 if (event->prog) 812 if (event->prog)
@@ -825,3 +859,26 @@ void perf_event_detach_bpf_prog(struct perf_event *event)
825unlock: 859unlock:
826 mutex_unlock(&bpf_event_mutex); 860 mutex_unlock(&bpf_event_mutex);
827} 861}
862
863int perf_event_query_prog_array(struct perf_event *event, void __user *info)
864{
865 struct perf_event_query_bpf __user *uquery = info;
866 struct perf_event_query_bpf query = {};
867 int ret;
868
869 if (!capable(CAP_SYS_ADMIN))
870 return -EPERM;
871 if (event->attr.type != PERF_TYPE_TRACEPOINT)
872 return -EINVAL;
873 if (copy_from_user(&query, uquery, sizeof(query)))
874 return -EFAULT;
875
876 mutex_lock(&bpf_event_mutex);
877 ret = bpf_prog_array_copy_info(event->tp_event->prog_array,
878 uquery->ids,
879 query.ids_len,
880 &uquery->prog_cnt);
881 mutex_unlock(&bpf_event_mutex);
882
883 return ret;
884}
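
The bpf_trace.c hunks add the bpf_override_return() helper (GPL-only, usable only from kprobe programs attached at a function entry that is on the error-injection allow list, as enforced in perf_event_attach_bpf_prog() above) plus a perf path for querying attached programs. A hedged sketch of a program using the helper; the section name, headers and target symbol are illustrative, and the target must be error-injection capable on the running kernel with CONFIG_BPF_KPROBE_OVERRIDE enabled.

#include <linux/bpf.h>
#include <linux/ptrace.h>
#include <bpf/bpf_helpers.h>   /* or the selftests' bpf_helpers.h on trees of this era */

SEC("kprobe/should_failslab")
int demo_inject_enomem(struct pt_regs *ctx)
{
        /* make the probed function return -ENOMEM without running its body */
        bpf_override_return(ctx, (unsigned long)-12 /* -ENOMEM */);
        return 0;
}

char _license[] SEC("license") = "GPL";
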
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 554b517c61a0..dabd9d167d42 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -5015,7 +5015,6 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
5015 5015
5016 parser = &iter->parser; 5016 parser = &iter->parser;
5017 if (trace_parser_loaded(parser)) { 5017 if (trace_parser_loaded(parser)) {
5018 parser->buffer[parser->idx] = 0;
5019 ftrace_match_records(iter->hash, parser->buffer, parser->idx); 5018 ftrace_match_records(iter->hash, parser->buffer, parser->idx);
5020 } 5019 }
5021 5020
@@ -5329,7 +5328,6 @@ ftrace_graph_release(struct inode *inode, struct file *file)
5329 parser = &fgd->parser; 5328 parser = &fgd->parser;
5330 5329
5331 if (trace_parser_loaded((parser))) { 5330 if (trace_parser_loaded((parser))) {
5332 parser->buffer[parser->idx] = 0;
5333 ret = ftrace_graph_set_hash(fgd->new_hash, 5331 ret = ftrace_graph_set_hash(fgd->new_hash,
5334 parser->buffer); 5332 parser->buffer);
5335 } 5333 }
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 5af2842dea96..ca6930e0d25e 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -630,7 +630,7 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full)
630 * Returns POLLIN | POLLRDNORM if data exists in the buffers, 630 * Returns POLLIN | POLLRDNORM if data exists in the buffers,
631 * zero otherwise. 631 * zero otherwise.
632 */ 632 */
633int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu, 633__poll_t ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu,
634 struct file *filp, poll_table *poll_table) 634 struct file *filp, poll_table *poll_table)
635{ 635{
636 struct ring_buffer_per_cpu *cpu_buffer; 636 struct ring_buffer_per_cpu *cpu_buffer;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4f3a8e24b426..56608538a4ad 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -530,8 +530,6 @@ int trace_pid_write(struct trace_pid_list *filtered_pids,
530 ubuf += ret; 530 ubuf += ret;
531 cnt -= ret; 531 cnt -= ret;
532 532
533 parser.buffer[parser.idx] = 0;
534
535 ret = -EINVAL; 533 ret = -EINVAL;
536 if (kstrtoul(parser.buffer, 0, &val)) 534 if (kstrtoul(parser.buffer, 0, &val))
537 break; 535 break;
@@ -1236,18 +1234,18 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
1236 cnt--; 1234 cnt--;
1237 } 1235 }
1238 1236
1237 parser->idx = 0;
1238
1239 /* only spaces were written */ 1239 /* only spaces were written */
1240 if (isspace(ch)) { 1240 if (isspace(ch) || !ch) {
1241 *ppos += read; 1241 *ppos += read;
1242 ret = read; 1242 ret = read;
1243 goto out; 1243 goto out;
1244 } 1244 }
1245
1246 parser->idx = 0;
1247 } 1245 }
1248 1246
1249 /* read the non-space input */ 1247 /* read the non-space input */
1250 while (cnt && !isspace(ch)) { 1248 while (cnt && !isspace(ch) && ch) {
1251 if (parser->idx < parser->size - 1) 1249 if (parser->idx < parser->size - 1)
1252 parser->buffer[parser->idx++] = ch; 1250 parser->buffer[parser->idx++] = ch;
1253 else { 1251 else {
@@ -1262,12 +1260,14 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
1262 } 1260 }
1263 1261
1264 /* We either got finished input or we have to wait for another call. */ 1262 /* We either got finished input or we have to wait for another call. */
1265 if (isspace(ch)) { 1263 if (isspace(ch) || !ch) {
1266 parser->buffer[parser->idx] = 0; 1264 parser->buffer[parser->idx] = 0;
1267 parser->cont = false; 1265 parser->cont = false;
1268 } else if (parser->idx < parser->size - 1) { 1266 } else if (parser->idx < parser->size - 1) {
1269 parser->cont = true; 1267 parser->cont = true;
1270 parser->buffer[parser->idx++] = ch; 1268 parser->buffer[parser->idx++] = ch;
1269 /* Make sure the parsed string always terminates with '\0'. */
1270 parser->buffer[parser->idx] = 0;
1271 } else { 1271 } else {
1272 ret = -EINVAL; 1272 ret = -EINVAL;
1273 goto out; 1273 goto out;
@@ -5616,7 +5616,7 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
5616 return 0; 5616 return 0;
5617} 5617}
5618 5618
5619static unsigned int 5619static __poll_t
5620trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_table) 5620trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_table)
5621{ 5621{
5622 struct trace_array *tr = iter->tr; 5622 struct trace_array *tr = iter->tr;
@@ -5635,7 +5635,7 @@ trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_tabl
5635 filp, poll_table); 5635 filp, poll_table);
5636} 5636}
5637 5637
5638static unsigned int 5638static __poll_t
5639tracing_poll_pipe(struct file *filp, poll_table *poll_table) 5639tracing_poll_pipe(struct file *filp, poll_table *poll_table)
5640{ 5640{
5641 struct trace_iterator *iter = filp->private_data; 5641 struct trace_iterator *iter = filp->private_data;
@@ -6589,7 +6589,7 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp)
6589 return ret; 6589 return ret;
6590} 6590}
6591 6591
6592static unsigned int 6592static __poll_t
6593tracing_buffers_poll(struct file *filp, poll_table *poll_table) 6593tracing_buffers_poll(struct file *filp, poll_table *poll_table)
6594{ 6594{
6595 struct ftrace_buffer_info *info = filp->private_data; 6595 struct ftrace_buffer_info *info = filp->private_data;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 1b87157edbff..05c7172c6667 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -885,8 +885,6 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
885 if (*parser.buffer == '!') 885 if (*parser.buffer == '!')
886 set = 0; 886 set = 0;
887 887
888 parser.buffer[parser.idx] = 0;
889
890 ret = ftrace_set_clr_event(tr, parser.buffer + !set, set); 888 ret = ftrace_set_clr_event(tr, parser.buffer + !set, set);
891 if (ret) 889 if (ret)
892 goto out_put; 890 goto out_put;
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 492700c5fb4d..1fad24acd444 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -21,6 +21,7 @@
21#include <linux/module.h> 21#include <linux/module.h>
22#include <linux/uaccess.h> 22#include <linux/uaccess.h>
23#include <linux/rculist.h> 23#include <linux/rculist.h>
24#include <linux/error-injection.h>
24 25
25#include "trace_probe.h" 26#include "trace_probe.h"
26 27
@@ -42,7 +43,6 @@ struct trace_kprobe {
42 (offsetof(struct trace_kprobe, tp.args) + \ 43 (offsetof(struct trace_kprobe, tp.args) + \
43 (sizeof(struct probe_arg) * (n))) 44 (sizeof(struct probe_arg) * (n)))
44 45
45
46static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk) 46static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk)
47{ 47{
48 return tk->rp.handler != NULL; 48 return tk->rp.handler != NULL;
@@ -87,6 +87,30 @@ static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk)
87 return nhit; 87 return nhit;
88} 88}
89 89
90bool trace_kprobe_on_func_entry(struct trace_event_call *call)
91{
92 struct trace_kprobe *tk = (struct trace_kprobe *)call->data;
93
94 return kprobe_on_func_entry(tk->rp.kp.addr,
95 tk->rp.kp.addr ? NULL : tk->rp.kp.symbol_name,
96 tk->rp.kp.addr ? 0 : tk->rp.kp.offset);
97}
98
99bool trace_kprobe_error_injectable(struct trace_event_call *call)
100{
101 struct trace_kprobe *tk = (struct trace_kprobe *)call->data;
102 unsigned long addr;
103
104 if (tk->symbol) {
105 addr = (unsigned long)
106 kallsyms_lookup_name(trace_kprobe_symbol(tk));
107 addr += tk->rp.kp.offset;
108 } else {
109 addr = (unsigned long)tk->rp.kp.addr;
110 }
111 return within_error_injection_list(addr);
112}
113
90static int register_kprobe_event(struct trace_kprobe *tk); 114static int register_kprobe_event(struct trace_kprobe *tk);
91static int unregister_kprobe_event(struct trace_kprobe *tk); 115static int unregister_kprobe_event(struct trace_kprobe *tk);
92 116
@@ -1170,7 +1194,7 @@ static int kretprobe_event_define_fields(struct trace_event_call *event_call)
1170#ifdef CONFIG_PERF_EVENTS 1194#ifdef CONFIG_PERF_EVENTS
1171 1195
1172/* Kprobe profile handler */ 1196/* Kprobe profile handler */
1173static void 1197static int
1174kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) 1198kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
1175{ 1199{
1176 struct trace_event_call *call = &tk->tp.call; 1200 struct trace_event_call *call = &tk->tp.call;
@@ -1179,12 +1203,31 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
1179 int size, __size, dsize; 1203 int size, __size, dsize;
1180 int rctx; 1204 int rctx;
1181 1205
1182 if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs)) 1206 if (bpf_prog_array_valid(call)) {
1183 return; 1207 unsigned long orig_ip = instruction_pointer(regs);
1208 int ret;
1209
1210 ret = trace_call_bpf(call, regs);
1211
1212 /*
1213 * We need to check and see if we modified the pc of the
1214 * pt_regs, and if so clear the kprobe and return 1 so that we
1215 * don't do the single stepping.
1216 * The ftrace kprobe handler leaves it up to us to re-enable
1217 * preemption here before returning if we've modified the ip.
1218 */
1219 if (orig_ip != instruction_pointer(regs)) {
1220 reset_current_kprobe();
1221 preempt_enable_no_resched();
1222 return 1;
1223 }
1224 if (!ret)
1225 return 0;
1226 }
1184 1227
1185 head = this_cpu_ptr(call->perf_events); 1228 head = this_cpu_ptr(call->perf_events);
1186 if (hlist_empty(head)) 1229 if (hlist_empty(head))
1187 return; 1230 return 0;
1188 1231
1189 dsize = __get_data_size(&tk->tp, regs); 1232 dsize = __get_data_size(&tk->tp, regs);
1190 __size = sizeof(*entry) + tk->tp.size + dsize; 1233 __size = sizeof(*entry) + tk->tp.size + dsize;
@@ -1193,13 +1236,14 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
1193 1236
1194 entry = perf_trace_buf_alloc(size, NULL, &rctx); 1237 entry = perf_trace_buf_alloc(size, NULL, &rctx);
1195 if (!entry) 1238 if (!entry)
1196 return; 1239 return 0;
1197 1240
1198 entry->ip = (unsigned long)tk->rp.kp.addr; 1241 entry->ip = (unsigned long)tk->rp.kp.addr;
1199 memset(&entry[1], 0, dsize); 1242 memset(&entry[1], 0, dsize);
1200 store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); 1243 store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
1201 perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, 1244 perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
1202 head, NULL); 1245 head, NULL);
1246 return 0;
1203} 1247}
1204NOKPROBE_SYMBOL(kprobe_perf_func); 1248NOKPROBE_SYMBOL(kprobe_perf_func);
1205 1249
@@ -1275,6 +1319,7 @@ static int kprobe_register(struct trace_event_call *event,
1275static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) 1319static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
1276{ 1320{
1277 struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp); 1321 struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp);
1322 int ret = 0;
1278 1323
1279 raw_cpu_inc(*tk->nhit); 1324 raw_cpu_inc(*tk->nhit);
1280 1325
@@ -1282,9 +1327,9 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
1282 kprobe_trace_func(tk, regs); 1327 kprobe_trace_func(tk, regs);
1283#ifdef CONFIG_PERF_EVENTS 1328#ifdef CONFIG_PERF_EVENTS
1284 if (tk->tp.flags & TP_FLAG_PROFILE) 1329 if (tk->tp.flags & TP_FLAG_PROFILE)
1285 kprobe_perf_func(tk, regs); 1330 ret = kprobe_perf_func(tk, regs);
1286#endif 1331#endif
1287 return 0; /* We don't tweek kernel, so just return 0 */ 1332 return ret;
1288} 1333}
1289NOKPROBE_SYMBOL(kprobe_dispatcher); 1334NOKPROBE_SYMBOL(kprobe_dispatcher);
1290 1335
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index fb66e3eaa192..e101c5bb9eda 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -252,6 +252,8 @@ struct symbol_cache;
252unsigned long update_symbol_cache(struct symbol_cache *sc); 252unsigned long update_symbol_cache(struct symbol_cache *sc);
253void free_symbol_cache(struct symbol_cache *sc); 253void free_symbol_cache(struct symbol_cache *sc);
254struct symbol_cache *alloc_symbol_cache(const char *sym, long offset); 254struct symbol_cache *alloc_symbol_cache(const char *sym, long offset);
255bool trace_kprobe_on_func_entry(struct trace_event_call *call);
256bool trace_kprobe_error_injectable(struct trace_event_call *call);
255#else 257#else
256/* uprobes do not support symbol fetch methods */ 258/* uprobes do not support symbol fetch methods */
257#define fetch_symbol_u8 NULL 259#define fetch_symbol_u8 NULL
@@ -277,6 +279,16 @@ alloc_symbol_cache(const char *sym, long offset)
277{ 279{
278 return NULL; 280 return NULL;
279} 281}
282
283static inline bool trace_kprobe_on_func_entry(struct trace_event_call *call)
284{
285 return false;
286}
287
288static inline bool trace_kprobe_error_injectable(struct trace_event_call *call)
289{
290 return false;
291}
280#endif /* CONFIG_KPROBE_EVENTS */ 292#endif /* CONFIG_KPROBE_EVENTS */
281 293
282struct probe_arg { 294struct probe_arg {
diff --git a/kernel/trace/trace_selftest_dynamic.c b/kernel/trace/trace_selftest_dynamic.c
index 8cda06a10d66..c364cf777e1a 100644
--- a/kernel/trace/trace_selftest_dynamic.c
+++ b/kernel/trace/trace_selftest_dynamic.c
@@ -1,13 +1,14 @@
1// SPDX-License-Identifier: GPL-2.0 1// SPDX-License-Identifier: GPL-2.0
2#include <linux/compiler.h>
2#include "trace.h" 3#include "trace.h"
3 4
4int DYN_FTRACE_TEST_NAME(void) 5noinline __noclone int DYN_FTRACE_TEST_NAME(void)
5{ 6{
6 /* used to call mcount */ 7 /* used to call mcount */
7 return 0; 8 return 0;
8} 9}
9 10
10int DYN_FTRACE_TEST_NAME2(void) 11noinline __noclone int DYN_FTRACE_TEST_NAME2(void)
11{ 12{
12 /* used to call mcount */ 13 /* used to call mcount */
13 return 0; 14 return 0;
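
Marking the selftest stubs noinline __noclone keeps gcc from inlining them or emitting renamed clones, so the dynamic ftrace selftest can still set filters on the exact symbol names. The same pattern applies to any function that must stay findable by name for ftrace or kprobes; an illustrative example:

#include <linux/compiler.h>

noinline __noclone int demo_probe_target(int x)
{
        /* body kept trivial; the attributes are what matter */
        return x + 1;
}
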
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 40592e7b3568..268029ae1be6 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -608,7 +608,7 @@ static int probes_seq_show(struct seq_file *m, void *v)
608 608
609 /* Don't print "0x (null)" when offset is 0 */ 609 /* Don't print "0x (null)" when offset is 0 */
610 if (tu->offset) { 610 if (tu->offset) {
611 seq_printf(m, "0x%p", (void *)tu->offset); 611 seq_printf(m, "0x%px", (void *)tu->offset);
612 } else { 612 } else {
613 switch (sizeof(void *)) { 613 switch (sizeof(void *)) {
614 case 4: 614 case 4:
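
Since the 4.15 pointer-hashing work, %p prints a hashed value while %px prints the raw one. The uprobe hunk switches to %px because the value being printed is a user-supplied file offset rather than a kernel address, so hashing it only destroys information. Illustrative use, reserving %px for values that genuinely are not sensitive kernel pointers:

#include <linux/printk.h>

static void demo_print_offset(unsigned long offset)
{
        /* %p would hash this; %px prints the raw value */
        pr_info("offset: 0x%px\n", (void *)offset);
}
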
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 8c34981d90ad..017044c26233 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3807,6 +3807,7 @@ int apply_workqueue_attrs(struct workqueue_struct *wq,
3807 3807
3808 return ret; 3808 return ret;
3809} 3809}
3810EXPORT_SYMBOL_GPL(apply_workqueue_attrs);
3810 3811
3811/** 3812/**
3812 * wq_update_unbound_numa - update NUMA affinity of a wq for CPU hot[un]plug 3813 * wq_update_unbound_numa - update NUMA affinity of a wq for CPU hot[un]plug
@@ -3940,6 +3941,37 @@ static int wq_clamp_max_active(int max_active, unsigned int flags,
3940 return clamp_val(max_active, 1, lim); 3941 return clamp_val(max_active, 1, lim);
3941} 3942}
3942 3943
3944/*
3945 * Workqueues which may be used during memory reclaim should have a rescuer
3946 * to guarantee forward progress.
3947 */
3948static int init_rescuer(struct workqueue_struct *wq)
3949{
3950 struct worker *rescuer;
3951 int ret;
3952
3953 if (!(wq->flags & WQ_MEM_RECLAIM))
3954 return 0;
3955
3956 rescuer = alloc_worker(NUMA_NO_NODE);
3957 if (!rescuer)
3958 return -ENOMEM;
3959
3960 rescuer->rescue_wq = wq;
3961 rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", wq->name);
3962 ret = PTR_ERR_OR_ZERO(rescuer->task);
3963 if (ret) {
3964 kfree(rescuer);
3965 return ret;
3966 }
3967
3968 wq->rescuer = rescuer;
3969 kthread_bind_mask(rescuer->task, cpu_possible_mask);
3970 wake_up_process(rescuer->task);
3971
3972 return 0;
3973}
3974
3943struct workqueue_struct *__alloc_workqueue_key(const char *fmt, 3975struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
3944 unsigned int flags, 3976 unsigned int flags,
3945 int max_active, 3977 int max_active,
@@ -4002,29 +4034,8 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
4002 if (alloc_and_link_pwqs(wq) < 0) 4034 if (alloc_and_link_pwqs(wq) < 0)
4003 goto err_free_wq; 4035 goto err_free_wq;
4004 4036
4005 /* 4037 if (wq_online && init_rescuer(wq) < 0)
4006 * Workqueues which may be used during memory reclaim should 4038 goto err_destroy;
4007 * have a rescuer to guarantee forward progress.
4008 */
4009 if (flags & WQ_MEM_RECLAIM) {
4010 struct worker *rescuer;
4011
4012 rescuer = alloc_worker(NUMA_NO_NODE);
4013 if (!rescuer)
4014 goto err_destroy;
4015
4016 rescuer->rescue_wq = wq;
4017 rescuer->task = kthread_create(rescuer_thread, rescuer, "%s",
4018 wq->name);
4019 if (IS_ERR(rescuer->task)) {
4020 kfree(rescuer);
4021 goto err_destroy;
4022 }
4023
4024 wq->rescuer = rescuer;
4025 kthread_bind_mask(rescuer->task, cpu_possible_mask);
4026 wake_up_process(rescuer->task);
4027 }
4028 4039
4029 if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq)) 4040 if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq))
4030 goto err_destroy; 4041 goto err_destroy;
@@ -5642,6 +5653,8 @@ int __init workqueue_init(void)
5642 * archs such as power and arm64. As per-cpu pools created 5653 * archs such as power and arm64. As per-cpu pools created
5643 * previously could be missing node hint and unbound pools NUMA 5654 * previously could be missing node hint and unbound pools NUMA
5644 * affinity, fix them up. 5655 * affinity, fix them up.
5656 *
5657 * Also, while iterating workqueues, create rescuers if requested.
5645 */ 5658 */
5646 wq_numa_init(); 5659 wq_numa_init();
5647 5660
@@ -5653,8 +5666,12 @@ int __init workqueue_init(void)
5653 } 5666 }
5654 } 5667 }
5655 5668
5656 list_for_each_entry(wq, &workqueues, list) 5669 list_for_each_entry(wq, &workqueues, list) {
5657 wq_update_unbound_numa(wq, smp_processor_id(), true); 5670 wq_update_unbound_numa(wq, smp_processor_id(), true);
5671 WARN(init_rescuer(wq),
5672 "workqueue: failed to create early rescuer for %s",
5673 wq->name);
5674 }
5658 5675
5659 mutex_unlock(&wq_pool_mutex); 5676 mutex_unlock(&wq_pool_mutex);
5660 5677
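
The workqueue changes export apply_workqueue_attrs() and pull rescuer creation out into init_rescuer(), which runs both from __alloc_workqueue_key() once workqueues are online and from workqueue_init() for workqueues created earlier during boot. A rescuer only exists for WQ_MEM_RECLAIM workqueues; a minimal driver-side sketch that would exercise that path, with illustrative names:

#include <linux/errno.h>
#include <linux/workqueue.h>

static struct workqueue_struct *demo_wq;

static int demo_init(void)
{
        /* WQ_MEM_RECLAIM is what requests a rescuer thread */
        demo_wq = alloc_workqueue("demo_reclaim_wq", WQ_MEM_RECLAIM, 0);
        if (!demo_wq)
                return -ENOMEM;
        return 0;
}

static void demo_exit(void)
{
        destroy_workqueue(demo_wq);
}
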