From 9dd957485d7d896ec18d8e2f9dd410efe71eca34 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 3 Jul 2017 00:42:43 -0400 Subject: ipc, kernel, mm: annotate ->poll() instances Signed-off-by: Al Viro --- kernel/events/core.c | 4 ++-- kernel/printk/printk.c | 4 ++-- kernel/relay.c | 4 ++-- kernel/time/posix-clock.c | 4 ++-- kernel/trace/trace.c | 6 +++--- 5 files changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 16beab4767e1..857c40d98d2c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4511,11 +4511,11 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) return ret; } -static unsigned int perf_poll(struct file *file, poll_table *wait) +static __poll_t perf_poll(struct file *file, poll_table *wait) { struct perf_event *event = file->private_data; struct ring_buffer *rb; - unsigned int events = POLLHUP; + __poll_t events = POLLHUP; poll_wait(file, &event->waitq, wait); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 5d81206a572d..8aa27be96012 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -920,10 +920,10 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) return ret; } -static unsigned int devkmsg_poll(struct file *file, poll_table *wait) +static __poll_t devkmsg_poll(struct file *file, poll_table *wait) { struct devkmsg_user *user = file->private_data; - int ret = 0; + __poll_t ret = 0; if (!user) return POLLERR|POLLNVAL; diff --git a/kernel/relay.c b/kernel/relay.c index 39a9dfc69486..41280033a4c5 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -919,9 +919,9 @@ static int relay_file_mmap(struct file *filp, struct vm_area_struct *vma) * * Poll implemention. */ -static unsigned int relay_file_poll(struct file *filp, poll_table *wait) +static __poll_t relay_file_poll(struct file *filp, poll_table *wait) { - unsigned int mask = 0; + __poll_t mask = 0; struct rchan_buf *buf = filp->private_data; if (buf->finalized) diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index 17cdc554c9fe..36b04e7fcf62 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -68,10 +68,10 @@ static ssize_t posix_clock_read(struct file *fp, char __user *buf, return err; } -static unsigned int posix_clock_poll(struct file *fp, poll_table *wait) +static __poll_t posix_clock_poll(struct file *fp, poll_table *wait) { struct posix_clock *clk = get_posix_clock(fp); - unsigned int result = 0; + __poll_t result = 0; if (!clk) return POLLERR; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 73e67b68c53b..1e2a45e87b93 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5632,7 +5632,7 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) return 0; } -static unsigned int +static __poll_t trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_table) { struct trace_array *tr = iter->tr; @@ -5651,7 +5651,7 @@ trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_tabl filp, poll_table); } -static unsigned int +static __poll_t tracing_poll_pipe(struct file *filp, poll_table *poll_table) { struct trace_iterator *iter = filp->private_data; @@ -6605,7 +6605,7 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp) return ret; } -static unsigned int +static __poll_t tracing_buffers_poll(struct file *filp, poll_table *poll_table) { struct ftrace_buffer_info *info = filp->private_data; -- cgit v1.2.2 From ecf927000ce3265e9871c79d43c10ceed8bd61c9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 16 Jul 2017 22:11:54 -0400 Subject: ring_buffer_poll_wait() return value used as return value of ->poll() Signed-off-by: Al Viro --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 91874a95060d..d24d48713ef3 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -626,7 +626,7 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full) * Returns POLLIN | POLLRDNORM if data exists in the buffers, * zero otherwise. */ -int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu, +__poll_t ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu, struct file *filp, poll_table *poll_table) { struct ring_buffer_per_cpu *cpu_buffer; -- cgit v1.2.2 From f06eae831f0c1fc5b982ea200daf552810e1dd55 Mon Sep 17 00:00:00 2001 From: Tycho Andersen Date: Wed, 11 Oct 2017 09:39:20 -0600 Subject: seccomp: hoist out filter resolving logic Hoist out the nth filter resolving logic that ptrace uses into a new function. We'll use this in the next patch to implement the new PTRACE_SECCOMP_GET_FILTER_FLAGS command. Signed-off-by: Tycho Andersen CC: Kees Cook CC: Andy Lutomirski CC: Oleg Nesterov Signed-off-by: Kees Cook --- kernel/seccomp.c | 77 +++++++++++++++++++++++++++++++++----------------------- 1 file changed, 45 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 5f0dfb2abb8d..99bddaf79076 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -978,49 +978,68 @@ long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter) } #if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE) -long seccomp_get_filter(struct task_struct *task, unsigned long filter_off, - void __user *data) +static struct seccomp_filter *get_nth_filter(struct task_struct *task, + unsigned long filter_off) { - struct seccomp_filter *filter; - struct sock_fprog_kern *fprog; - long ret; - unsigned long count = 0; - - if (!capable(CAP_SYS_ADMIN) || - current->seccomp.mode != SECCOMP_MODE_DISABLED) { - return -EACCES; - } + struct seccomp_filter *orig, *filter; + unsigned long count; + /* + * Note: this is only correct because the caller should be the (ptrace) + * tracer of the task, otherwise lock_task_sighand is needed. + */ spin_lock_irq(&task->sighand->siglock); + if (task->seccomp.mode != SECCOMP_MODE_FILTER) { - ret = -EINVAL; - goto out; + spin_unlock_irq(&task->sighand->siglock); + return ERR_PTR(-EINVAL); } - filter = task->seccomp.filter; - while (filter) { - filter = filter->prev; + orig = task->seccomp.filter; + __get_seccomp_filter(orig); + spin_unlock_irq(&task->sighand->siglock); + + count = 0; + for (filter = orig; filter; filter = filter->prev) count++; - } if (filter_off >= count) { - ret = -ENOENT; + filter = ERR_PTR(-ENOENT); goto out; } - count -= filter_off; - filter = task->seccomp.filter; - while (filter && count > 1) { - filter = filter->prev; + count -= filter_off; + for (filter = orig; filter && count > 1; filter = filter->prev) count--; - } if (WARN_ON(count != 1 || !filter)) { - /* The filter tree shouldn't shrink while we're using it. */ - ret = -ENOENT; + filter = ERR_PTR(-ENOENT); goto out; } + __get_seccomp_filter(filter); + +out: + __put_seccomp_filter(orig); + return filter; +} + +long seccomp_get_filter(struct task_struct *task, unsigned long filter_off, + void __user *data) +{ + struct seccomp_filter *filter; + struct sock_fprog_kern *fprog; + long ret; + + if (!capable(CAP_SYS_ADMIN) || + current->seccomp.mode != SECCOMP_MODE_DISABLED) { + return -EACCES; + } + + filter = get_nth_filter(task, filter_off); + if (IS_ERR(filter)) + return PTR_ERR(filter); + fprog = filter->prog->orig_prog; if (!fprog) { /* This must be a new non-cBPF filter, since we save @@ -1035,17 +1054,11 @@ long seccomp_get_filter(struct task_struct *task, unsigned long filter_off, if (!data) goto out; - __get_seccomp_filter(filter); - spin_unlock_irq(&task->sighand->siglock); - if (copy_to_user(data, fprog->filter, bpf_classic_proglen(fprog))) ret = -EFAULT; - __put_seccomp_filter(filter); - return ret; - out: - spin_unlock_irq(&task->sighand->siglock); + __put_seccomp_filter(filter); return ret; } #endif -- cgit v1.2.2 From 26500475ac1b499d8636ff281311d633909f5d20 Mon Sep 17 00:00:00 2001 From: Tycho Andersen Date: Wed, 11 Oct 2017 09:39:21 -0600 Subject: ptrace, seccomp: add support for retrieving seccomp metadata With the new SECCOMP_FILTER_FLAG_LOG, we need to be able to extract these flags for checkpoint restore, since they describe the state of a filter. So, let's add PTRACE_SECCOMP_GET_METADATA, similar to ..._GET_FILTER, which returns the metadata of the nth filter (right now, just the flags). Hopefully this will be future proof, and new per-filter metadata can be added to this struct. Signed-off-by: Tycho Andersen CC: Kees Cook CC: Andy Lutomirski CC: Oleg Nesterov Signed-off-by: Kees Cook --- kernel/ptrace.c | 4 ++++ kernel/seccomp.c | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 84b1367935e4..58291e9f3276 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -1092,6 +1092,10 @@ int ptrace_request(struct task_struct *child, long request, ret = seccomp_get_filter(child, addr, datavp); break; + case PTRACE_SECCOMP_GET_METADATA: + ret = seccomp_get_metadata(child, addr, datavp); + break; + default: break; } diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 99bddaf79076..61bd9dc260c8 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -1061,6 +1061,39 @@ out: __put_seccomp_filter(filter); return ret; } + +long seccomp_get_metadata(struct task_struct *task, + unsigned long size, void __user *data) +{ + long ret; + struct seccomp_filter *filter; + struct seccomp_metadata kmd = {}; + + if (!capable(CAP_SYS_ADMIN) || + current->seccomp.mode != SECCOMP_MODE_DISABLED) { + return -EACCES; + } + + size = min_t(unsigned long, size, sizeof(kmd)); + + if (copy_from_user(&kmd, data, size)) + return -EFAULT; + + filter = get_nth_filter(task, kmd.filter_off); + if (IS_ERR(filter)) + return PTR_ERR(filter); + + memset(&kmd, 0, sizeof(kmd)); + if (filter->log) + kmd.flags |= SECCOMP_FILTER_FLAG_LOG; + + ret = size; + if (copy_to_user(data, &kmd, size)) + ret = -EFAULT; + + __put_seccomp_filter(filter); + return ret; +} #endif #ifdef CONFIG_SYSCTL -- cgit v1.2.2 From 12a3cc8424fe1237aaeb982dec4f0914ddd22f3e Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 30 Nov 2017 21:31:35 -0800 Subject: bpf: fix stack state printing in verifier log fix incorrect stack state prints in print_verifier_state() Fixes: 638f5b90d460 ("bpf: reduce verifier memory consumption") Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d4593571c404..71a9429fdbb5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -279,7 +279,7 @@ static void print_verifier_state(struct bpf_verifier_env *env, for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { if (state->stack[i].slot_type[0] == STACK_SPILL) verbose(env, " fp%d=%s", - -MAX_BPF_STACK + i * BPF_REG_SIZE, + (-i - 1) * BPF_REG_SIZE, reg_type_str[state->stack[i].spilled_ptr.type]); } verbose(env, "\n"); -- cgit v1.2.2 From 4e92024a48ecbd06fba3ccfb2174abd3d2f54a83 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 30 Nov 2017 21:31:36 -0800 Subject: bpf: print liveness info to verifier log let verifier print register and stack liveness information into verifier log Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 71a9429fdbb5..f7229390c279 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -216,6 +216,17 @@ static const char * const reg_type_str[] = { [PTR_TO_PACKET_END] = "pkt_end", }; +static void print_liveness(struct bpf_verifier_env *env, + enum bpf_reg_liveness live) +{ + if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN)) + verbose(env, "_"); + if (live & REG_LIVE_READ) + verbose(env, "r"); + if (live & REG_LIVE_WRITTEN) + verbose(env, "w"); +} + static void print_verifier_state(struct bpf_verifier_env *env, struct bpf_verifier_state *state) { @@ -228,7 +239,9 @@ static void print_verifier_state(struct bpf_verifier_env *env, t = reg->type; if (t == NOT_INIT) continue; - verbose(env, " R%d=%s", i, reg_type_str[t]); + verbose(env, " R%d", i); + print_liveness(env, reg->live); + verbose(env, "=%s", reg_type_str[t]); if ((t == SCALAR_VALUE || t == PTR_TO_STACK) && tnum_is_const(reg->var_off)) { /* reg->off should be 0 for SCALAR_VALUE */ @@ -277,10 +290,13 @@ static void print_verifier_state(struct bpf_verifier_env *env, } } for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { - if (state->stack[i].slot_type[0] == STACK_SPILL) - verbose(env, " fp%d=%s", - (-i - 1) * BPF_REG_SIZE, + if (state->stack[i].slot_type[0] == STACK_SPILL) { + verbose(env, " fp%d", + (-i - 1) * BPF_REG_SIZE); + print_liveness(env, state->stack[i].spilled_ptr.live); + verbose(env, "=%s", reg_type_str[state->stack[i].spilled_ptr.type]); + } } verbose(env, "\n"); } -- cgit v1.2.2 From 19ceb4178d31e543479d75e20c2f9df08f16632f Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 30 Nov 2017 21:31:37 -0800 Subject: bpf: don't mark FP reg as uninit when verifier hits an internal bug don't mark register R10==FP as uninit, since it's read only register and it's not technically correct to let verifier run further, since it may assume that R10 has valid auxiliary state. While developing subsequent patches this issue was discovered, though the code eventually changed that aux reg state doesn't have pointers any more it is still safer to avoid clearing readonly register. Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f7229390c279..14ad7c6e806a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -584,8 +584,8 @@ static void mark_reg_unknown(struct bpf_verifier_env *env, { if (WARN_ON(regno >= MAX_BPF_REG)) { verbose(env, "mark_reg_unknown(regs, %u)\n", regno); - /* Something bad happened, let's kill all regs */ - for (regno = 0; regno < MAX_BPF_REG; regno++) + /* Something bad happened, let's kill all regs except FP */ + for (regno = 0; regno < BPF_REG_FP; regno++) __mark_reg_not_init(regs + regno); return; } @@ -603,8 +603,8 @@ static void mark_reg_not_init(struct bpf_verifier_env *env, { if (WARN_ON(regno >= MAX_BPF_REG)) { verbose(env, "mark_reg_not_init(regs, %u)\n", regno); - /* Something bad happened, let's kill all regs */ - for (regno = 0; regno < MAX_BPF_REG; regno++) + /* Something bad happened, let's kill all regs except FP */ + for (regno = 0; regno < BPF_REG_FP; regno++) __mark_reg_not_init(regs + regno); return; } -- cgit v1.2.2 From 2f18f62ee1648b2c93e6ab4a58d548010b0a67e4 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 30 Nov 2017 21:31:38 -0800 Subject: bpf: improve verifier liveness marks registers with pointers filled from stack were missing live_written marks which caused liveness propagation to unnecessary mark more registers as live_read and miss state pruning opportunities later on. before after bpf_lb-DLB_L3.o 2285 2270 bpf_lb-DLB_L4.o 3723 3682 bpf_lb-DUNKNOWN.o 1110 1110 bpf_lxc-DDROP_ALL.o 27954 27876 bpf_lxc-DUNKNOWN.o 38954 38780 bpf_netdev.o 16943 16937 bpf_overlay.o 7929 7929 Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 14ad7c6e806a..46ff4e5b3fb7 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -795,6 +795,11 @@ static int check_stack_read(struct bpf_verifier_env *env, if (value_regno >= 0) { /* restore register state from stack */ state->regs[value_regno] = state->stack[spi].spilled_ptr; + /* mark reg as written since spilled pointer state likely + * has its liveness marks cleared by is_state_visited() + * which resets stack/reg liveness for state transitions + */ + state->regs[value_regno].live |= REG_LIVE_WRITTEN; mark_stack_slot_read(state, spi); } return 0; -- cgit v1.2.2 From 3bf15921c58df982f9b15d64754c483785bf66f3 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 30 Nov 2017 21:31:39 -0800 Subject: bpf: improve JEQ/JNE path walking verifier knows how to trim paths that are known not to be taken at run-time when register containing run-time constant is compared with another constant. It was done only for JEQ comparison. Extend it to include JNE as well. More cases can be added in the future. before after bpf_lb-DLB_L3.o 2270 2051 bpf_lb-DLB_L4.o 3682 3287 bpf_lb-DUNKNOWN.o 1110 1080 bpf_lxc-DDROP_ALL.o 27876 24980 bpf_lxc-DUNKNOWN.o 38780 34308 bpf_netdev.o 16937 15404 bpf_overlay.o 7929 7191 Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 46ff4e5b3fb7..afe9a1a0a5fe 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2955,8 +2955,9 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, if (BPF_SRC(insn->code) == BPF_K && (opcode == BPF_JEQ || opcode == BPF_JNE) && dst_reg->type == SCALAR_VALUE && - tnum_equals_const(dst_reg->var_off, insn->imm)) { - if (opcode == BPF_JEQ) { + tnum_is_const(dst_reg->var_off)) { + if ((opcode == BPF_JEQ && dst_reg->var_off.value == insn->imm) || + (opcode == BPF_JNE && dst_reg->var_off.value != insn->imm)) { /* if (imm == imm) goto pc+off; * only follow the goto, ignore fall-through */ -- cgit v1.2.2 From 914cb781ee1a35f4c7a5173a668d6ba2c4734b91 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 30 Nov 2017 21:31:40 -0800 Subject: bpf: cleanup register_is_null() don't pass large struct bpf_reg_state by value. Instead pass it by pointer. Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index afe9a1a0a5fe..7afa92e9b409 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1265,9 +1265,9 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins } /* Does this register contain a constant zero? */ -static bool register_is_null(struct bpf_reg_state reg) +static bool register_is_null(struct bpf_reg_state *reg) { - return reg.type == SCALAR_VALUE && tnum_equals_const(reg.var_off, 0); + return reg->type == SCALAR_VALUE && tnum_equals_const(reg->var_off, 0); } /* when register 'regno' is passed into function that will read 'access_size' @@ -1280,31 +1280,31 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, int access_size, bool zero_size_allowed, struct bpf_call_arg_meta *meta) { + struct bpf_reg_state *reg = cur_regs(env) + regno; struct bpf_verifier_state *state = env->cur_state; - struct bpf_reg_state *regs = state->regs; int off, i, slot, spi; - if (regs[regno].type != PTR_TO_STACK) { + if (reg->type != PTR_TO_STACK) { /* Allow zero-byte read from NULL, regardless of pointer type */ if (zero_size_allowed && access_size == 0 && - register_is_null(regs[regno])) + register_is_null(reg)) return 0; verbose(env, "R%d type=%s expected=%s\n", regno, - reg_type_str[regs[regno].type], + reg_type_str[reg->type], reg_type_str[PTR_TO_STACK]); return -EACCES; } /* Only allow fixed-offset stack reads */ - if (!tnum_is_const(regs[regno].var_off)) { + if (!tnum_is_const(reg->var_off)) { char tn_buf[48]; - tnum_strn(tn_buf, sizeof(tn_buf), regs[regno].var_off); + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); verbose(env, "invalid variable stack read R%d var_off=%s\n", regno, tn_buf); } - off = regs[regno].off + regs[regno].var_off.value; + off = reg->off + reg->var_off.value; if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || access_size < 0 || (access_size == 0 && !zero_size_allowed)) { verbose(env, "invalid stack type R%d off=%d access_size=%d\n", @@ -1412,7 +1412,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, * passed in as argument, it's a SCALAR_VALUE type. Final test * happens during stack boundary checking. */ - if (register_is_null(*reg) && + if (register_is_null(reg) && arg_type == ARG_PTR_TO_MEM_OR_NULL) /* final test in check_stack_boundary() */; else if (!type_is_pkt_pointer(type) && -- cgit v1.2.2 From 43347d56c8d9dd732cee2f8efd384ad21dd1f6c4 Mon Sep 17 00:00:00 2001 From: Miroslav Benes Date: Wed, 15 Nov 2017 14:50:13 +0100 Subject: livepatch: send a fake signal to all blocking tasks Live patching consistency model is of LEAVE_PATCHED_SET and SWITCH_THREAD. This means that all tasks in the system have to be marked one by one as safe to call a new patched function. Safe means when a task is not (sleeping) in a set of patched functions. That is, no patched function is on the task's stack. Another clearly safe place is the boundary between kernel and userspace. The patching waits for all tasks to get outside of the patched set or to cross the boundary. The transition is completed afterwards. The problem is that a task can block the transition for quite a long time, if not forever. It could sleep in a set of patched functions, for example. Luckily we can force the task to leave the set by sending it a fake signal, that is a signal with no data in signal pending structures (no handler, no sign of proper signal delivered). Suspend/freezer use this to freeze the tasks as well. The task gets TIF_SIGPENDING set and is woken up (if it has been sleeping in the kernel before) or kicked by rescheduling IPI (if it was running on other CPU). This causes the task to go to kernel/userspace boundary where the signal would be handled and the task would be marked as safe in terms of live patching. There are tasks which are not affected by this technique though. The fake signal is not sent to kthreads. They should be handled differently. They can be woken up so they leave the patched set and their TIF_PATCH_PENDING can be cleared thanks to stack checking. For the sake of completeness, if the task is in TASK_RUNNING state but not currently running on some CPU it doesn't get the IPI, but it would eventually handle the signal anyway. Second, if the task runs in the kernel (in TASK_RUNNING state) it gets the IPI, but the signal is not handled on return from the interrupt. It would be handled on return to the userspace in the future when the fake signal is sent again. Stack checking deals with these cases in a better way. If the task was sleeping in a syscall it would be woken by our fake signal, it would check if TIF_SIGPENDING is set (by calling signal_pending() predicate) and return ERESTART* or EINTR. Syscalls with ERESTART* return values are restarted in case of the fake signal (see do_signal()). EINTR is propagated back to the userspace program. This could disturb the program, but... * each process dealing with signals should react accordingly to EINTR return values. * syscalls returning EINTR happen to be quite common situation in the system even if no fake signal is sent. * freezer sends the fake signal and does not deal with EINTR anyhow. Thus EINTR values are returned when the system is resumed. The very safe marking is done in architectures' "entry" on syscall and interrupt/exception exit paths, and in a stack checking functions of livepatch. TIF_PATCH_PENDING is cleared and the next recalc_sigpending() drops TIF_SIGPENDING. In connection with this, also call klp_update_patch_state() before do_signal(), so that recalc_sigpending() in dequeue_signal() can clear TIF_PATCH_PENDING immediately and thus prevent a double call of do_signal(). Note that the fake signal is not sent to stopped/traced tasks. Such task prevents the patching to finish till it continues again (is not traced anymore). Last, sending the fake signal is not automatic. It is done only when admin requests it by writing 1 to signal sysfs attribute in livepatch sysfs directory. Signed-off-by: Miroslav Benes Cc: Oleg Nesterov Cc: Michael Ellerman Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Andy Lutomirski Cc: linuxppc-dev@lists.ozlabs.org Cc: x86@kernel.org Acked-by: Michael Ellerman (powerpc) Signed-off-by: Jiri Kosina --- kernel/livepatch/core.c | 30 ++++++++++++++++++++++++++++++ kernel/livepatch/transition.c | 41 +++++++++++++++++++++++++++++++++++++++++ kernel/livepatch/transition.h | 1 + kernel/signal.c | 4 +++- 4 files changed, 75 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index de9e45dca70f..88766bd91803 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -454,6 +454,7 @@ EXPORT_SYMBOL_GPL(klp_enable_patch); * /sys/kernel/livepatch/ * /sys/kernel/livepatch//enabled * /sys/kernel/livepatch//transition + * /sys/kernel/livepatch//signal * /sys/kernel/livepatch// * /sys/kernel/livepatch/// */ @@ -528,11 +529,40 @@ static ssize_t transition_show(struct kobject *kobj, patch == klp_transition_patch); } +static ssize_t signal_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct klp_patch *patch; + int ret; + bool val; + + patch = container_of(kobj, struct klp_patch, kobj); + + /* + * klp_mutex lock is not grabbed here intentionally. It is not really + * needed. The race window is harmless and grabbing the lock would only + * hold the action back. + */ + if (patch != klp_transition_patch) + return -EINVAL; + + ret = kstrtobool(buf, &val); + if (ret) + return ret; + + if (val) + klp_send_signals(); + + return count; +} + static struct kobj_attribute enabled_kobj_attr = __ATTR_RW(enabled); static struct kobj_attribute transition_kobj_attr = __ATTR_RO(transition); +static struct kobj_attribute signal_kobj_attr = __ATTR_WO(signal); static struct attribute *klp_patch_attrs[] = { &enabled_kobj_attr.attr, &transition_kobj_attr.attr, + &signal_kobj_attr.attr, NULL }; diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index 56add6327736..edcfcb8ebb2d 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -608,3 +608,44 @@ void klp_copy_process(struct task_struct *child) /* TIF_PATCH_PENDING gets copied in setup_thread_stack() */ } + +/* + * Sends a fake signal to all non-kthread tasks with TIF_PATCH_PENDING set. + * Kthreads with TIF_PATCH_PENDING set are woken up. Only admin can request this + * action currently. + */ +void klp_send_signals(void) +{ + struct task_struct *g, *task; + + pr_notice("signaling remaining tasks\n"); + + read_lock(&tasklist_lock); + for_each_process_thread(g, task) { + if (!klp_patch_pending(task)) + continue; + + /* + * There is a small race here. We could see TIF_PATCH_PENDING + * set and decide to wake up a kthread or send a fake signal. + * Meanwhile the task could migrate itself and the action + * would be meaningless. It is not serious though. + */ + if (task->flags & PF_KTHREAD) { + /* + * Wake up a kthread which sleeps interruptedly and + * still has not been migrated. + */ + wake_up_state(task, TASK_INTERRUPTIBLE); + } else { + /* + * Send fake signal to all non-kthread tasks which are + * still not migrated. + */ + spin_lock_irq(&task->sighand->siglock); + signal_wake_up(task, 0); + spin_unlock_irq(&task->sighand->siglock); + } + } + read_unlock(&tasklist_lock); +} diff --git a/kernel/livepatch/transition.h b/kernel/livepatch/transition.h index 0f6e27c481f9..40522795a5f6 100644 --- a/kernel/livepatch/transition.h +++ b/kernel/livepatch/transition.h @@ -11,5 +11,6 @@ void klp_cancel_transition(void); void klp_start_transition(void); void klp_try_complete_transition(void); void klp_reverse_transition(void); +void klp_send_signals(void); #endif /* _LIVEPATCH_TRANSITION_H */ diff --git a/kernel/signal.c b/kernel/signal.c index 8dcd8825b2de..186143b06d00 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -40,6 +40,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -163,7 +164,8 @@ void recalc_sigpending_and_wake(struct task_struct *t) void recalc_sigpending(void) { - if (!recalc_sigpending_tsk(current) && !freezing(current)) + if (!recalc_sigpending_tsk(current) && !freezing(current) && + !klp_patch_pending(current)) clear_thread_flag(TIF_SIGPENDING); } -- cgit v1.2.2 From c99a2be790b07752d8cc694434d3450afd4c5a00 Mon Sep 17 00:00:00 2001 From: Miroslav Benes Date: Wed, 22 Nov 2017 11:29:21 +0100 Subject: livepatch: force transition to finish If a task sleeps in a set of patched functions uninterruptedly, it could block the whole transition indefinitely. Thus it may be useful to clear its TIF_PATCH_PENDING to allow the process to finish. Admin can do that now by writing to force sysfs attribute in livepatch sysfs directory. TIF_PATCH_PENDING is then cleared for all tasks and the transition can finish successfully. Important note! Administrator should not use this feature without a clearance from a patch distributor. It must be checked that by doing so the consistency model guarantees are not violated. Removal (rmmod) of patch modules is permanently disabled when the feature is used. It cannot be guaranteed there is no task sleeping in such module. Signed-off-by: Miroslav Benes Acked-by: Josh Poimboeuf Reviewed-by: Petr Mladek Signed-off-by: Jiri Kosina --- kernel/livepatch/core.c | 30 ++++++++++++++++++++++++++++++ kernel/livepatch/transition.c | 36 ++++++++++++++++++++++++++++++++++-- kernel/livepatch/transition.h | 1 + 3 files changed, 65 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 88766bd91803..1c3c9b27c916 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -455,6 +455,7 @@ EXPORT_SYMBOL_GPL(klp_enable_patch); * /sys/kernel/livepatch//enabled * /sys/kernel/livepatch//transition * /sys/kernel/livepatch//signal + * /sys/kernel/livepatch//force * /sys/kernel/livepatch// * /sys/kernel/livepatch/// */ @@ -556,13 +557,42 @@ static ssize_t signal_store(struct kobject *kobj, struct kobj_attribute *attr, return count; } +static ssize_t force_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct klp_patch *patch; + int ret; + bool val; + + patch = container_of(kobj, struct klp_patch, kobj); + + /* + * klp_mutex lock is not grabbed here intentionally. It is not really + * needed. The race window is harmless and grabbing the lock would only + * hold the action back. + */ + if (patch != klp_transition_patch) + return -EINVAL; + + ret = kstrtobool(buf, &val); + if (ret) + return ret; + + if (val) + klp_force_transition(); + + return count; +} + static struct kobj_attribute enabled_kobj_attr = __ATTR_RW(enabled); static struct kobj_attribute transition_kobj_attr = __ATTR_RO(transition); static struct kobj_attribute signal_kobj_attr = __ATTR_WO(signal); +static struct kobj_attribute force_kobj_attr = __ATTR_WO(force); static struct attribute *klp_patch_attrs[] = { &enabled_kobj_attr.attr, &transition_kobj_attr.attr, &signal_kobj_attr.attr, + &force_kobj_attr.attr, NULL }; diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index edcfcb8ebb2d..be5bfa533ee8 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -33,6 +33,8 @@ struct klp_patch *klp_transition_patch; static int klp_target_state = KLP_UNDEFINED; +static bool klp_forced = false; + /* * This work can be performed periodically to finish patching or unpatching any * "straggler" tasks which failed to transition in the first attempt. @@ -146,9 +148,12 @@ done: /* * See complementary comment in __klp_enable_patch() for why we * keep the module reference for immediate patches. + * + * klp_forced or immediate_func set implies unbounded increase of + * module's ref count if the module is disabled/enabled in a loop. */ - if (!klp_transition_patch->immediate && !immediate_func && - klp_target_state == KLP_UNPATCHED) { + if (!klp_forced && !klp_transition_patch->immediate && + !immediate_func && klp_target_state == KLP_UNPATCHED) { module_put(klp_transition_patch->mod); } @@ -649,3 +654,30 @@ void klp_send_signals(void) } read_unlock(&tasklist_lock); } + +/* + * Drop TIF_PATCH_PENDING of all tasks on admin's request. This forces an + * existing transition to finish. + * + * NOTE: klp_update_patch_state(task) requires the task to be inactive or + * 'current'. This is not the case here and the consistency model could be + * broken. Administrator, who is the only one to execute the + * klp_force_transitions(), has to be aware of this. + */ +void klp_force_transition(void) +{ + struct task_struct *g, *task; + unsigned int cpu; + + pr_warn("forcing remaining tasks to the patched state\n"); + + read_lock(&tasklist_lock); + for_each_process_thread(g, task) + klp_update_patch_state(task); + read_unlock(&tasklist_lock); + + for_each_possible_cpu(cpu) + klp_update_patch_state(idle_task(cpu)); + + klp_forced = true; +} diff --git a/kernel/livepatch/transition.h b/kernel/livepatch/transition.h index 40522795a5f6..f9d0bc016067 100644 --- a/kernel/livepatch/transition.h +++ b/kernel/livepatch/transition.h @@ -12,5 +12,6 @@ void klp_start_transition(void); void klp_try_complete_transition(void); void klp_reverse_transition(void); void klp_send_signals(void); +void klp_force_transition(void); #endif /* _LIVEPATCH_TRANSITION_H */ -- cgit v1.2.2 From 156baec39732f025dc778e00da95fc10d6e45885 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 7 Dec 2017 09:40:38 -0800 Subject: rcu: Export init_rcu_head() and destroy_rcu_head() to GPL modules Use of init_rcu_head() and destroy_rcu_head() from modules results in the following build-time error with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y: ERROR: "init_rcu_head" [drivers/scsi/scsi_mod.ko] undefined! ERROR: "destroy_rcu_head" [drivers/scsi/scsi_mod.ko] undefined! This commit therefore adds EXPORT_SYMBOL_GPL() for each to allow them to be used by GPL-licensed kernel modules. Reported-by: Bart Van Assche Reported-by: Stephen Rothwell Signed-off-by: Paul E. McKenney Signed-off-by: Martin K. Petersen --- kernel/rcu/update.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index fbd56d6e575b..68fa19a5e7bd 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -422,11 +422,13 @@ void init_rcu_head(struct rcu_head *head) { debug_object_init(head, &rcuhead_debug_descr); } +EXPORT_SYMBOL_GPL(init_rcu_head); void destroy_rcu_head(struct rcu_head *head) { debug_object_free(head, &rcuhead_debug_descr); } +EXPORT_SYMBOL_GPL(destroy_rcu_head); static bool rcuhead_is_static_object(void *addr) { -- cgit v1.2.2 From 79ee842891595293be37c5aed0e75b4630166c5a Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Fri, 8 Dec 2017 11:56:08 +0900 Subject: sched/autogroup: remove unneeded kallsyms include Autogroup does not seem to use any of kallsyms functions/defines. Link: http://lkml.kernel.org/r/20171208025616.16267-2-sergey.senozhatsky@gmail.com To: Andrew Morton To: Michal Hocko To: Rafael Wysocki To: Len Brown To: Bjorn Helgaas To: Vlastimil Babka To: Tejun Heo To: Lai Jiangshan To: Thomas Gleixner To: Fengguang Wu Cc: Steven Rostedt Cc: LKML Cc: linux-pm@vger.kernel.org Cc: linux-pci@vger.kernel.org Cc: linux-mm@kvack.org Signed-off-by: Sergey Senozhatsky Acked-by: Peter Zijlstra (Intel) Signed-off-by: Petr Mladek --- kernel/sched/autogroup.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index a43df5193538..0786227a3f48 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -3,7 +3,6 @@ #include #include -#include #include #include #include -- cgit v1.2.2 From 25493e5fba2f7b8cdade29d0fc8945114ee7732b Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Fri, 8 Dec 2017 17:24:22 +0900 Subject: sched/autogroup: move sched.h include Move local "sched.h" include to the bottom. sched.h defines several macros that are getting redefined in ARCH-specific code, for instance, finish_arch_post_lock_switch() and prepare_arch_switch(), so we need ARCH-specific definitions to come in first. Link: http://lkml.kernel.org/r/20171208082422.5021-1-sergey.senozhatsky@gmail.com To: Martin Schwidefsky Cc: Steven Rostedt Cc: LKML Cc: linux-pm@vger.kernel.org Cc: linux-pci@vger.kernel.org Cc: linux-mm@kvack.org Suggested-by: Martin Schwidefsky Signed-off-by: Sergey Senozhatsky Acked-by: Peter Zijlstra (Intel) Signed-off-by: Petr Mladek --- kernel/sched/autogroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index 0786227a3f48..bb4b9fe026a1 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -1,12 +1,12 @@ // SPDX-License-Identifier: GPL-2.0 -#include "sched.h" - #include #include #include #include #include +#include "sched.h" + unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; static struct autogroup autogroup_default; static atomic_t autogroup_seq_nr; -- cgit v1.2.2 From f371b304f12e31fe30207c41ca7754564e0ea4dc Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 11 Dec 2017 11:39:02 -0800 Subject: bpf/tracing: allow user space to query prog array on the same tp Commit e87c6bc3852b ("bpf: permit multiple bpf attachments for a single perf event") added support to attach multiple bpf programs to a single perf event. Although this provides flexibility, users may want to know what other bpf programs attached to the same tp interface. Besides getting visibility for the underlying bpf system, such information may also help consolidate multiple bpf programs, understand potential performance issues due to a large array, and debug (e.g., one bpf program which overwrites return code may impact subsequent program results). Commit 2541517c32be ("tracing, perf: Implement BPF programs attached to kprobes") utilized the existing perf ioctl interface and added the command PERF_EVENT_IOC_SET_BPF to attach a bpf program to a tracepoint. This patch adds a new ioctl command, given a perf event fd, to query the bpf program array attached to the same perf tracepoint event. The new uapi ioctl command: PERF_EVENT_IOC_QUERY_BPF The new uapi/linux/perf_event.h structure: struct perf_event_query_bpf { __u32 ids_len; __u32 prog_cnt; __u32 ids[0]; }; User space provides buffer "ids" for kernel to copy to. When returning from the kernel, the number of available programs in the array is set in "prog_cnt". The usage: struct perf_event_query_bpf *query = malloc(sizeof(*query) + sizeof(u32) * ids_len); query.ids_len = ids_len; err = ioctl(pmu_efd, PERF_EVENT_IOC_QUERY_BPF, query); if (err == 0) { /* query.prog_cnt is the number of available progs, * number of progs in ids: (ids_len == 0) ? 0 : query.prog_cnt */ } else if (errno == ENOSPC) { /* query.ids_len number of progs copied, * query.prog_cnt is the number of available progs */ } else { /* other errors */ } Signed-off-by: Yonghong Song Acked-by: Peter Zijlstra (Intel) Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 21 +++++++++++++++++++++ kernel/events/core.c | 3 +++ kernel/trace/bpf_trace.c | 23 +++++++++++++++++++++++ 3 files changed, 47 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 86b50aa26ee8..b16c6f8f42b6 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1462,6 +1462,8 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, rcu_read_lock(); prog = rcu_dereference(progs)->progs; for (; *prog; prog++) { + if (*prog == &dummy_bpf_prog.prog) + continue; id = (*prog)->aux->id; if (copy_to_user(prog_ids + i, &id, sizeof(id))) { rcu_read_unlock(); @@ -1545,6 +1547,25 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, return 0; } +int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array, + __u32 __user *prog_ids, u32 request_cnt, + __u32 __user *prog_cnt) +{ + u32 cnt = 0; + + if (array) + cnt = bpf_prog_array_length(array); + + if (copy_to_user(prog_cnt, &cnt, sizeof(cnt))) + return -EFAULT; + + /* return early if user requested only program count or nothing to copy */ + if (!request_cnt || !cnt) + return 0; + + return bpf_prog_array_copy_to_user(array, prog_ids, request_cnt); +} + static void bpf_prog_free_deferred(struct work_struct *work) { struct bpf_prog_aux *aux; diff --git a/kernel/events/core.c b/kernel/events/core.c index 16beab4767e1..f10609e539d4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4723,6 +4723,9 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon rcu_read_unlock(); return 0; } + + case PERF_EVENT_IOC_QUERY_BPF: + return bpf_event_query_prog_array(event, (void __user *)arg); default: return -ENOTTY; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 0ce99c379c30..b143f2a05aff 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -820,3 +820,26 @@ void perf_event_detach_bpf_prog(struct perf_event *event) unlock: mutex_unlock(&bpf_event_mutex); } + +int bpf_event_query_prog_array(struct perf_event *event, void __user *info) +{ + struct perf_event_query_bpf __user *uquery = info; + struct perf_event_query_bpf query = {}; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (event->attr.type != PERF_TYPE_TRACEPOINT) + return -EINVAL; + if (copy_from_user(&query, uquery, sizeof(query))) + return -EFAULT; + + mutex_lock(&bpf_event_mutex); + ret = bpf_prog_array_copy_info(event->tp_event->prog_array, + uquery->ids, + query.ids_len, + &uquery->prog_cnt); + mutex_unlock(&bpf_event_mutex); + + return ret; +} -- cgit v1.2.2 From 92ace9991da08827e809c2d120108a96a281e7fc Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 11 Dec 2017 11:36:46 -0500 Subject: add infrastructure for tagging functions as error injectable Using BPF we can override kprob'ed functions and return arbitrary values. Obviously this can be a bit unsafe, so make this feature opt-in for functions. Simply tag a function with KPROBE_ERROR_INJECT_SYMBOL in order to give BPF access to that function for error injection purposes. Signed-off-by: Josef Bacik Acked-by: Ingo Molnar Signed-off-by: Alexei Starovoitov --- kernel/kprobes.c | 163 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/module.c | 6 +- 2 files changed, 168 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index da2ccf142358..b4aab48ad258 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -83,6 +83,16 @@ static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) return &(kretprobe_table_locks[hash].lock); } +/* List of symbols that can be overriden for error injection. */ +static LIST_HEAD(kprobe_error_injection_list); +static DEFINE_MUTEX(kprobe_ei_mutex); +struct kprobe_ei_entry { + struct list_head list; + unsigned long start_addr; + unsigned long end_addr; + void *priv; +}; + /* Blacklist -- list of struct kprobe_blacklist_entry */ static LIST_HEAD(kprobe_blacklist); @@ -1394,6 +1404,17 @@ bool within_kprobe_blacklist(unsigned long addr) return false; } +bool within_kprobe_error_injection_list(unsigned long addr) +{ + struct kprobe_ei_entry *ent; + + list_for_each_entry(ent, &kprobe_error_injection_list, list) { + if (addr >= ent->start_addr && addr < ent->end_addr) + return true; + } + return false; +} + /* * If we have a symbol_name argument, look it up and add the offset field * to it. This way, we can specify a relative address to a symbol. @@ -2168,6 +2189,86 @@ static int __init populate_kprobe_blacklist(unsigned long *start, return 0; } +#ifdef CONFIG_BPF_KPROBE_OVERRIDE +/* Markers of the _kprobe_error_inject_list section */ +extern unsigned long __start_kprobe_error_inject_list[]; +extern unsigned long __stop_kprobe_error_inject_list[]; + +/* + * Lookup and populate the kprobe_error_injection_list. + * + * For safety reasons we only allow certain functions to be overriden with + * bpf_error_injection, so we need to populate the list of the symbols that have + * been marked as safe for overriding. + */ +static void populate_kprobe_error_injection_list(unsigned long *start, + unsigned long *end, + void *priv) +{ + unsigned long *iter; + struct kprobe_ei_entry *ent; + unsigned long entry, offset = 0, size = 0; + + mutex_lock(&kprobe_ei_mutex); + for (iter = start; iter < end; iter++) { + entry = arch_deref_entry_point((void *)*iter); + + if (!kernel_text_address(entry) || + !kallsyms_lookup_size_offset(entry, &size, &offset)) { + pr_err("Failed to find error inject entry at %p\n", + (void *)entry); + continue; + } + + ent = kmalloc(sizeof(*ent), GFP_KERNEL); + if (!ent) + break; + ent->start_addr = entry; + ent->end_addr = entry + size; + ent->priv = priv; + INIT_LIST_HEAD(&ent->list); + list_add_tail(&ent->list, &kprobe_error_injection_list); + } + mutex_unlock(&kprobe_ei_mutex); +} + +static void __init populate_kernel_kprobe_ei_list(void) +{ + populate_kprobe_error_injection_list(__start_kprobe_error_inject_list, + __stop_kprobe_error_inject_list, + NULL); +} + +static void module_load_kprobe_ei_list(struct module *mod) +{ + if (!mod->num_kprobe_ei_funcs) + return; + populate_kprobe_error_injection_list(mod->kprobe_ei_funcs, + mod->kprobe_ei_funcs + + mod->num_kprobe_ei_funcs, mod); +} + +static void module_unload_kprobe_ei_list(struct module *mod) +{ + struct kprobe_ei_entry *ent, *n; + if (!mod->num_kprobe_ei_funcs) + return; + + mutex_lock(&kprobe_ei_mutex); + list_for_each_entry_safe(ent, n, &kprobe_error_injection_list, list) { + if (ent->priv == mod) { + list_del_init(&ent->list); + kfree(ent); + } + } + mutex_unlock(&kprobe_ei_mutex); +} +#else +static inline void __init populate_kernel_kprobe_ei_list(void) {} +static inline void module_load_kprobe_ei_list(struct module *m) {} +static inline void module_unload_kprobe_ei_list(struct module *m) {} +#endif + /* Module notifier call back, checking kprobes on the module */ static int kprobes_module_callback(struct notifier_block *nb, unsigned long val, void *data) @@ -2178,6 +2279,11 @@ static int kprobes_module_callback(struct notifier_block *nb, unsigned int i; int checkcore = (val == MODULE_STATE_GOING); + if (val == MODULE_STATE_COMING) + module_load_kprobe_ei_list(mod); + else if (val == MODULE_STATE_GOING) + module_unload_kprobe_ei_list(mod); + if (val != MODULE_STATE_GOING && val != MODULE_STATE_LIVE) return NOTIFY_DONE; @@ -2240,6 +2346,8 @@ static int __init init_kprobes(void) pr_err("Please take care of using kprobes.\n"); } + populate_kernel_kprobe_ei_list(); + if (kretprobe_blacklist_size) { /* lookup the function address from its name */ for (i = 0; kretprobe_blacklist[i].name != NULL; i++) { @@ -2407,6 +2515,56 @@ static const struct file_operations debugfs_kprobe_blacklist_ops = { .release = seq_release, }; +/* + * kprobes/error_injection_list -- shows which functions can be overriden for + * error injection. + * */ +static void *kprobe_ei_seq_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&kprobe_ei_mutex); + return seq_list_start(&kprobe_error_injection_list, *pos); +} + +static void kprobe_ei_seq_stop(struct seq_file *m, void *v) +{ + mutex_unlock(&kprobe_ei_mutex); +} + +static void *kprobe_ei_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + return seq_list_next(v, &kprobe_error_injection_list, pos); +} + +static int kprobe_ei_seq_show(struct seq_file *m, void *v) +{ + char buffer[KSYM_SYMBOL_LEN]; + struct kprobe_ei_entry *ent = + list_entry(v, struct kprobe_ei_entry, list); + + sprint_symbol(buffer, ent->start_addr); + seq_printf(m, "%s\n", buffer); + return 0; +} + +static const struct seq_operations kprobe_ei_seq_ops = { + .start = kprobe_ei_seq_start, + .next = kprobe_ei_seq_next, + .stop = kprobe_ei_seq_stop, + .show = kprobe_ei_seq_show, +}; + +static int kprobe_ei_open(struct inode *inode, struct file *filp) +{ + return seq_open(filp, &kprobe_ei_seq_ops); +} + +static const struct file_operations debugfs_kprobe_ei_ops = { + .open = kprobe_ei_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + static void arm_all_kprobes(void) { struct hlist_head *head; @@ -2548,6 +2706,11 @@ static int __init debugfs_kprobe_init(void) if (!file) goto error; + file = debugfs_create_file("error_injection_list", 0444, dir, NULL, + &debugfs_kprobe_ei_ops); + if (!file) + goto error; + return 0; error: diff --git a/kernel/module.c b/kernel/module.c index dea01ac9cb74..bd695bfdc5c4 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3118,7 +3118,11 @@ static int find_module_sections(struct module *mod, struct load_info *info) sizeof(*mod->ftrace_callsites), &mod->num_ftrace_callsites); #endif - +#ifdef CONFIG_BPF_KPROBE_OVERRIDE + mod->kprobe_ei_funcs = section_objs(info, "_kprobe_error_inject_list", + sizeof(*mod->kprobe_ei_funcs), + &mod->num_kprobe_ei_funcs); +#endif mod->extable = section_objs(info, "__ex_table", sizeof(*mod->extable), &mod->num_exentries); -- cgit v1.2.2 From 9802d86585db91655c7d1929a4f6bbe0952ea88e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 11 Dec 2017 11:36:48 -0500 Subject: bpf: add a bpf_override_function helper Error injection is sloppy and very ad-hoc. BPF could fill this niche perfectly with it's kprobe functionality. We could make sure errors are only triggered in specific call chains that we care about with very specific situations. Accomplish this with the bpf_override_funciton helper. This will modify the probe'd callers return value to the specified value and set the PC to an override function that simply returns, bypassing the originally probed function. This gives us a nice clean way to implement systematic error injection for all of our code paths. Acked-by: Alexei Starovoitov Acked-by: Ingo Molnar Signed-off-by: Josef Bacik Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 3 +++ kernel/bpf/verifier.c | 2 ++ kernel/events/core.c | 7 ++++++ kernel/trace/Kconfig | 11 +++++++++ kernel/trace/bpf_trace.c | 35 +++++++++++++++++++++++++++++ kernel/trace/trace_kprobe.c | 55 +++++++++++++++++++++++++++++++++++++++------ kernel/trace/trace_probe.h | 12 ++++++++++ 7 files changed, 118 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index b16c6f8f42b6..d32bebf4f2de 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1320,6 +1320,9 @@ EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp) { + if (fp->kprobe_override) + return false; + if (!array->owner_prog_type) { /* There's no owner yet where we could check for * compatibility. diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7afa92e9b409..e807bda7fe29 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4413,6 +4413,8 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) prog->dst_needed = 1; if (insn->imm == BPF_FUNC_get_prandom_u32) bpf_user_rnd_init_once(); + if (insn->imm == BPF_FUNC_override_return) + prog->kprobe_override = 1; if (insn->imm == BPF_FUNC_tail_call) { /* If we tail call into other programs, we * cannot make any assumptions since they can diff --git a/kernel/events/core.c b/kernel/events/core.c index f10609e539d4..5857c500721b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8080,6 +8080,13 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) return -EINVAL; } + /* Kprobe override only works for kprobes, not uprobes. */ + if (prog->kprobe_override && + !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) { + bpf_prog_put(prog); + return -EINVAL; + } + if (is_tracepoint || is_syscall_tp) { int off = trace_event_get_offsets(event->tp_event); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index af7dad126c13..3e6fd580fe7f 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -529,6 +529,17 @@ config FUNCTION_PROFILER If in doubt, say N. +config BPF_KPROBE_OVERRIDE + bool "Enable BPF programs to override a kprobed function" + depends on BPF_EVENTS + depends on KPROBES_ON_FTRACE + depends on HAVE_KPROBE_OVERRIDE + depends on DYNAMIC_FTRACE_WITH_REGS + default n + help + Allows BPF to override the execution of a probed function and + set a different return value. This is used for error injection. + config FTRACE_MCOUNT_RECORD def_bool y depends on DYNAMIC_FTRACE diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index b143f2a05aff..e009b7ecf473 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -13,6 +13,10 @@ #include #include #include +#include +#include + +#include "trace_probe.h" #include "trace.h" u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); @@ -76,6 +80,24 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) } EXPORT_SYMBOL_GPL(trace_call_bpf); +#ifdef CONFIG_BPF_KPROBE_OVERRIDE +BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc) +{ + __this_cpu_write(bpf_kprobe_override, 1); + regs_set_return_value(regs, rc); + arch_ftrace_kprobe_override_function(regs); + return 0; +} + +static const struct bpf_func_proto bpf_override_return_proto = { + .func = bpf_override_return, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; +#endif + BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr) { int ret; @@ -551,6 +573,10 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func return &bpf_get_stackid_proto; case BPF_FUNC_perf_event_read_value: return &bpf_perf_event_read_value_proto; +#ifdef CONFIG_BPF_KPROBE_OVERRIDE + case BPF_FUNC_override_return: + return &bpf_override_return_proto; +#endif default: return tracing_func_proto(func_id); } @@ -768,6 +794,15 @@ int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog_array *new_array; int ret = -EEXIST; + /* + * Kprobe override only works for ftrace based kprobes, and only if they + * are on the opt-in list. + */ + if (prog->kprobe_override && + (!trace_kprobe_ftrace(event->tp_event) || + !trace_kprobe_error_injectable(event->tp_event))) + return -EINVAL; + mutex_lock(&bpf_event_mutex); if (event->prog) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 492700c5fb4d..5db849809a56 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -42,6 +42,7 @@ struct trace_kprobe { (offsetof(struct trace_kprobe, tp.args) + \ (sizeof(struct probe_arg) * (n))) +DEFINE_PER_CPU(int, bpf_kprobe_override); static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk) { @@ -87,6 +88,27 @@ static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk) return nhit; } +int trace_kprobe_ftrace(struct trace_event_call *call) +{ + struct trace_kprobe *tk = (struct trace_kprobe *)call->data; + return kprobe_ftrace(&tk->rp.kp); +} + +int trace_kprobe_error_injectable(struct trace_event_call *call) +{ + struct trace_kprobe *tk = (struct trace_kprobe *)call->data; + unsigned long addr; + + if (tk->symbol) { + addr = (unsigned long) + kallsyms_lookup_name(trace_kprobe_symbol(tk)); + addr += tk->rp.kp.offset; + } else { + addr = (unsigned long)tk->rp.kp.addr; + } + return within_kprobe_error_injection_list(addr); +} + static int register_kprobe_event(struct trace_kprobe *tk); static int unregister_kprobe_event(struct trace_kprobe *tk); @@ -1170,7 +1192,7 @@ static int kretprobe_event_define_fields(struct trace_event_call *event_call) #ifdef CONFIG_PERF_EVENTS /* Kprobe profile handler */ -static void +static int kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) { struct trace_event_call *call = &tk->tp.call; @@ -1179,12 +1201,29 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) int size, __size, dsize; int rctx; - if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs)) - return; + if (bpf_prog_array_valid(call)) { + int ret; + + ret = trace_call_bpf(call, regs); + + /* + * We need to check and see if we modified the pc of the + * pt_regs, and if so clear the kprobe and return 1 so that we + * don't do the instruction skipping. Also reset our state so + * we are clean the next pass through. + */ + if (__this_cpu_read(bpf_kprobe_override)) { + __this_cpu_write(bpf_kprobe_override, 0); + reset_current_kprobe(); + return 1; + } + if (!ret) + return 0; + } head = this_cpu_ptr(call->perf_events); if (hlist_empty(head)) - return; + return 0; dsize = __get_data_size(&tk->tp, regs); __size = sizeof(*entry) + tk->tp.size + dsize; @@ -1193,13 +1232,14 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) entry = perf_trace_buf_alloc(size, NULL, &rctx); if (!entry) - return; + return 0; entry->ip = (unsigned long)tk->rp.kp.addr; memset(&entry[1], 0, dsize); store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, head, NULL); + return 0; } NOKPROBE_SYMBOL(kprobe_perf_func); @@ -1275,6 +1315,7 @@ static int kprobe_register(struct trace_event_call *event, static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) { struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp); + int ret = 0; raw_cpu_inc(*tk->nhit); @@ -1282,9 +1323,9 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) kprobe_trace_func(tk, regs); #ifdef CONFIG_PERF_EVENTS if (tk->tp.flags & TP_FLAG_PROFILE) - kprobe_perf_func(tk, regs); + ret = kprobe_perf_func(tk, regs); #endif - return 0; /* We don't tweek kernel, so just return 0 */ + return ret; } NOKPROBE_SYMBOL(kprobe_dispatcher); diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index fb66e3eaa192..5e54d748c84c 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -252,6 +252,8 @@ struct symbol_cache; unsigned long update_symbol_cache(struct symbol_cache *sc); void free_symbol_cache(struct symbol_cache *sc); struct symbol_cache *alloc_symbol_cache(const char *sym, long offset); +int trace_kprobe_ftrace(struct trace_event_call *call); +int trace_kprobe_error_injectable(struct trace_event_call *call); #else /* uprobes do not support symbol fetch methods */ #define fetch_symbol_u8 NULL @@ -277,6 +279,16 @@ alloc_symbol_cache(const char *sym, long offset) { return NULL; } + +static inline int trace_kprobe_ftrace(struct trace_event_call *call) +{ + return 0; +} + +static inline int trace_kprobe_error_injectable(struct trace_event_call *call) +{ + return 0; +} #endif /* CONFIG_KPROBE_EVENTS */ struct probe_arg { -- cgit v1.2.2 From f4e2298e63d24bb7f5cf0f56f72867973cb7e652 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 13 Dec 2017 10:35:37 -0800 Subject: bpf/tracing: fix kernel/events/core.c compilation error Commit f371b304f12e ("bpf/tracing: allow user space to query prog array on the same tp") introduced a perf ioctl command to query prog array attached to the same perf tracepoint. The commit introduced a compilation error under certain config conditions, e.g., (1). CONFIG_BPF_SYSCALL is not defined, or (2). CONFIG_TRACING is defined but neither CONFIG_UPROBE_EVENTS nor CONFIG_KPROBE_EVENTS is defined. Error message: kernel/events/core.o: In function `perf_ioctl': core.c:(.text+0x98c4): undefined reference to `bpf_event_query_prog_array' This patch fixed this error by guarding the real definition under CONFIG_BPF_EVENTS and provided static inline dummy function if CONFIG_BPF_EVENTS was not defined. It renamed the function from bpf_event_query_prog_array to perf_event_query_prog_array and moved the definition from linux/bpf.h to linux/trace_events.h so the definition is in proximity to other prog_array related functions. Fixes: f371b304f12e ("bpf/tracing: allow user space to query prog array on the same tp") Reported-by: Stephen Rothwell Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann --- kernel/events/core.c | 2 +- kernel/trace/bpf_trace.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 5857c500721b..34fda6aa4606 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4725,7 +4725,7 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon } case PERF_EVENT_IOC_QUERY_BPF: - return bpf_event_query_prog_array(event, (void __user *)arg); + return perf_event_query_prog_array(event, (void __user *)arg); default: return -ENOTTY; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index e009b7ecf473..c5dd60cc0751 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -856,7 +856,7 @@ unlock: mutex_unlock(&bpf_event_mutex); } -int bpf_event_query_prog_array(struct perf_event *event, void __user *info) +int perf_event_query_prog_array(struct perf_event *event, void __user *info) { struct perf_event_query_bpf __user *uquery = info; struct perf_event_query_bpf query = {}; -- cgit v1.2.2 From 8b2770a4e1c7a837e1889d4f00026e999da5faa0 Mon Sep 17 00:00:00 2001 From: Wolffhardt Schwabe Date: Thu, 14 Dec 2017 19:13:57 +0100 Subject: fix typo in assignment of fs default overflow gid The patch remains without practical effect since both macros carry identical values. Still, it might become a problem in the future if (for whatever reason) the default overflow uid and gid differ. The DEFAULT_FS_OVERFLOWGID macro was previously unused. Signed-off-by: Wolffhardt Schwabe Signed-off-by: Anatoliy Cherepantsev Signed-off-by: Eric W. Biederman --- kernel/sys.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 83ffd7dccf23..f2289de20e19 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -135,7 +135,7 @@ EXPORT_SYMBOL(overflowgid); */ int fs_overflowuid = DEFAULT_FS_OVERFLOWUID; -int fs_overflowgid = DEFAULT_FS_OVERFLOWUID; +int fs_overflowgid = DEFAULT_FS_OVERFLOWGID; EXPORT_SYMBOL(fs_overflowuid); EXPORT_SYMBOL(fs_overflowgid); -- cgit v1.2.2 From cc8b0b92a1699bc32f7fec71daa2bfc90de43a4d Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 14 Dec 2017 17:55:05 -0800 Subject: bpf: introduce function calls (function boundaries) Allow arbitrary function calls from bpf function to another bpf function. Since the beginning of bpf all bpf programs were represented as a single function and program authors were forced to use always_inline for all functions in their C code. That was causing llvm to unnecessary inflate the code size and forcing developers to move code to header files with little code reuse. With a bit of additional complexity teach verifier to recognize arbitrary function calls from one bpf function to another as long as all of functions are presented to the verifier as a single bpf program. New program layout: r6 = r1 // some code .. r1 = .. // arg1 r2 = .. // arg2 call pc+1 // function call pc-relative exit .. = r1 // access arg1 .. = r2 // access arg2 .. call pc+20 // second level of function call ... It allows for better optimized code and finally allows to introduce the core bpf libraries that can be reused in different projects, since programs are no longer limited by single elf file. With function calls bpf can be compiled into multiple .o files. This patch is the first step. It detects programs that contain multiple functions and checks that calls between them are valid. It splits the sequence of bpf instructions (one program) into a set of bpf functions that call each other. Calls to only known functions are allowed. In the future the verifier may allow calls to unresolved functions and will do dynamic linking. This logic supports statically linked bpf functions only. Such function boundary detection could have been done as part of control flow graph building in check_cfg(), but it's cleaner to separate function boundary detection vs control flow checks within a subprogram (function) into logically indepedent steps. Follow up patches may split check_cfg() further, but not check_subprogs(). Only allow bpf-to-bpf calls for root only and for non-hw-offloaded programs. These restrictions can be relaxed in the future. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- kernel/bpf/disasm.c | 8 ++- kernel/bpf/verifier.c | 141 +++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 145 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index e682850c9715..883f88fa5bfc 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -189,8 +189,12 @@ void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env, u8 opcode = BPF_OP(insn->code); if (opcode == BPF_CALL) { - verbose(env, "(%02x) call %s#%d\n", insn->code, - func_id_name(insn->imm), insn->imm); + if (insn->src_reg == BPF_PSEUDO_CALL) + verbose(env, "(%02x) call pc%+d\n", insn->code, + insn->imm); + else + verbose(env, "(%02x) call %s#%d\n", insn->code, + func_id_name(insn->imm), insn->imm); } else if (insn->code == (BPF_JMP | BPF_JA)) { verbose(env, "(%02x) goto pc%+d\n", insn->code, insn->off); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e807bda7fe29..1d0f7ff0b9a9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -20,6 +20,8 @@ #include #include #include +#include +#include #include "disasm.h" @@ -636,6 +638,113 @@ enum reg_arg_type { DST_OP_NO_MARK /* same as above, check only, don't mark */ }; +static int cmp_subprogs(const void *a, const void *b) +{ + return *(int *)a - *(int *)b; +} + +static int find_subprog(struct bpf_verifier_env *env, int off) +{ + u32 *p; + + p = bsearch(&off, env->subprog_starts, env->subprog_cnt, + sizeof(env->subprog_starts[0]), cmp_subprogs); + if (!p) + return -ENOENT; + return p - env->subprog_starts; + +} + +static int add_subprog(struct bpf_verifier_env *env, int off) +{ + int insn_cnt = env->prog->len; + int ret; + + if (off >= insn_cnt || off < 0) { + verbose(env, "call to invalid destination\n"); + return -EINVAL; + } + ret = find_subprog(env, off); + if (ret >= 0) + return 0; + if (env->subprog_cnt >= BPF_MAX_SUBPROGS) { + verbose(env, "too many subprograms\n"); + return -E2BIG; + } + env->subprog_starts[env->subprog_cnt++] = off; + sort(env->subprog_starts, env->subprog_cnt, + sizeof(env->subprog_starts[0]), cmp_subprogs, NULL); + return 0; +} + +static int check_subprogs(struct bpf_verifier_env *env) +{ + int i, ret, subprog_start, subprog_end, off, cur_subprog = 0; + struct bpf_insn *insn = env->prog->insnsi; + int insn_cnt = env->prog->len; + + /* determine subprog starts. The end is one before the next starts */ + for (i = 0; i < insn_cnt; i++) { + if (insn[i].code != (BPF_JMP | BPF_CALL)) + continue; + if (insn[i].src_reg != BPF_PSEUDO_CALL) + continue; + if (!env->allow_ptr_leaks) { + verbose(env, "function calls to other bpf functions are allowed for root only\n"); + return -EPERM; + } + if (bpf_prog_is_dev_bound(env->prog->aux)) { + verbose(env, "funcation calls in offloaded programs are not supported yet\n"); + return -EINVAL; + } + ret = add_subprog(env, i + insn[i].imm + 1); + if (ret < 0) + return ret; + } + + if (env->log.level > 1) + for (i = 0; i < env->subprog_cnt; i++) + verbose(env, "func#%d @%d\n", i, env->subprog_starts[i]); + + /* now check that all jumps are within the same subprog */ + subprog_start = 0; + if (env->subprog_cnt == cur_subprog) + subprog_end = insn_cnt; + else + subprog_end = env->subprog_starts[cur_subprog++]; + for (i = 0; i < insn_cnt; i++) { + u8 code = insn[i].code; + + if (BPF_CLASS(code) != BPF_JMP) + goto next; + if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL) + goto next; + off = i + insn[i].off + 1; + if (off < subprog_start || off >= subprog_end) { + verbose(env, "jump out of range from insn %d to %d\n", i, off); + return -EINVAL; + } +next: + if (i == subprog_end - 1) { + /* to avoid fall-through from one subprog into another + * the last insn of the subprog should be either exit + * or unconditional jump back + */ + if (code != (BPF_JMP | BPF_EXIT) && + code != (BPF_JMP | BPF_JA)) { + verbose(env, "last insn is not an exit or jmp\n"); + return -EINVAL; + } + subprog_start = subprog_end; + if (env->subprog_cnt == cur_subprog) + subprog_end = insn_cnt; + else + subprog_end = env->subprog_starts[cur_subprog++]; + } + } + return 0; +} + static void mark_reg_read(const struct bpf_verifier_state *state, u32 regno) { struct bpf_verifier_state *parent = state->parent; @@ -3284,6 +3393,10 @@ static int check_cfg(struct bpf_verifier_env *env) int ret = 0; int i, t; + ret = check_subprogs(env); + if (ret < 0) + return ret; + insn_state = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL); if (!insn_state) return -ENOMEM; @@ -3316,6 +3429,14 @@ peek_stack: goto err_free; if (t + 1 < insn_cnt) env->explored_states[t + 1] = STATE_LIST_MARK; + if (insns[t].src_reg == BPF_PSEUDO_CALL) { + env->explored_states[t] = STATE_LIST_MARK; + ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env); + if (ret == 1) + goto peek_stack; + else if (ret < 0) + goto err_free; + } } else if (opcode == BPF_JA) { if (BPF_SRC(insns[t].code) != BPF_K) { ret = -EINVAL; @@ -4245,6 +4366,19 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len, return 0; } +static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len) +{ + int i; + + if (len == 1) + return; + for (i = 0; i < env->subprog_cnt; i++) { + if (env->subprog_starts[i] < off) + continue; + env->subprog_starts[i] += len - 1; + } +} + static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off, const struct bpf_insn *patch, u32 len) { @@ -4255,6 +4389,7 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of return NULL; if (adjust_insn_aux_data(env, new_prog->len, off, len)) return NULL; + adjust_subprog_starts(env, off, len); return new_prog; } @@ -4408,6 +4543,8 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) for (i = 0; i < insn_cnt; i++, insn++) { if (insn->code != (BPF_JMP | BPF_CALL)) continue; + if (insn->src_reg == BPF_PSEUDO_CALL) + continue; if (insn->imm == BPF_FUNC_get_route_realm) prog->dst_needed = 1; @@ -4589,12 +4726,12 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) if (!env->explored_states) goto skip_full_check; + env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); + ret = check_cfg(env); if (ret < 0) goto skip_full_check; - env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); - ret = do_check(env); if (env->cur_state) { free_verifier_state(env->cur_state, true); -- cgit v1.2.2 From f4d7e40a5b7157e1329c3c5b10f60d8289fc2941 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 14 Dec 2017 17:55:06 -0800 Subject: bpf: introduce function calls (verification) Allow arbitrary function calls from bpf function to another bpf function. To recognize such set of bpf functions the verifier does: 1. runs control flow analysis to detect function boundaries 2. proceeds with verification of all functions starting from main(root) function It recognizes that the stack of the caller can be accessed by the callee (if the caller passed a pointer to its stack to the callee) and the callee can store map_value and other pointers into the stack of the caller. 3. keeps track of the stack_depth of each function to make sure that total stack depth is still less than 512 bytes 4. disallows pointers to the callee stack to be stored into the caller stack, since they will be invalid as soon as the callee returns 5. to reuse all of the existing state_pruning logic each function call is considered to be independent call from the verifier point of view. The verifier pretends to inline all function calls it sees are being called. It stores the callsite instruction index as part of the state to make sure that two calls to the same callee from two different places in the caller will be different from state pruning point of view 6. more safety checks are added to liveness analysis Implementation details: . struct bpf_verifier_state is now consists of all stack frames that led to this function . struct bpf_func_state represent one stack frame. It consists of registers in the given frame and its stack . propagate_liveness() logic had a premature optimization where mark_reg_read() and mark_stack_slot_read() were manually inlined with loop iterating over parents for each register or stack slot. Undo this optimization to reuse more complex mark_*_read() logic . skip_callee() logic is not necessary from safety point of view, but without it mark_*_read() markings become too conservative, since after returning from the funciton call a read of r6-r9 will incorrectly propagate the read marks into callee causing inefficient pruning later . mark_*_read() logic is now aware of control flow which makes it more complex. In the future the plan is to rewrite liveness to be hierarchical. So that liveness can be done within basic block only and control flow will be responsible for propagation of liveness information along cfg and between calls. . tail_calls and ld_abs insns are not allowed in the programs with bpf-to-bpf calls . returning stack pointers to the caller or storing them into stack frame of the caller is not allowed Testing: . no difference in cilium processed_insn numbers . large number of tests follows in next patches Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 702 +++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 551 insertions(+), 151 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1d0f7ff0b9a9..f6e09d84a96f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -229,13 +229,23 @@ static void print_liveness(struct bpf_verifier_env *env, verbose(env, "w"); } +static struct bpf_func_state *func(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg) +{ + struct bpf_verifier_state *cur = env->cur_state; + + return cur->frame[reg->frameno]; +} + static void print_verifier_state(struct bpf_verifier_env *env, - struct bpf_verifier_state *state) + const struct bpf_func_state *state) { - struct bpf_reg_state *reg; + const struct bpf_reg_state *reg; enum bpf_reg_type t; int i; + if (state->frameno) + verbose(env, " frame%d:", state->frameno); for (i = 0; i < MAX_BPF_REG; i++) { reg = &state->regs[i]; t = reg->type; @@ -248,6 +258,8 @@ static void print_verifier_state(struct bpf_verifier_env *env, tnum_is_const(reg->var_off)) { /* reg->off should be 0 for SCALAR_VALUE */ verbose(env, "%lld", reg->var_off.value + reg->off); + if (t == PTR_TO_STACK) + verbose(env, ",call_%d", func(env, reg)->callsite); } else { verbose(env, "(id=%d", reg->id); if (t != SCALAR_VALUE) @@ -303,8 +315,8 @@ static void print_verifier_state(struct bpf_verifier_env *env, verbose(env, "\n"); } -static int copy_stack_state(struct bpf_verifier_state *dst, - const struct bpf_verifier_state *src) +static int copy_stack_state(struct bpf_func_state *dst, + const struct bpf_func_state *src) { if (!src->stack) return 0; @@ -320,13 +332,13 @@ static int copy_stack_state(struct bpf_verifier_state *dst, /* do_check() starts with zero-sized stack in struct bpf_verifier_state to * make it consume minimal amount of memory. check_stack_write() access from - * the program calls into realloc_verifier_state() to grow the stack size. + * the program calls into realloc_func_state() to grow the stack size. * Note there is a non-zero 'parent' pointer inside bpf_verifier_state * which this function copies over. It points to previous bpf_verifier_state * which is never reallocated */ -static int realloc_verifier_state(struct bpf_verifier_state *state, int size, - bool copy_old) +static int realloc_func_state(struct bpf_func_state *state, int size, + bool copy_old) { u32 old_size = state->allocated_stack; struct bpf_stack_state *new_stack; @@ -359,10 +371,21 @@ static int realloc_verifier_state(struct bpf_verifier_state *state, int size, return 0; } +static void free_func_state(struct bpf_func_state *state) +{ + kfree(state->stack); + kfree(state); +} + static void free_verifier_state(struct bpf_verifier_state *state, bool free_self) { - kfree(state->stack); + int i; + + for (i = 0; i <= state->curframe; i++) { + free_func_state(state->frame[i]); + state->frame[i] = NULL; + } if (free_self) kfree(state); } @@ -370,18 +393,46 @@ static void free_verifier_state(struct bpf_verifier_state *state, /* copy verifier state from src to dst growing dst stack space * when necessary to accommodate larger src stack */ -static int copy_verifier_state(struct bpf_verifier_state *dst, - const struct bpf_verifier_state *src) +static int copy_func_state(struct bpf_func_state *dst, + const struct bpf_func_state *src) { int err; - err = realloc_verifier_state(dst, src->allocated_stack, false); + err = realloc_func_state(dst, src->allocated_stack, false); if (err) return err; - memcpy(dst, src, offsetof(struct bpf_verifier_state, allocated_stack)); + memcpy(dst, src, offsetof(struct bpf_func_state, allocated_stack)); return copy_stack_state(dst, src); } +static int copy_verifier_state(struct bpf_verifier_state *dst_state, + const struct bpf_verifier_state *src) +{ + struct bpf_func_state *dst; + int i, err; + + /* if dst has more stack frames then src frame, free them */ + for (i = src->curframe + 1; i <= dst_state->curframe; i++) { + free_func_state(dst_state->frame[i]); + dst_state->frame[i] = NULL; + } + dst_state->curframe = src->curframe; + dst_state->parent = src->parent; + for (i = 0; i <= src->curframe; i++) { + dst = dst_state->frame[i]; + if (!dst) { + dst = kzalloc(sizeof(*dst), GFP_KERNEL); + if (!dst) + return -ENOMEM; + dst_state->frame[i] = dst; + } + err = copy_func_state(dst, src->frame[i]); + if (err) + return err; + } + return 0; +} + static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, int *insn_idx) { @@ -443,6 +494,10 @@ err: static const int caller_saved[CALLER_SAVED_REGS] = { BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 }; +#define CALLEE_SAVED_REGS 5 +static const int callee_saved[CALLEE_SAVED_REGS] = { + BPF_REG_6, BPF_REG_7, BPF_REG_8, BPF_REG_9 +}; static void __mark_reg_not_init(struct bpf_reg_state *reg); @@ -578,6 +633,7 @@ static void __mark_reg_unknown(struct bpf_reg_state *reg) reg->id = 0; reg->off = 0; reg->var_off = tnum_unknown; + reg->frameno = 0; __mark_reg_unbounded(reg); } @@ -614,8 +670,9 @@ static void mark_reg_not_init(struct bpf_verifier_env *env, } static void init_reg_state(struct bpf_verifier_env *env, - struct bpf_reg_state *regs) + struct bpf_func_state *state) { + struct bpf_reg_state *regs = state->regs; int i; for (i = 0; i < MAX_BPF_REG; i++) { @@ -626,12 +683,24 @@ static void init_reg_state(struct bpf_verifier_env *env, /* frame pointer */ regs[BPF_REG_FP].type = PTR_TO_STACK; mark_reg_known_zero(env, regs, BPF_REG_FP); + regs[BPF_REG_FP].frameno = state->frameno; /* 1st arg to a function */ regs[BPF_REG_1].type = PTR_TO_CTX; mark_reg_known_zero(env, regs, BPF_REG_1); } +#define BPF_MAIN_FUNC (-1) +static void init_func_state(struct bpf_verifier_env *env, + struct bpf_func_state *state, + int callsite, int frameno, int subprogno) +{ + state->callsite = callsite; + state->frameno = frameno; + state->subprogno = subprogno; + init_reg_state(env, state); +} + enum reg_arg_type { SRC_OP, /* register is used as source operand */ DST_OP, /* register is used as destination operand */ @@ -745,29 +814,86 @@ next: return 0; } -static void mark_reg_read(const struct bpf_verifier_state *state, u32 regno) +struct bpf_verifier_state *skip_callee(struct bpf_verifier_env *env, + const struct bpf_verifier_state *state, + struct bpf_verifier_state *parent, + u32 regno) { - struct bpf_verifier_state *parent = state->parent; + struct bpf_verifier_state *tmp = NULL; + + /* 'parent' could be a state of caller and + * 'state' could be a state of callee. In such case + * parent->curframe < state->curframe + * and it's ok for r1 - r5 registers + * + * 'parent' could be a callee's state after it bpf_exit-ed. + * In such case parent->curframe > state->curframe + * and it's ok for r0 only + */ + if (parent->curframe == state->curframe || + (parent->curframe < state->curframe && + regno >= BPF_REG_1 && regno <= BPF_REG_5) || + (parent->curframe > state->curframe && + regno == BPF_REG_0)) + return parent; + + if (parent->curframe > state->curframe && + regno >= BPF_REG_6) { + /* for callee saved regs we have to skip the whole chain + * of states that belong to callee and mark as LIVE_READ + * the registers before the call + */ + tmp = parent; + while (tmp && tmp->curframe != state->curframe) { + tmp = tmp->parent; + } + if (!tmp) + goto bug; + parent = tmp; + } else { + goto bug; + } + return parent; +bug: + verbose(env, "verifier bug regno %d tmp %p\n", regno, tmp); + verbose(env, "regno %d parent frame %d current frame %d\n", + regno, parent->curframe, state->curframe); + return 0; +} + +static int mark_reg_read(struct bpf_verifier_env *env, + const struct bpf_verifier_state *state, + struct bpf_verifier_state *parent, + u32 regno) +{ + bool writes = parent == state->parent; /* Observe write marks */ if (regno == BPF_REG_FP) /* We don't need to worry about FP liveness because it's read-only */ - return; + return 0; while (parent) { /* if read wasn't screened by an earlier write ... */ - if (state->regs[regno].live & REG_LIVE_WRITTEN) + if (writes && state->frame[state->curframe]->regs[regno].live & REG_LIVE_WRITTEN) break; + parent = skip_callee(env, state, parent, regno); + if (!parent) + return -EFAULT; /* ... then we depend on parent's value */ - parent->regs[regno].live |= REG_LIVE_READ; + parent->frame[parent->curframe]->regs[regno].live |= REG_LIVE_READ; state = parent; parent = state->parent; + writes = true; } + return 0; } static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, enum reg_arg_type t) { - struct bpf_reg_state *regs = env->cur_state->regs; + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; + struct bpf_reg_state *regs = state->regs; if (regno >= MAX_BPF_REG) { verbose(env, "R%d is invalid\n", regno); @@ -780,7 +906,7 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, verbose(env, "R%d !read_ok\n", regno); return -EACCES; } - mark_reg_read(env->cur_state, regno); + return mark_reg_read(env, vstate, vstate->parent, regno); } else { /* check whether register used as dest operand can be written to */ if (regno == BPF_REG_FP) { @@ -815,13 +941,15 @@ static bool is_spillable_regtype(enum bpf_reg_type type) * stack boundary and alignment are checked in check_mem_access() */ static int check_stack_write(struct bpf_verifier_env *env, - struct bpf_verifier_state *state, int off, - int size, int value_regno) + struct bpf_func_state *state, /* func where register points to */ + int off, int size, int value_regno) { + struct bpf_func_state *cur; /* state of the current function */ int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err; + enum bpf_reg_type type; - err = realloc_verifier_state(state, round_up(slot + 1, BPF_REG_SIZE), - true); + err = realloc_func_state(state, round_up(slot + 1, BPF_REG_SIZE), + true); if (err) return err; /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0, @@ -834,8 +962,9 @@ static int check_stack_write(struct bpf_verifier_env *env, return -EACCES; } + cur = env->cur_state->frame[env->cur_state->curframe]; if (value_regno >= 0 && - is_spillable_regtype(state->regs[value_regno].type)) { + is_spillable_regtype((type = cur->regs[value_regno].type))) { /* register containing pointer is being spilled into stack */ if (size != BPF_REG_SIZE) { @@ -843,8 +972,13 @@ static int check_stack_write(struct bpf_verifier_env *env, return -EACCES; } + if (state != cur && type == PTR_TO_STACK) { + verbose(env, "cannot spill pointers to stack into stack frame of the caller\n"); + return -EINVAL; + } + /* save register state */ - state->stack[spi].spilled_ptr = state->regs[value_regno]; + state->stack[spi].spilled_ptr = cur->regs[value_regno]; state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; for (i = 0; i < BPF_REG_SIZE; i++) @@ -860,34 +994,68 @@ static int check_stack_write(struct bpf_verifier_env *env, return 0; } -static void mark_stack_slot_read(const struct bpf_verifier_state *state, int slot) +/* registers of every function are unique and mark_reg_read() propagates + * the liveness in the following cases: + * - from callee into caller for R1 - R5 that were used as arguments + * - from caller into callee for R0 that used as result of the call + * - from caller to the same caller skipping states of the callee for R6 - R9, + * since R6 - R9 are callee saved by implicit function prologue and + * caller's R6 != callee's R6, so when we propagate liveness up to + * parent states we need to skip callee states for R6 - R9. + * + * stack slot marking is different, since stacks of caller and callee are + * accessible in both (since caller can pass a pointer to caller's stack to + * callee which can pass it to another function), hence mark_stack_slot_read() + * has to propagate the stack liveness to all parent states at given frame number. + * Consider code: + * f1() { + * ptr = fp - 8; + * *ptr = ctx; + * call f2 { + * .. = *ptr; + * } + * .. = *ptr; + * } + * First *ptr is reading from f1's stack and mark_stack_slot_read() has + * to mark liveness at the f1's frame and not f2's frame. + * Second *ptr is also reading from f1's stack and mark_stack_slot_read() has + * to propagate liveness to f2 states at f1's frame level and further into + * f1 states at f1's frame level until write into that stack slot + */ +static void mark_stack_slot_read(struct bpf_verifier_env *env, + const struct bpf_verifier_state *state, + struct bpf_verifier_state *parent, + int slot, int frameno) { - struct bpf_verifier_state *parent = state->parent; + bool writes = parent == state->parent; /* Observe write marks */ while (parent) { /* if read wasn't screened by an earlier write ... */ - if (state->stack[slot].spilled_ptr.live & REG_LIVE_WRITTEN) + if (writes && state->frame[frameno]->stack[slot].spilled_ptr.live & REG_LIVE_WRITTEN) break; /* ... then we depend on parent's value */ - parent->stack[slot].spilled_ptr.live |= REG_LIVE_READ; + parent->frame[frameno]->stack[slot].spilled_ptr.live |= REG_LIVE_READ; state = parent; parent = state->parent; + writes = true; } } static int check_stack_read(struct bpf_verifier_env *env, - struct bpf_verifier_state *state, int off, int size, - int value_regno) + struct bpf_func_state *reg_state /* func where register points to */, + int off, int size, int value_regno) { + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; int i, slot = -off - 1, spi = slot / BPF_REG_SIZE; u8 *stype; - if (state->allocated_stack <= slot) { + if (reg_state->allocated_stack <= slot) { verbose(env, "invalid read from stack off %d+0 size %d\n", off, size); return -EACCES; } - stype = state->stack[spi].slot_type; + stype = reg_state->stack[spi].slot_type; if (stype[0] == STACK_SPILL) { if (size != BPF_REG_SIZE) { @@ -903,13 +1071,14 @@ static int check_stack_read(struct bpf_verifier_env *env, if (value_regno >= 0) { /* restore register state from stack */ - state->regs[value_regno] = state->stack[spi].spilled_ptr; + state->regs[value_regno] = reg_state->stack[spi].spilled_ptr; /* mark reg as written since spilled pointer state likely * has its liveness marks cleared by is_state_visited() * which resets stack/reg liveness for state transitions */ state->regs[value_regno].live |= REG_LIVE_WRITTEN; - mark_stack_slot_read(state, spi); + mark_stack_slot_read(env, vstate, vstate->parent, spi, + reg_state->frameno); } return 0; } else { @@ -947,7 +1116,8 @@ static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off, static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off, int size, bool zero_size_allowed) { - struct bpf_verifier_state *state = env->cur_state; + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *reg = &state->regs[regno]; int err; @@ -1197,6 +1367,39 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, strict); } +static int update_stack_depth(struct bpf_verifier_env *env, + const struct bpf_func_state *func, + int off) +{ + u16 stack = env->subprog_stack_depth[func->subprogno], total = 0; + struct bpf_verifier_state *cur = env->cur_state; + int i; + + if (stack >= -off) + return 0; + + /* update known max for given subprogram */ + env->subprog_stack_depth[func->subprogno] = -off; + + /* compute the total for current call chain */ + for (i = 0; i <= cur->curframe; i++) { + u32 depth = env->subprog_stack_depth[cur->frame[i]->subprogno]; + + /* round up to 32-bytes, since this is granularity + * of interpreter stack sizes + */ + depth = round_up(depth, 32); + total += depth; + } + + if (total > MAX_BPF_STACK) { + verbose(env, "combined stack size of %d calls is %d. Too large\n", + cur->curframe, total); + return -EACCES; + } + return 0; +} + /* check whether memory at (regno + off) is accessible for t = (read | write) * if t==write, value_regno is a register which value is stored into memory * if t==read, value_regno is a register which will receive the value from memory @@ -1207,9 +1410,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn int bpf_size, enum bpf_access_type t, int value_regno) { - struct bpf_verifier_state *state = env->cur_state; struct bpf_reg_state *regs = cur_regs(env); struct bpf_reg_state *reg = regs + regno; + struct bpf_func_state *state; int size, err = 0; size = bpf_size_to_bytes(bpf_size); @@ -1298,8 +1501,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn return -EACCES; } - if (env->prog->aux->stack_depth < -off) - env->prog->aux->stack_depth = -off; + state = func(env, reg); + err = update_stack_depth(env, state, off); + if (err) + return err; if (t == BPF_WRITE) err = check_stack_write(env, state, off, size, @@ -1390,7 +1595,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, struct bpf_call_arg_meta *meta) { struct bpf_reg_state *reg = cur_regs(env) + regno; - struct bpf_verifier_state *state = env->cur_state; + struct bpf_func_state *state = func(env, reg); int off, i, slot, spi; if (reg->type != PTR_TO_STACK) { @@ -1421,9 +1626,6 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, return -EACCES; } - if (env->prog->aux->stack_depth < -off) - env->prog->aux->stack_depth = -off; - if (meta && meta->raw_mode) { meta->access_size = access_size; meta->regno = regno; @@ -1441,7 +1643,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, return -EACCES; } } - return 0; + return update_stack_depth(env, state, off); } static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, @@ -1694,6 +1896,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, case BPF_FUNC_tail_call: if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY) goto error; + if (env->subprog_cnt) { + verbose(env, "tail_calls are not allowed in programs with bpf-to-bpf calls\n"); + return -EINVAL; + } break; case BPF_FUNC_perf_event_read: case BPF_FUNC_perf_event_output: @@ -1755,9 +1961,9 @@ static int check_raw_mode(const struct bpf_func_proto *fn) /* Packet data might have moved, any old PTR_TO_PACKET[_META,_END] * are now invalid, so turn them into unknown SCALAR_VALUE. */ -static void clear_all_pkt_pointers(struct bpf_verifier_env *env) +static void __clear_all_pkt_pointers(struct bpf_verifier_env *env, + struct bpf_func_state *state) { - struct bpf_verifier_state *state = env->cur_state; struct bpf_reg_state *regs = state->regs, *reg; int i; @@ -1774,7 +1980,121 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) } } -static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) +static void clear_all_pkt_pointers(struct bpf_verifier_env *env) +{ + struct bpf_verifier_state *vstate = env->cur_state; + int i; + + for (i = 0; i <= vstate->curframe; i++) + __clear_all_pkt_pointers(env, vstate->frame[i]); +} + +static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, + int *insn_idx) +{ + struct bpf_verifier_state *state = env->cur_state; + struct bpf_func_state *caller, *callee; + int i, subprog, target_insn; + + if (state->curframe >= MAX_CALL_FRAMES) { + verbose(env, "the call stack of %d frames is too deep\n", + state->curframe); + return -E2BIG; + } + + target_insn = *insn_idx + insn->imm; + subprog = find_subprog(env, target_insn + 1); + if (subprog < 0) { + verbose(env, "verifier bug. No program starts at insn %d\n", + target_insn + 1); + return -EFAULT; + } + + caller = state->frame[state->curframe]; + if (state->frame[state->curframe + 1]) { + verbose(env, "verifier bug. Frame %d already allocated\n", + state->curframe + 1); + return -EFAULT; + } + + callee = kzalloc(sizeof(*callee), GFP_KERNEL); + if (!callee) + return -ENOMEM; + state->frame[state->curframe + 1] = callee; + + /* callee cannot access r0, r6 - r9 for reading and has to write + * into its own stack before reading from it. + * callee can read/write into caller's stack + */ + init_func_state(env, callee, + /* remember the callsite, it will be used by bpf_exit */ + *insn_idx /* callsite */, + state->curframe + 1 /* frameno within this callchain */, + subprog + 1 /* subprog number within this prog */); + + /* copy r1 - r5 args that callee can access */ + for (i = BPF_REG_1; i <= BPF_REG_5; i++) + callee->regs[i] = caller->regs[i]; + + /* after the call regsiters r0 - r5 were scratched */ + for (i = 0; i < CALLER_SAVED_REGS; i++) { + mark_reg_not_init(env, caller->regs, caller_saved[i]); + check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); + } + + /* only increment it after check_reg_arg() finished */ + state->curframe++; + + /* and go analyze first insn of the callee */ + *insn_idx = target_insn; + + if (env->log.level) { + verbose(env, "caller:\n"); + print_verifier_state(env, caller); + verbose(env, "callee:\n"); + print_verifier_state(env, callee); + } + return 0; +} + +static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) +{ + struct bpf_verifier_state *state = env->cur_state; + struct bpf_func_state *caller, *callee; + struct bpf_reg_state *r0; + + callee = state->frame[state->curframe]; + r0 = &callee->regs[BPF_REG_0]; + if (r0->type == PTR_TO_STACK) { + /* technically it's ok to return caller's stack pointer + * (or caller's caller's pointer) back to the caller, + * since these pointers are valid. Only current stack + * pointer will be invalid as soon as function exits, + * but let's be conservative + */ + verbose(env, "cannot return stack pointer to the caller\n"); + return -EINVAL; + } + + state->curframe--; + caller = state->frame[state->curframe]; + /* return to the caller whatever r0 had in the callee */ + caller->regs[BPF_REG_0] = *r0; + + *insn_idx = callee->callsite + 1; + if (env->log.level) { + verbose(env, "returning from callee:\n"); + print_verifier_state(env, callee); + verbose(env, "to caller at %d:\n", *insn_idx); + print_verifier_state(env, caller); + } + /* clear everything in the callee */ + free_func_state(callee); + state->frame[state->curframe + 1] = NULL; + return 0; +} + +static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx) { const struct bpf_func_proto *fn = NULL; struct bpf_reg_state *regs; @@ -1934,7 +2254,9 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, const struct bpf_reg_state *ptr_reg, const struct bpf_reg_state *off_reg) { - struct bpf_reg_state *regs = cur_regs(env), *dst_reg; + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; + struct bpf_reg_state *regs = state->regs, *dst_reg; bool known = tnum_is_const(off_reg->var_off); s64 smin_val = off_reg->smin_value, smax_val = off_reg->smax_value, smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value; @@ -1946,13 +2268,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst_reg = ®s[dst]; if (WARN_ON_ONCE(known && (smin_val != smax_val))) { - print_verifier_state(env, env->cur_state); + print_verifier_state(env, state); verbose(env, "verifier internal error: known but bad sbounds\n"); return -EINVAL; } if (WARN_ON_ONCE(known && (umin_val != umax_val))) { - print_verifier_state(env, env->cur_state); + print_verifier_state(env, state); verbose(env, "verifier internal error: known but bad ubounds\n"); return -EINVAL; @@ -2354,7 +2676,9 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, struct bpf_insn *insn) { - struct bpf_reg_state *regs = cur_regs(env), *dst_reg, *src_reg; + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; + struct bpf_reg_state *regs = state->regs, *dst_reg, *src_reg; struct bpf_reg_state *ptr_reg = NULL, off_reg = {0}; u8 opcode = BPF_OP(insn->code); int rc; @@ -2428,12 +2752,12 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, /* Got here implies adding two SCALAR_VALUEs */ if (WARN_ON_ONCE(ptr_reg)) { - print_verifier_state(env, env->cur_state); + print_verifier_state(env, state); verbose(env, "verifier internal error: unexpected ptr_reg\n"); return -EINVAL; } if (WARN_ON(!src_reg)) { - print_verifier_state(env, env->cur_state); + print_verifier_state(env, state); verbose(env, "verifier internal error: no src_reg\n"); return -EINVAL; } @@ -2587,14 +2911,15 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return 0; } -static void find_good_pkt_pointers(struct bpf_verifier_state *state, +static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, struct bpf_reg_state *dst_reg, enum bpf_reg_type type, bool range_right_open) { + struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *regs = state->regs, *reg; u16 new_range; - int i; + int i, j; if (dst_reg->off < 0 || (dst_reg->off == 0 && range_right_open)) @@ -2664,12 +2989,15 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state, /* keep the maximum range already checked */ regs[i].range = max(regs[i].range, new_range); - for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { - if (state->stack[i].slot_type[0] != STACK_SPILL) - continue; - reg = &state->stack[i].spilled_ptr; - if (reg->type == type && reg->id == dst_reg->id) - reg->range = max(reg->range, new_range); + for (j = 0; j <= vstate->curframe; j++) { + state = vstate->frame[j]; + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { + if (state->stack[i].slot_type[0] != STACK_SPILL) + continue; + reg = &state->stack[i].spilled_ptr; + if (reg->type == type && reg->id == dst_reg->id) + reg->range = max(reg->range, new_range); + } } } @@ -2907,20 +3235,24 @@ static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id, /* The logic is similar to find_good_pkt_pointers(), both could eventually * be folded together at some point. */ -static void mark_map_regs(struct bpf_verifier_state *state, u32 regno, +static void mark_map_regs(struct bpf_verifier_state *vstate, u32 regno, bool is_null) { + struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *regs = state->regs; u32 id = regs[regno].id; - int i; + int i, j; for (i = 0; i < MAX_BPF_REG; i++) mark_map_reg(regs, i, id, is_null); - for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { - if (state->stack[i].slot_type[0] != STACK_SPILL) - continue; - mark_map_reg(&state->stack[i].spilled_ptr, 0, id, is_null); + for (j = 0; j <= vstate->curframe; j++) { + state = vstate->frame[j]; + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { + if (state->stack[i].slot_type[0] != STACK_SPILL) + continue; + mark_map_reg(&state->stack[i].spilled_ptr, 0, id, is_null); + } } } @@ -3020,8 +3352,10 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn, static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx) { - struct bpf_verifier_state *other_branch, *this_branch = env->cur_state; - struct bpf_reg_state *regs = this_branch->regs, *dst_reg; + struct bpf_verifier_state *this_branch = env->cur_state; + struct bpf_verifier_state *other_branch; + struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs; + struct bpf_reg_state *dst_reg, *other_branch_regs; u8 opcode = BPF_OP(insn->code); int err; @@ -3084,6 +3418,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx); if (!other_branch) return -EFAULT; + other_branch_regs = other_branch->frame[other_branch->curframe]->regs; /* detect if we are comparing against a constant value so we can adjust * our min/max values for our dst register. @@ -3096,22 +3431,22 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, if (dst_reg->type == SCALAR_VALUE && regs[insn->src_reg].type == SCALAR_VALUE) { if (tnum_is_const(regs[insn->src_reg].var_off)) - reg_set_min_max(&other_branch->regs[insn->dst_reg], + reg_set_min_max(&other_branch_regs[insn->dst_reg], dst_reg, regs[insn->src_reg].var_off.value, opcode); else if (tnum_is_const(dst_reg->var_off)) - reg_set_min_max_inv(&other_branch->regs[insn->src_reg], + reg_set_min_max_inv(&other_branch_regs[insn->src_reg], ®s[insn->src_reg], dst_reg->var_off.value, opcode); else if (opcode == BPF_JEQ || opcode == BPF_JNE) /* Comparing for equality, we can combine knowledge */ - reg_combine_min_max(&other_branch->regs[insn->src_reg], - &other_branch->regs[insn->dst_reg], + reg_combine_min_max(&other_branch_regs[insn->src_reg], + &other_branch_regs[insn->dst_reg], ®s[insn->src_reg], ®s[insn->dst_reg], opcode); } } else if (dst_reg->type == SCALAR_VALUE) { - reg_set_min_max(&other_branch->regs[insn->dst_reg], + reg_set_min_max(&other_branch_regs[insn->dst_reg], dst_reg, insn->imm, opcode); } @@ -3132,7 +3467,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, return -EACCES; } if (env->log.level) - print_verifier_state(env, this_branch); + print_verifier_state(env, this_branch->frame[this_branch->curframe]); return 0; } @@ -3217,6 +3552,18 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EINVAL; } + if (env->subprog_cnt) { + /* when program has LD_ABS insn JITs and interpreter assume + * that r1 == ctx == skb which is not the case for callees + * that can have arbitrary arguments. It's problematic + * for main prog as well since JITs would need to analyze + * all functions in order to make proper register save/restore + * decisions in the main prog. Hence disallow LD_ABS with calls + */ + verbose(env, "BPF_LD_[ABS|IND] instructions cannot be mixed with bpf-to-bpf calls\n"); + return -EINVAL; + } + if (insn->dst_reg != BPF_REG_0 || insn->off != 0 || BPF_SIZE(insn->code) == BPF_DW || (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) { @@ -3555,11 +3902,21 @@ static bool check_ids(u32 old_id, u32 cur_id, struct idpair *idmap) static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, struct idpair *idmap) { + bool equal; + if (!(rold->live & REG_LIVE_READ)) /* explored state didn't use this */ return true; - if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, live)) == 0) + equal = memcmp(rold, rcur, offsetof(struct bpf_reg_state, frameno)) == 0; + + if (rold->type == PTR_TO_STACK) + /* two stack pointers are equal only if they're pointing to + * the same stack frame, since fp-8 in foo != fp-8 in bar + */ + return equal && rold->frameno == rcur->frameno; + + if (equal) return true; if (rold->type == NOT_INIT) @@ -3632,7 +3989,6 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, tnum_in(rold->var_off, rcur->var_off); case PTR_TO_CTX: case CONST_PTR_TO_MAP: - case PTR_TO_STACK: case PTR_TO_PACKET_END: /* Only valid matches are exact, which memcmp() above * would have accepted @@ -3647,8 +4003,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, return false; } -static bool stacksafe(struct bpf_verifier_state *old, - struct bpf_verifier_state *cur, +static bool stacksafe(struct bpf_func_state *old, + struct bpf_func_state *cur, struct idpair *idmap) { int i, spi; @@ -3724,9 +4080,8 @@ static bool stacksafe(struct bpf_verifier_state *old, * whereas register type in current state is meaningful, it means that * the current state will reach 'bpf_exit' instruction safely */ -static bool states_equal(struct bpf_verifier_env *env, - struct bpf_verifier_state *old, - struct bpf_verifier_state *cur) +static bool func_states_equal(struct bpf_func_state *old, + struct bpf_func_state *cur) { struct idpair *idmap; bool ret = false; @@ -3750,71 +4105,76 @@ out_free: return ret; } +static bool states_equal(struct bpf_verifier_env *env, + struct bpf_verifier_state *old, + struct bpf_verifier_state *cur) +{ + int i; + + if (old->curframe != cur->curframe) + return false; + + /* for states to be equal callsites have to be the same + * and all frame states need to be equivalent + */ + for (i = 0; i <= old->curframe; i++) { + if (old->frame[i]->callsite != cur->frame[i]->callsite) + return false; + if (!func_states_equal(old->frame[i], cur->frame[i])) + return false; + } + return true; +} + /* A write screens off any subsequent reads; but write marks come from the - * straight-line code between a state and its parent. When we arrive at a - * jump target (in the first iteration of the propagate_liveness() loop), - * we didn't arrive by the straight-line code, so read marks in state must - * propagate to parent regardless of state's write marks. + * straight-line code between a state and its parent. When we arrive at an + * equivalent state (jump target or such) we didn't arrive by the straight-line + * code, so read marks in the state must propagate to the parent regardless + * of the state's write marks. That's what 'parent == state->parent' comparison + * in mark_reg_read() and mark_stack_slot_read() is for. */ -static bool do_propagate_liveness(const struct bpf_verifier_state *state, - struct bpf_verifier_state *parent) +static int propagate_liveness(struct bpf_verifier_env *env, + const struct bpf_verifier_state *vstate, + struct bpf_verifier_state *vparent) { - bool writes = parent == state->parent; /* Observe write marks */ - bool touched = false; /* any changes made? */ - int i; + int i, frame, err = 0; + struct bpf_func_state *state, *parent; - if (!parent) - return touched; + if (vparent->curframe != vstate->curframe) { + WARN(1, "propagate_live: parent frame %d current frame %d\n", + vparent->curframe, vstate->curframe); + return -EFAULT; + } /* Propagate read liveness of registers... */ BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG); /* We don't need to worry about FP liveness because it's read-only */ for (i = 0; i < BPF_REG_FP; i++) { - if (parent->regs[i].live & REG_LIVE_READ) + if (vparent->frame[vparent->curframe]->regs[i].live & REG_LIVE_READ) continue; - if (writes && (state->regs[i].live & REG_LIVE_WRITTEN)) - continue; - if (state->regs[i].live & REG_LIVE_READ) { - parent->regs[i].live |= REG_LIVE_READ; - touched = true; + if (vstate->frame[vstate->curframe]->regs[i].live & REG_LIVE_READ) { + err = mark_reg_read(env, vstate, vparent, i); + if (err) + return err; } } + /* ... and stack slots */ - for (i = 0; i < state->allocated_stack / BPF_REG_SIZE && - i < parent->allocated_stack / BPF_REG_SIZE; i++) { - if (parent->stack[i].slot_type[0] != STACK_SPILL) - continue; - if (state->stack[i].slot_type[0] != STACK_SPILL) - continue; - if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ) - continue; - if (writes && - (state->stack[i].spilled_ptr.live & REG_LIVE_WRITTEN)) - continue; - if (state->stack[i].spilled_ptr.live & REG_LIVE_READ) { - parent->stack[i].spilled_ptr.live |= REG_LIVE_READ; - touched = true; + for (frame = 0; frame <= vstate->curframe; frame++) { + state = vstate->frame[frame]; + parent = vparent->frame[frame]; + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE && + i < parent->allocated_stack / BPF_REG_SIZE; i++) { + if (parent->stack[i].slot_type[0] != STACK_SPILL) + continue; + if (state->stack[i].slot_type[0] != STACK_SPILL) + continue; + if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ) + continue; + if (state->stack[i].spilled_ptr.live & REG_LIVE_READ) + mark_stack_slot_read(env, vstate, vparent, i, frame); } } - return touched; -} - -/* "parent" is "a state from which we reach the current state", but initially - * it is not the state->parent (i.e. "the state whose straight-line code leads - * to the current state"), instead it is the state that happened to arrive at - * a (prunable) equivalent of the current state. See comment above - * do_propagate_liveness() for consequences of this. - * This function is just a more efficient way of calling mark_reg_read() or - * mark_stack_slot_read() on each reg in "parent" that is read in "state", - * though it requires that parent != state->parent in the call arguments. - */ -static void propagate_liveness(const struct bpf_verifier_state *state, - struct bpf_verifier_state *parent) -{ - while (do_propagate_liveness(state, parent)) { - /* Something changed, so we need to feed those changes onward */ - state = parent; - parent = state->parent; - } + return err; } static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) @@ -3822,7 +4182,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) struct bpf_verifier_state_list *new_sl; struct bpf_verifier_state_list *sl; struct bpf_verifier_state *cur = env->cur_state; - int i, err; + int i, j, err; sl = env->explored_states[insn_idx]; if (!sl) @@ -3843,7 +4203,9 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * they'll be immediately forgotten as we're pruning * this state and will pop a new one. */ - propagate_liveness(&sl->state, cur); + err = propagate_liveness(env, &sl->state, cur); + if (err) + return err; return 1; } sl = sl->next; @@ -3851,9 +4213,10 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) /* there were no equivalent states, remember current one. * technically the current state is not proven to be safe yet, - * but it will either reach bpf_exit (which means it's safe) or - * it will be rejected. Since there are no loops, we won't be - * seeing this 'insn_idx' instruction again on the way to bpf_exit + * but it will either reach outer most bpf_exit (which means it's safe) + * or it will be rejected. Since there are no loops, we won't be + * seeing this tuple (frame[0].callsite, frame[1].callsite, .. insn_idx) + * again on the way to bpf_exit */ new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL); if (!new_sl) @@ -3877,10 +4240,16 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * explored_states can get read marks.) */ for (i = 0; i < BPF_REG_FP; i++) - cur->regs[i].live = REG_LIVE_NONE; - for (i = 0; i < cur->allocated_stack / BPF_REG_SIZE; i++) - if (cur->stack[i].slot_type[0] == STACK_SPILL) - cur->stack[i].spilled_ptr.live = REG_LIVE_NONE; + cur->frame[cur->curframe]->regs[i].live = REG_LIVE_NONE; + + /* all stack frames are accessible from callee, clear them all */ + for (j = 0; j <= cur->curframe; j++) { + struct bpf_func_state *frame = cur->frame[j]; + + for (i = 0; i < frame->allocated_stack / BPF_REG_SIZE; i++) + if (frame->stack[i].slot_type[0] == STACK_SPILL) + frame->stack[i].spilled_ptr.live = REG_LIVE_NONE; + } return 0; } @@ -3898,7 +4267,7 @@ static int do_check(struct bpf_verifier_env *env) struct bpf_verifier_state *state; struct bpf_insn *insns = env->prog->insnsi; struct bpf_reg_state *regs; - int insn_cnt = env->prog->len; + int insn_cnt = env->prog->len, i; int insn_idx, prev_insn_idx = 0; int insn_processed = 0; bool do_print_state = false; @@ -3906,9 +4275,18 @@ static int do_check(struct bpf_verifier_env *env) state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL); if (!state) return -ENOMEM; - env->cur_state = state; - init_reg_state(env, state->regs); + state->curframe = 0; state->parent = NULL; + state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL); + if (!state->frame[0]) { + kfree(state); + return -ENOMEM; + } + env->cur_state = state; + init_func_state(env, state->frame[0], + BPF_MAIN_FUNC /* callsite */, + 0 /* frameno */, + 0 /* subprogno, zero == main subprog */); insn_idx = 0; for (;;) { struct bpf_insn *insn; @@ -3955,7 +4333,7 @@ static int do_check(struct bpf_verifier_env *env) else verbose(env, "\nfrom %d to %d:", prev_insn_idx, insn_idx); - print_verifier_state(env, state); + print_verifier_state(env, state->frame[state->curframe]); do_print_state = false; } @@ -4088,13 +4466,17 @@ static int do_check(struct bpf_verifier_env *env) if (opcode == BPF_CALL) { if (BPF_SRC(insn->code) != BPF_K || insn->off != 0 || - insn->src_reg != BPF_REG_0 || + (insn->src_reg != BPF_REG_0 && + insn->src_reg != BPF_PSEUDO_CALL) || insn->dst_reg != BPF_REG_0) { verbose(env, "BPF_CALL uses reserved fields\n"); return -EINVAL; } - err = check_call(env, insn->imm, insn_idx); + if (insn->src_reg == BPF_PSEUDO_CALL) + err = check_func_call(env, insn, &insn_idx); + else + err = check_helper_call(env, insn->imm, insn_idx); if (err) return err; @@ -4119,6 +4501,16 @@ static int do_check(struct bpf_verifier_env *env) return -EINVAL; } + if (state->curframe) { + /* exit from nested function */ + prev_insn_idx = insn_idx; + err = prepare_func_exit(env, &insn_idx); + if (err) + return err; + do_print_state = true; + continue; + } + /* eBPF calling convetion is such that R0 is used * to return the value from eBPF program. * Make sure that it's readable at this time @@ -4179,8 +4571,16 @@ process_bpf_exit: insn_idx++; } - verbose(env, "processed %d insns, stack depth %d\n", insn_processed, - env->prog->aux->stack_depth); + verbose(env, "processed %d insns, stack depth ", insn_processed); + for (i = 0; i < env->subprog_cnt + 1; i++) { + u32 depth = env->subprog_stack_depth[i]; + + verbose(env, "%d", depth); + if (i + 1 < env->subprog_cnt + 1) + verbose(env, "+"); + } + verbose(env, "\n"); + env->prog->aux->stack_depth = env->subprog_stack_depth[0]; return 0; } -- cgit v1.2.2 From cc2b14d51053eb055c06f45e1a5cdbfcf2b79e94 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 14 Dec 2017 17:55:08 -0800 Subject: bpf: teach verifier to recognize zero initialized stack programs with function calls are often passing various pointers via stack. When all calls are inlined llvm flattens stack accesses and optimizes away extra branches. When functions are not inlined it becomes the job of the verifier to recognize zero initialized stack to avoid exploring paths that program will not take. The following program would fail otherwise: ptr = &buffer_on_stack; *ptr = 0; ... func_call(.., ptr, ...) { if (..) *ptr = bpf_map_lookup(); } ... if (*ptr != 0) { // Access (*ptr)->field is valid. // Without stack_zero tracking such (*ptr)->field access // will be rejected } since stack slots are no longer uniform invalid | spill | misc add liveness marking to all slots, but do it in 8 byte chunks. So if nothing was read or written in [fp-16, fp-9] range it will be marked as LIVE_NONE. If any byte in that range was read, it will be marked LIVE_READ and stacksafe() check will perform byte-by-byte verification. If all bytes in the range were written the slot will be marked as LIVE_WRITTEN. This significantly speeds up state equality comparison and reduces total number of states processed. before after bpf_lb-DLB_L3.o 2051 2003 bpf_lb-DLB_L4.o 3287 3164 bpf_lb-DUNKNOWN.o 1080 1080 bpf_lxc-DDROP_ALL.o 24980 12361 bpf_lxc-DUNKNOWN.o 34308 16605 bpf_netdev.o 15404 10962 bpf_overlay.o 7191 6679 Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 129 +++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 101 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f6e09d84a96f..df1ded3faf1d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -311,6 +311,8 @@ static void print_verifier_state(struct bpf_verifier_env *env, verbose(env, "=%s", reg_type_str[state->stack[i].spilled_ptr.type]); } + if (state->stack[i].slot_type[0] == STACK_ZERO) + verbose(env, " fp%d=0", (-i - 1) * BPF_REG_SIZE); } verbose(env, "\n"); } @@ -522,6 +524,13 @@ static void __mark_reg_known_zero(struct bpf_reg_state *reg) __mark_reg_known(reg, 0); } +static void __mark_reg_const_zero(struct bpf_reg_state *reg) +{ + __mark_reg_known(reg, 0); + reg->off = 0; + reg->type = SCALAR_VALUE; +} + static void mark_reg_known_zero(struct bpf_verifier_env *env, struct bpf_reg_state *regs, u32 regno) { @@ -937,6 +946,12 @@ static bool is_spillable_regtype(enum bpf_reg_type type) } } +/* Does this register contain a constant zero? */ +static bool register_is_null(struct bpf_reg_state *reg) +{ + return reg->type == SCALAR_VALUE && tnum_equals_const(reg->var_off, 0); +} + /* check_stack_read/write functions track spill/fill of registers, * stack boundary and alignment are checked in check_mem_access() */ @@ -984,12 +999,30 @@ static int check_stack_write(struct bpf_verifier_env *env, for (i = 0; i < BPF_REG_SIZE; i++) state->stack[spi].slot_type[i] = STACK_SPILL; } else { + u8 type = STACK_MISC; + /* regular write of data into stack */ state->stack[spi].spilled_ptr = (struct bpf_reg_state) {}; + /* only mark the slot as written if all 8 bytes were written + * otherwise read propagation may incorrectly stop too soon + * when stack slots are partially written. + * This heuristic means that read propagation will be + * conservative, since it will add reg_live_read marks + * to stack slots all the way to first state when programs + * writes+reads less than 8 bytes + */ + if (size == BPF_REG_SIZE) + state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; + + /* when we zero initialize stack slots mark them as such */ + if (value_regno >= 0 && + register_is_null(&cur->regs[value_regno])) + type = STACK_ZERO; + for (i = 0; i < size; i++) state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] = - STACK_MISC; + type; } return 0; } @@ -1030,6 +1063,14 @@ static void mark_stack_slot_read(struct bpf_verifier_env *env, bool writes = parent == state->parent; /* Observe write marks */ while (parent) { + if (parent->frame[frameno]->allocated_stack <= slot * BPF_REG_SIZE) + /* since LIVE_WRITTEN mark is only done for full 8-byte + * write the read marks are conservative and parent + * state may not even have the stack allocated. In such case + * end the propagation, since the loop reached beginning + * of the function + */ + break; /* if read wasn't screened by an earlier write ... */ if (writes && state->frame[frameno]->stack[slot].spilled_ptr.live & REG_LIVE_WRITTEN) break; @@ -1077,21 +1118,38 @@ static int check_stack_read(struct bpf_verifier_env *env, * which resets stack/reg liveness for state transitions */ state->regs[value_regno].live |= REG_LIVE_WRITTEN; - mark_stack_slot_read(env, vstate, vstate->parent, spi, - reg_state->frameno); } + mark_stack_slot_read(env, vstate, vstate->parent, spi, + reg_state->frameno); return 0; } else { + int zeros = 0; + for (i = 0; i < size; i++) { - if (stype[(slot - i) % BPF_REG_SIZE] != STACK_MISC) { - verbose(env, "invalid read from stack off %d+%d size %d\n", - off, i, size); - return -EACCES; + if (stype[(slot - i) % BPF_REG_SIZE] == STACK_MISC) + continue; + if (stype[(slot - i) % BPF_REG_SIZE] == STACK_ZERO) { + zeros++; + continue; } + verbose(env, "invalid read from stack off %d+%d size %d\n", + off, i, size); + return -EACCES; + } + mark_stack_slot_read(env, vstate, vstate->parent, spi, + reg_state->frameno); + if (value_regno >= 0) { + if (zeros == size) { + /* any size read into register is zero extended, + * so the whole register == const_zero + */ + __mark_reg_const_zero(&state->regs[value_regno]); + } else { + /* have read misc data from the stack */ + mark_reg_unknown(env, state->regs, value_regno); + } + state->regs[value_regno].live |= REG_LIVE_WRITTEN; } - if (value_regno >= 0) - /* have read misc data from the stack */ - mark_reg_unknown(env, state->regs, value_regno); return 0; } } @@ -1578,12 +1636,6 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins BPF_SIZE(insn->code), BPF_WRITE, -1); } -/* Does this register contain a constant zero? */ -static bool register_is_null(struct bpf_reg_state *reg) -{ - return reg->type == SCALAR_VALUE && tnum_equals_const(reg->var_off, 0); -} - /* when register 'regno' is passed into function that will read 'access_size' * bytes from that pointer, make sure that it's within stack boundary * and all elements of stack are initialized. @@ -1633,15 +1685,30 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, } for (i = 0; i < access_size; i++) { + u8 *stype; + slot = -(off + i) - 1; spi = slot / BPF_REG_SIZE; - if (state->allocated_stack <= slot || - state->stack[spi].slot_type[slot % BPF_REG_SIZE] != - STACK_MISC) { - verbose(env, "invalid indirect read from stack off %d+%d size %d\n", - off, i, access_size); - return -EACCES; + if (state->allocated_stack <= slot) + goto err; + stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE]; + if (*stype == STACK_MISC) + goto mark; + if (*stype == STACK_ZERO) { + /* helper can write anything into the stack */ + *stype = STACK_MISC; + goto mark; } +err: + verbose(env, "invalid indirect read from stack off %d+%d size %d\n", + off, i, access_size); + return -EACCES; +mark: + /* reading any byte out of 8-byte 'spill_slot' will cause + * the whole slot to be marked as 'read' + */ + mark_stack_slot_read(env, env->cur_state, env->cur_state->parent, + spi, state->frameno); } return update_stack_depth(env, state, off); } @@ -4022,8 +4089,19 @@ static bool stacksafe(struct bpf_func_state *old, for (i = 0; i < old->allocated_stack; i++) { spi = i / BPF_REG_SIZE; + if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ)) + /* explored state didn't use this */ + return true; + if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID) continue; + /* if old state was safe with misc data in the stack + * it will be safe with zero-initialized stack. + * The opposite is not true + */ + if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_MISC && + cur->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_ZERO) + continue; if (old->stack[spi].slot_type[i % BPF_REG_SIZE] != cur->stack[spi].slot_type[i % BPF_REG_SIZE]) /* Ex: old explored (safe) state has STACK_SPILL in @@ -4164,10 +4242,6 @@ static int propagate_liveness(struct bpf_verifier_env *env, parent = vparent->frame[frame]; for (i = 0; i < state->allocated_stack / BPF_REG_SIZE && i < parent->allocated_stack / BPF_REG_SIZE; i++) { - if (parent->stack[i].slot_type[0] != STACK_SPILL) - continue; - if (state->stack[i].slot_type[0] != STACK_SPILL) - continue; if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ) continue; if (state->stack[i].spilled_ptr.live & REG_LIVE_READ) @@ -4247,8 +4321,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) struct bpf_func_state *frame = cur->frame[j]; for (i = 0; i < frame->allocated_stack / BPF_REG_SIZE; i++) - if (frame->stack[i].slot_type[0] == STACK_SPILL) - frame->stack[i].spilled_ptr.live = REG_LIVE_NONE; + frame->stack[i].spilled_ptr.live = REG_LIVE_NONE; } return 0; } -- cgit v1.2.2 From 1ea47e01ad6ea0fe99697c54c2413d81dd21fe32 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 14 Dec 2017 17:55:13 -0800 Subject: bpf: add support for bpf_call to interpreter though bpf_call is still the same call instruction and calling convention 'bpf to bpf' and 'bpf to helper' is the same the interpreter has to oparate on 'struct bpf_insn *'. To distinguish these two cases add a kernel internal opcode and mark call insns with it. This opcode is seen by interpreter only. JITs will never see it. Also add tiny bit of debug code to aid interpreter debugging. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- kernel/bpf/core.c | 90 +++++++++++++++++++++++++++++++++++++++++---------- kernel/bpf/verifier.c | 36 +++++++++++++++++++++ 2 files changed, 109 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index d32bebf4f2de..dc12c4fd006e 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -217,30 +217,40 @@ int bpf_prog_calc_tag(struct bpf_prog *fp) return 0; } -static bool bpf_is_jmp_and_has_target(const struct bpf_insn *insn) -{ - return BPF_CLASS(insn->code) == BPF_JMP && - /* Call and Exit are both special jumps with no - * target inside the BPF instruction image. - */ - BPF_OP(insn->code) != BPF_CALL && - BPF_OP(insn->code) != BPF_EXIT; -} - static void bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta) { struct bpf_insn *insn = prog->insnsi; u32 i, insn_cnt = prog->len; + bool pseudo_call; + u8 code; + int off; for (i = 0; i < insn_cnt; i++, insn++) { - if (!bpf_is_jmp_and_has_target(insn)) + code = insn->code; + if (BPF_CLASS(code) != BPF_JMP) continue; + if (BPF_OP(code) == BPF_EXIT) + continue; + if (BPF_OP(code) == BPF_CALL) { + if (insn->src_reg == BPF_PSEUDO_CALL) + pseudo_call = true; + else + continue; + } else { + pseudo_call = false; + } + off = pseudo_call ? insn->imm : insn->off; /* Adjust offset of jmps if we cross boundaries. */ - if (i < pos && i + insn->off + 1 > pos) - insn->off += delta; - else if (i > pos + delta && i + insn->off + 1 <= pos + delta) - insn->off -= delta; + if (i < pos && i + off + 1 > pos) + off += delta; + else if (i > pos + delta && i + off + 1 <= pos + delta) + off -= delta; + + if (pseudo_call) + insn->imm = off; + else + insn->off = off; } } @@ -774,8 +784,7 @@ EXPORT_SYMBOL_GPL(__bpf_call_base); * * Decode and execute eBPF instructions. */ -static unsigned int ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, - u64 *stack) +static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) { u64 tmp; static const void *jumptable[256] = { @@ -835,6 +844,7 @@ static unsigned int ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, [BPF_ALU64 | BPF_NEG] = &&ALU64_NEG, /* Call instruction */ [BPF_JMP | BPF_CALL] = &&JMP_CALL, + [BPF_JMP | BPF_CALL_ARGS] = &&JMP_CALL_ARGS, [BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL, /* Jumps */ [BPF_JMP | BPF_JA] = &&JMP_JA, @@ -1025,6 +1035,13 @@ select_insn: BPF_R4, BPF_R5); CONT; + JMP_CALL_ARGS: + BPF_R0 = (__bpf_call_base_args + insn->imm)(BPF_R1, BPF_R2, + BPF_R3, BPF_R4, + BPF_R5, + insn + insn->off + 1); + CONT; + JMP_TAIL_CALL: { struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2; struct bpf_array *array = container_of(map, struct bpf_array, map); @@ -1297,6 +1314,23 @@ static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn return ___bpf_prog_run(regs, insn, stack); \ } +#define PROG_NAME_ARGS(stack_size) __bpf_prog_run_args##stack_size +#define DEFINE_BPF_PROG_RUN_ARGS(stack_size) \ +static u64 PROG_NAME_ARGS(stack_size)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, \ + const struct bpf_insn *insn) \ +{ \ + u64 stack[stack_size / sizeof(u64)]; \ + u64 regs[MAX_BPF_REG]; \ +\ + FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \ + BPF_R1 = r1; \ + BPF_R2 = r2; \ + BPF_R3 = r3; \ + BPF_R4 = r4; \ + BPF_R5 = r5; \ + return ___bpf_prog_run(regs, insn, stack); \ +} + #define EVAL1(FN, X) FN(X) #define EVAL2(FN, X, Y...) FN(X) EVAL1(FN, Y) #define EVAL3(FN, X, Y...) FN(X) EVAL2(FN, Y) @@ -1308,6 +1342,10 @@ EVAL6(DEFINE_BPF_PROG_RUN, 32, 64, 96, 128, 160, 192); EVAL6(DEFINE_BPF_PROG_RUN, 224, 256, 288, 320, 352, 384); EVAL4(DEFINE_BPF_PROG_RUN, 416, 448, 480, 512); +EVAL6(DEFINE_BPF_PROG_RUN_ARGS, 32, 64, 96, 128, 160, 192); +EVAL6(DEFINE_BPF_PROG_RUN_ARGS, 224, 256, 288, 320, 352, 384); +EVAL4(DEFINE_BPF_PROG_RUN_ARGS, 416, 448, 480, 512); + #define PROG_NAME_LIST(stack_size) PROG_NAME(stack_size), static unsigned int (*interpreters[])(const void *ctx, @@ -1316,6 +1354,24 @@ EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192) EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384) EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) }; +#undef PROG_NAME_LIST +#define PROG_NAME_LIST(stack_size) PROG_NAME_ARGS(stack_size), +static u64 (*interpreters_args[])(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, + const struct bpf_insn *insn) = { +EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192) +EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384) +EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) +}; +#undef PROG_NAME_LIST + +void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth) +{ + stack_depth = max_t(u32, stack_depth, 1); + insn->off = (s16) insn->imm; + insn->imm = interpreters_args[(round_up(stack_depth, 32) / 32) - 1] - + __bpf_call_base_args; + insn->code = BPF_JMP | BPF_CALL_ARGS; +} bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index df1ded3faf1d..cdc1f043c69b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1458,6 +1458,21 @@ static int update_stack_depth(struct bpf_verifier_env *env, return 0; } +static int get_callee_stack_depth(struct bpf_verifier_env *env, + const struct bpf_insn *insn, int idx) +{ + int start = idx + insn->imm + 1, subprog; + + subprog = find_subprog(env, start); + if (subprog < 0) { + WARN_ONCE(1, "verifier bug. No program starts at insn %d\n", + start); + return -EFAULT; + } + subprog++; + return env->subprog_stack_depth[subprog]; +} + /* check whether memory at (regno + off) is accessible for t = (read | write) * if t==write, value_regno is a register which value is stored into memory * if t==read, value_regno is a register which will receive the value from memory @@ -4997,6 +5012,24 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) return 0; } +static int fixup_call_args(struct bpf_verifier_env *env) +{ + struct bpf_prog *prog = env->prog; + struct bpf_insn *insn = prog->insnsi; + int i, depth; + + for (i = 0; i < prog->len; i++, insn++) { + if (insn->code != (BPF_JMP | BPF_CALL) || + insn->src_reg != BPF_PSEUDO_CALL) + continue; + depth = get_callee_stack_depth(env, insn, i); + if (depth < 0) + return depth; + bpf_patch_call_args(insn, depth); + } + return 0; +} + /* fixup insn->imm field of bpf_call instructions * and inline eligible helpers as explicit sequence of BPF instructions * @@ -5225,6 +5258,9 @@ skip_full_check: if (ret == 0) ret = fixup_bpf_calls(env); + if (ret == 0) + ret = fixup_call_args(env); + if (log->level && bpf_verifier_log_full(log)) ret = -ENOSPC; if (log->level && !log->ubuf) { -- cgit v1.2.2 From 60b58afc96c9df71871df2dbad42037757ceef26 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 14 Dec 2017 17:55:14 -0800 Subject: bpf: fix net.core.bpf_jit_enable race global bpf_jit_enable variable is tested multiple times in JITs, blinding and verifier core. The malicious root can try to toggle it while loading the programs. This race condition was accounted for and there should be no issues, but it's safer to avoid this race condition. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- kernel/bpf/core.c | 3 ++- kernel/bpf/verifier.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index dc12c4fd006e..bda911644b1c 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -94,6 +94,7 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) fp->pages = size / PAGE_SIZE; fp->aux = aux; fp->aux->prog = fp; + fp->jit_requested = ebpf_jit_enabled(); INIT_LIST_HEAD_RCU(&fp->aux->ksym_lnode); @@ -721,7 +722,7 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) struct bpf_insn *insn; int i, rewritten; - if (!bpf_jit_blinding_enabled()) + if (!bpf_jit_blinding_enabled(prog)) return prog; clone = bpf_prog_clone_create(prog, GFP_USER); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index cdc1f043c69b..8e0e4cd0d5e4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5080,7 +5080,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) /* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup * handlers are currently limited to 64 bit only. */ - if (ebpf_jit_enabled() && BITS_PER_LONG == 64 && + if (prog->jit_requested && BITS_PER_LONG == 64 && insn->imm == BPF_FUNC_map_lookup_elem) { map_ptr = env->insn_aux_data[i + delta].map_ptr; if (map_ptr == BPF_MAP_PTR_POISON || -- cgit v1.2.2 From 1c2a088a6626d4f51d2f2c97b0cbedbfbf3637f6 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 14 Dec 2017 17:55:15 -0800 Subject: bpf: x64: add JIT support for multi-function programs Typical JIT does several passes over bpf instructions to compute total size and relative offsets of jumps and calls. With multitple bpf functions calling each other all relative calls will have invalid offsets intially therefore we need to additional last pass over the program to emit calls with correct offsets. For example in case of three bpf functions: main: call foo call bpf_map_lookup exit foo: call bar exit bar: exit We will call bpf_int_jit_compile() indepedently for main(), foo() and bar() x64 JIT typically does 4-5 passes to converge. After these initial passes the image for these 3 functions will be good except call targets, since start addresses of foo() and bar() are unknown when we were JITing main() (note that call bpf_map_lookup will be resolved properly during initial passes). Once start addresses of 3 functions are known we patch call_insn->imm to point to right functions and call bpf_int_jit_compile() again which needs only one pass. Additional safety checks are done to make sure this last pass doesn't produce image that is larger or smaller than previous pass. When constant blinding is on it's applied to all functions at the first pass, since doing it once again at the last pass can change size of the JITed code. Tested on x64 and arm64 hw with JIT on/off, blinding on/off. x64 jits bpf-to-bpf calls correctly while arm64 falls back to interpreter. All other JITs that support normal BPF_CALL will behave the same way since bpf-to-bpf call is equivalent to bpf-to-kernel call from JITs point of view. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- kernel/bpf/core.c | 13 +++++- kernel/bpf/syscall.c | 3 +- kernel/bpf/verifier.c | 126 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 139 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index bda911644b1c..768e0a02d8c8 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -722,7 +722,7 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) struct bpf_insn *insn; int i, rewritten; - if (!bpf_jit_blinding_enabled(prog)) + if (!bpf_jit_blinding_enabled(prog) || prog->blinded) return prog; clone = bpf_prog_clone_create(prog, GFP_USER); @@ -764,6 +764,7 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) i += insn_delta; } + clone->blinded = 1; return clone; } #endif /* CONFIG_BPF_JIT */ @@ -1629,11 +1630,19 @@ int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array, static void bpf_prog_free_deferred(struct work_struct *work) { struct bpf_prog_aux *aux; + int i; aux = container_of(work, struct bpf_prog_aux, work); if (bpf_prog_is_dev_bound(aux)) bpf_prog_offload_destroy(aux->prog); - bpf_jit_free(aux->prog); + for (i = 0; i < aux->func_cnt; i++) + bpf_jit_free(aux->func[i]); + if (aux->func_cnt) { + kfree(aux->func); + bpf_prog_unlock_free(aux->prog); + } else { + bpf_jit_free(aux->prog); + } } /* Free internal BPF program */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2c4cfeaa8d5e..e2e1c78ce1dc 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1194,7 +1194,8 @@ static int bpf_prog_load(union bpf_attr *attr) goto free_used_maps; /* eBPF program is ready to be JITed */ - prog = bpf_prog_select_runtime(prog, &err); + if (!prog->bpf_func) + prog = bpf_prog_select_runtime(prog, &err); if (err < 0) goto free_used_maps; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8e0e4cd0d5e4..48b2901cf483 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5012,12 +5012,138 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) return 0; } +static int jit_subprogs(struct bpf_verifier_env *env) +{ + struct bpf_prog *prog = env->prog, **func, *tmp; + int i, j, subprog_start, subprog_end = 0, len, subprog; + struct bpf_insn *insn = prog->insnsi; + void *old_bpf_func; + int err = -ENOMEM; + + if (env->subprog_cnt == 0) + return 0; + + for (i = 0; i < prog->len; i++, insn++) { + if (insn->code != (BPF_JMP | BPF_CALL) || + insn->src_reg != BPF_PSEUDO_CALL) + continue; + subprog = find_subprog(env, i + insn->imm + 1); + if (subprog < 0) { + WARN_ONCE(1, "verifier bug. No program starts at insn %d\n", + i + insn->imm + 1); + return -EFAULT; + } + /* temporarily remember subprog id inside insn instead of + * aux_data, since next loop will split up all insns into funcs + */ + insn->off = subprog + 1; + /* remember original imm in case JIT fails and fallback + * to interpreter will be needed + */ + env->insn_aux_data[i].call_imm = insn->imm; + /* point imm to __bpf_call_base+1 from JITs point of view */ + insn->imm = 1; + } + + func = kzalloc(sizeof(prog) * (env->subprog_cnt + 1), GFP_KERNEL); + if (!func) + return -ENOMEM; + + for (i = 0; i <= env->subprog_cnt; i++) { + subprog_start = subprog_end; + if (env->subprog_cnt == i) + subprog_end = prog->len; + else + subprog_end = env->subprog_starts[i]; + + len = subprog_end - subprog_start; + func[i] = bpf_prog_alloc(bpf_prog_size(len), GFP_USER); + if (!func[i]) + goto out_free; + memcpy(func[i]->insnsi, &prog->insnsi[subprog_start], + len * sizeof(struct bpf_insn)); + func[i]->len = len; + func[i]->is_func = 1; + /* Use bpf_prog_F_tag to indicate functions in stack traces. + * Long term would need debug info to populate names + */ + func[i]->aux->name[0] = 'F'; + func[i]->aux->stack_depth = env->subprog_stack_depth[i]; + func[i]->jit_requested = 1; + func[i] = bpf_int_jit_compile(func[i]); + if (!func[i]->jited) { + err = -ENOTSUPP; + goto out_free; + } + cond_resched(); + } + /* at this point all bpf functions were successfully JITed + * now populate all bpf_calls with correct addresses and + * run last pass of JIT + */ + for (i = 0; i <= env->subprog_cnt; i++) { + insn = func[i]->insnsi; + for (j = 0; j < func[i]->len; j++, insn++) { + if (insn->code != (BPF_JMP | BPF_CALL) || + insn->src_reg != BPF_PSEUDO_CALL) + continue; + subprog = insn->off; + insn->off = 0; + insn->imm = (u64 (*)(u64, u64, u64, u64, u64)) + func[subprog]->bpf_func - + __bpf_call_base; + } + } + for (i = 0; i <= env->subprog_cnt; i++) { + old_bpf_func = func[i]->bpf_func; + tmp = bpf_int_jit_compile(func[i]); + if (tmp != func[i] || func[i]->bpf_func != old_bpf_func) { + verbose(env, "JIT doesn't support bpf-to-bpf calls\n"); + err = -EFAULT; + goto out_free; + } + cond_resched(); + } + + /* finally lock prog and jit images for all functions and + * populate kallsysm + */ + for (i = 0; i <= env->subprog_cnt; i++) { + bpf_prog_lock_ro(func[i]); + bpf_prog_kallsyms_add(func[i]); + } + prog->jited = 1; + prog->bpf_func = func[0]->bpf_func; + prog->aux->func = func; + prog->aux->func_cnt = env->subprog_cnt + 1; + return 0; +out_free: + for (i = 0; i <= env->subprog_cnt; i++) + if (func[i]) + bpf_jit_free(func[i]); + kfree(func); + /* cleanup main prog to be interpreted */ + prog->jit_requested = 0; + for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { + if (insn->code != (BPF_JMP | BPF_CALL) || + insn->src_reg != BPF_PSEUDO_CALL) + continue; + insn->off = 0; + insn->imm = env->insn_aux_data[i].call_imm; + } + return err; +} + static int fixup_call_args(struct bpf_verifier_env *env) { struct bpf_prog *prog = env->prog; struct bpf_insn *insn = prog->insnsi; int i, depth; + if (env->prog->jit_requested) + if (jit_subprogs(env) == 0) + return 0; + for (i = 0; i < prog->len; i++, insn++) { if (insn->code != (BPF_JMP | BPF_CALL) || insn->src_reg != BPF_PSEUDO_CALL) -- cgit v1.2.2 From 46df3d209db080395a98fc0875bd05e45e8f44e0 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 15 Dec 2017 21:42:57 -0500 Subject: trace: reenable preemption if we modify the ip Things got moved around between the original bpf_override_return patches and the final version, and now the ftrace kprobe dispatcher assumes if you modified the ip that you also enabled preemption. Make a comment of this and enable preemption, this fixes the lockdep splat that happened when using this feature. Fixes: 9802d86585db ("bpf: add a bpf_override_function helper") Signed-off-by: Josef Bacik Signed-off-by: Daniel Borkmann --- kernel/trace/trace_kprobe.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 5db849809a56..91f4b57dab82 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1322,8 +1322,15 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) if (tk->tp.flags & TP_FLAG_TRACE) kprobe_trace_func(tk, regs); #ifdef CONFIG_PERF_EVENTS - if (tk->tp.flags & TP_FLAG_PROFILE) + if (tk->tp.flags & TP_FLAG_PROFILE) { ret = kprobe_perf_func(tk, regs); + /* + * The ftrace kprobe handler leaves it up to us to re-enable + * preemption here before returning if we've modified the ip. + */ + if (ret) + preempt_enable_no_resched(); + } #endif return ret; } -- cgit v1.2.2 From e90004d56bf805db7d4401fe51a80a93c311cfe6 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 18 Dec 2017 14:03:12 +0000 Subject: bpf: fix spelling mistake: "funcation"-> "function" Trivial fix to spelling mistake in error message text. Signed-off-by: Colin Ian King Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 48b2901cf483..56ef21c71bd3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -772,7 +772,7 @@ static int check_subprogs(struct bpf_verifier_env *env) return -EPERM; } if (bpf_prog_is_dev_bound(env->prog->aux)) { - verbose(env, "funcation calls in offloaded programs are not supported yet\n"); + verbose(env, "function calls in offloaded programs are not supported yet\n"); return -EINVAL; } ret = add_subprog(env, i + insn[i].imm + 1); -- cgit v1.2.2 From fa2d41adb953235c4acaa98f6c980fd9eabe0062 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 18 Dec 2017 17:47:07 +0000 Subject: bpf: make function skip_callee static and return NULL rather than 0 Function skip_callee is local to the source and does not need to be in global scope, so make it static. Also return NULL rather than 0. Cleans up two sparse warnings: symbol 'skip_callee' was not declared. Should it be static? Using plain integer as NULL pointer Signed-off-by: Colin Ian King Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 56ef21c71bd3..52c9b32d58bf 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -823,6 +823,7 @@ next: return 0; } +static struct bpf_verifier_state *skip_callee(struct bpf_verifier_env *env, const struct bpf_verifier_state *state, struct bpf_verifier_state *parent, @@ -867,7 +868,7 @@ bug: verbose(env, "verifier bug regno %d tmp %p\n", regno, tmp); verbose(env, "regno %d parent frame %d current frame %d\n", regno, parent->curframe, state->curframe); - return 0; + return NULL; } static int mark_reg_read(struct bpf_verifier_env *env, -- cgit v1.2.2 From 06ef0ccb5a36e1feba9b413ff59a04ecc4407c1c Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 18 Dec 2017 10:13:44 -0800 Subject: bpf/cgroup: fix a verification error for a CGROUP_DEVICE type prog The tools/testing/selftests/bpf test program test_dev_cgroup fails with the following error when compiled with llvm 6.0. (I did not try with earlier versions.) libbpf: load bpf program failed: Permission denied libbpf: -- BEGIN DUMP LOG --- libbpf: 0: (61) r2 = *(u32 *)(r1 +4) 1: (b7) r0 = 0 2: (55) if r2 != 0x1 goto pc+8 R0=inv0 R1=ctx(id=0,off=0,imm=0) R2=inv1 R10=fp0 3: (69) r2 = *(u16 *)(r1 +0) invalid bpf_context access off=0 size=2 ... The culprit is the following statement in dev_cgroup.c: short type = ctx->access_type & 0xFFFF; This code is typical as the ctx->access_type is assigned as below in kernel/bpf/cgroup.c: struct bpf_cgroup_dev_ctx ctx = { .access_type = (access << 16) | dev_type, .major = major, .minor = minor, }; The compiler converts it to u16 access while the verifier cgroup_dev_is_valid_access rejects any non u32 access. This patch permits the field access_type to be accessible with type u16 and u8 as well. Signed-off-by: Yonghong Song Tested-by: Roman Gushchin Signed-off-by: Daniel Borkmann --- kernel/bpf/cgroup.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index b789ab78d28f..c1c0b60d3f2f 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -568,6 +568,8 @@ static bool cgroup_dev_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { + const int size_default = sizeof(__u32); + if (type == BPF_WRITE) return false; @@ -576,8 +578,17 @@ static bool cgroup_dev_is_valid_access(int off, int size, /* The verifier guarantees that size > 0. */ if (off % size != 0) return false; - if (size != sizeof(__u32)) - return false; + + switch (off) { + case bpf_ctx_range(struct bpf_cgroup_dev_ctx, access_type): + bpf_ctx_record_field_size(info, size_default); + if (!bpf_ctx_narrow_access_ok(off, size, size_default)) + return false; + break; + default: + if (size != size_default) + return false; + } return true; } -- cgit v1.2.2 From ffd2e8df8d138e7436e218e0a9d3447a18b888e6 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 1 Dec 2017 11:50:33 -0600 Subject: resource: Set type of "reserve=" user-specified resources When we reserve regions because the user specified a "reserve=" parameter, set the resource type to either IORESOURCE_IO (for regions below 0x10000) or IORESOURCE_MEM. The test for 0x10000 is just a heuristic; obviously there can be memory below 0x10000 as well. Improve documentation of the "reserve=" parameter. Signed-off-by: Bjorn Helgaas --- kernel/resource.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 54ba6de3757c..ba3252f7c319 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1478,7 +1478,7 @@ void __devm_release_region(struct device *dev, struct resource *parent, EXPORT_SYMBOL(__devm_release_region); /* - * Called from init/main.c to reserve IO ports. + * Reserve I/O ports or memory based on "reserve=" kernel parameter. */ #define MAXRESERVE 4 static int __init reserve_setup(char *str) @@ -1489,26 +1489,38 @@ static int __init reserve_setup(char *str) for (;;) { unsigned int io_start, io_num; int x = reserved; + struct resource *parent; - if (get_option (&str, &io_start) != 2) + if (get_option(&str, &io_start) != 2) break; - if (get_option (&str, &io_num) == 0) + if (get_option(&str, &io_num) == 0) break; if (x < MAXRESERVE) { struct resource *res = reserve + x; + + /* + * If the region starts below 0x10000, we assume it's + * I/O port space; otherwise assume it's memory. + */ + if (io_start < 0x10000) { + res->flags = IORESOURCE_IO; + parent = &ioport_resource; + } else { + res->flags = IORESOURCE_MEM; + parent = &iomem_resource; + } res->name = "reserved"; res->start = io_start; res->end = io_start + io_num - 1; - res->flags = IORESOURCE_BUSY; + res->flags |= IORESOURCE_BUSY; res->desc = IORES_DESC_NONE; res->child = NULL; - if (request_resource(res->start >= 0x10000 ? &iomem_resource : &ioport_resource, res) == 0) + if (request_resource(parent, res) == 0) reserved = x+1; } } return 1; } - __setup("reserve=", reserve_setup); /* -- cgit v1.2.2 From f37e2334bca5c5653ad15cadc7d68c78324b956b Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 1 Dec 2017 14:07:18 -0600 Subject: resource: Set type when reserving new regions Set resource structs inserted by __reserve_region_with_split() to have the correct type. Setting the type doesn't fix any functional problem but makes %pR on the resource work better. Signed-off-by: Bjorn Helgaas --- kernel/resource.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index ba3252f7c319..8c527d83ca76 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1022,6 +1022,7 @@ static void __init __reserve_region_with_split(struct resource *root, struct resource *conflict; struct resource *res = alloc_resource(GFP_ATOMIC); struct resource *next_res = NULL; + int type = resource_type(root); if (!res) return; @@ -1029,7 +1030,7 @@ static void __init __reserve_region_with_split(struct resource *root, res->name = name; res->start = start; res->end = end; - res->flags = IORESOURCE_BUSY; + res->flags = type | IORESOURCE_BUSY; res->desc = IORES_DESC_NONE; while (1) { @@ -1064,7 +1065,7 @@ static void __init __reserve_region_with_split(struct resource *root, next_res->name = name; next_res->start = conflict->end + 1; next_res->end = end; - next_res->flags = IORESOURCE_BUSY; + next_res->flags = type | IORESOURCE_BUSY; next_res->desc = IORES_DESC_NONE; } } else { -- cgit v1.2.2 From 4f74d80971bce93d9e608c40324d662c70eb4664 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 20 Dec 2017 13:42:56 +0100 Subject: bpf: fix kallsyms handling for subprogs Right now kallsyms handling is not working with JITed subprogs. The reason is that when in 1c2a088a6626 ("bpf: x64: add JIT support for multi-function programs") in jit_subprogs() they are passed to bpf_prog_kallsyms_add(), then their prog type is 0, which BPF core will think it's a cBPF program as only cBPF programs have a 0 type. Thus, they need to inherit the type from the main prog. Once that is fixed, they are indeed added to the BPF kallsyms infra, but their tag is 0. Therefore, since intention is to add them as bpf_prog_F_, we need to pass them to bpf_prog_calc_tag() first. And once this is resolved, there is a use-after-free on prog cleanup: we remove the kallsyms entry from the main prog, later walk all subprogs and call bpf_jit_free() on them. However, the kallsyms linkage was never released on them. Thus, do that for all subprogs right in __bpf_prog_put() when refcount hits 0. Fixes: 1c2a088a6626 ("bpf: x64: add JIT support for multi-function programs") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 6 ++++++ kernel/bpf/verifier.c | 3 +++ 2 files changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e2e1c78ce1dc..30e728dcd35d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -937,10 +937,16 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu) static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) { if (atomic_dec_and_test(&prog->aux->refcnt)) { + int i; + trace_bpf_prog_put_rcu(prog); /* bpf_prog_free_id() must be called first */ bpf_prog_free_id(prog, do_idr_lock); + + for (i = 0; i < prog->aux->func_cnt; i++) + bpf_prog_kallsyms_del(prog->aux->func[i]); bpf_prog_kallsyms_del(prog); + call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); } } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 52c9b32d58bf..3c3eec58b3e8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5063,7 +5063,10 @@ static int jit_subprogs(struct bpf_verifier_env *env) goto out_free; memcpy(func[i]->insnsi, &prog->insnsi[subprog_start], len * sizeof(struct bpf_insn)); + func[i]->type = prog->type; func[i]->len = len; + if (bpf_prog_calc_tag(func[i])) + goto out_free; func[i]->is_func = 1; /* Use bpf_prog_F_tag to indicate functions in stack traces. * Long term would need debug info to populate names -- cgit v1.2.2 From 7105e828c087de970fcb5a9509db51bfe6bd7894 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 20 Dec 2017 13:42:57 +0100 Subject: bpf: allow for correlation of maps and helpers in dump Currently a dump of an xlated prog (post verifier stage) doesn't correlate used helpers as well as maps. The prog info lists involved map ids, however there's no correlation of where in the program they are used as of today. Likewise, bpftool does not correlate helper calls with the target functions. The latter can be done w/o any kernel changes through kallsyms, and also has the advantage that this works with inlined helpers and BPF calls. Example, via interpreter: # tc filter show dev foo ingress filter protocol all pref 49152 bpf chain 0 filter protocol all pref 49152 bpf chain 0 handle 0x1 foo.o:[ingress] \ direct-action not_in_hw id 1 tag c74773051b364165 <-- prog id:1 * Output before patch (calls/maps remain unclear): # bpftool prog dump xlated id 1 <-- dump prog id:1 0: (b7) r1 = 2 1: (63) *(u32 *)(r10 -4) = r1 2: (bf) r2 = r10 3: (07) r2 += -4 4: (18) r1 = 0xffff95c47a8d4800 6: (85) call unknown#73040 7: (15) if r0 == 0x0 goto pc+18 8: (bf) r2 = r10 9: (07) r2 += -4 10: (bf) r1 = r0 11: (85) call unknown#73040 12: (15) if r0 == 0x0 goto pc+23 [...] * Output after patch: # bpftool prog dump xlated id 1 0: (b7) r1 = 2 1: (63) *(u32 *)(r10 -4) = r1 2: (bf) r2 = r10 3: (07) r2 += -4 4: (18) r1 = map[id:2] <-- map id:2 6: (85) call bpf_map_lookup_elem#73424 <-- helper call 7: (15) if r0 == 0x0 goto pc+18 8: (bf) r2 = r10 9: (07) r2 += -4 10: (bf) r1 = r0 11: (85) call bpf_map_lookup_elem#73424 12: (15) if r0 == 0x0 goto pc+23 [...] # bpftool map show id 2 <-- show/dump/etc map id:2 2: hash_of_maps flags 0x0 key 4B value 4B max_entries 3 memlock 4096B Example, JITed, same prog: # tc filter show dev foo ingress filter protocol all pref 49152 bpf chain 0 filter protocol all pref 49152 bpf chain 0 handle 0x1 foo.o:[ingress] \ direct-action not_in_hw id 3 tag c74773051b364165 jited # bpftool prog show id 3 3: sched_cls tag c74773051b364165 loaded_at Dec 19/13:48 uid 0 xlated 384B jited 257B memlock 4096B map_ids 2 # bpftool prog dump xlated id 3 0: (b7) r1 = 2 1: (63) *(u32 *)(r10 -4) = r1 2: (bf) r2 = r10 3: (07) r2 += -4 4: (18) r1 = map[id:2] <-- map id:2 6: (85) call __htab_map_lookup_elem#77408 <-+ inlined rewrite 7: (15) if r0 == 0x0 goto pc+2 | 8: (07) r0 += 56 | 9: (79) r0 = *(u64 *)(r0 +0) <-+ 10: (15) if r0 == 0x0 goto pc+24 11: (bf) r2 = r10 12: (07) r2 += -4 [...] Example, same prog, but kallsyms disabled (in that case we are also not allowed to pass any relative offsets, etc, so prog becomes pointer sanitized on dump): # sysctl kernel.kptr_restrict=2 kernel.kptr_restrict = 2 # bpftool prog dump xlated id 3 0: (b7) r1 = 2 1: (63) *(u32 *)(r10 -4) = r1 2: (bf) r2 = r10 3: (07) r2 += -4 4: (18) r1 = map[id:2] 6: (85) call bpf_unspec#0 7: (15) if r0 == 0x0 goto pc+2 [...] Example, BPF calls via interpreter: # bpftool prog dump xlated id 1 0: (85) call pc+2#__bpf_prog_run_args32 1: (b7) r0 = 1 2: (95) exit 3: (b7) r0 = 2 4: (95) exit Example, BPF calls via JIT: # sysctl net.core.bpf_jit_enable=1 net.core.bpf_jit_enable = 1 # sysctl net.core.bpf_jit_kallsyms=1 net.core.bpf_jit_kallsyms = 1 # bpftool prog dump xlated id 1 0: (85) call pc+2#bpf_prog_3b185187f1855c4c_F 1: (b7) r0 = 1 2: (95) exit 3: (b7) r0 = 2 4: (95) exit And finally, an example for tail calls that is now working as well wrt correlation: # bpftool prog dump xlated id 2 [...] 10: (b7) r2 = 8 11: (85) call bpf_trace_printk#-41312 12: (bf) r1 = r6 13: (18) r2 = map[id:1] 15: (b7) r3 = 0 16: (85) call bpf_tail_call#12 17: (b7) r1 = 42 18: (6b) *(u16 *)(r6 +46) = r1 19: (b7) r0 = 0 20: (95) exit # bpftool map show id 1 1: prog_array flags 0x0 key 4B value 4B max_entries 1 memlock 4096B Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 4 ++- kernel/bpf/disasm.c | 65 +++++++++++++++++++++++++++++++------- kernel/bpf/disasm.h | 29 ++++++++++++++--- kernel/bpf/syscall.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++++--- kernel/bpf/verifier.c | 30 +++++++++++++++--- 5 files changed, 189 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 768e0a02d8c8..70a534549cd3 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -771,7 +771,9 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) /* Base function for offset calculation. Needs to go into .text section, * therefore keeping it non-static as well; will also be used by JITs - * anyway later on, so do not let the compiler omit it. + * anyway later on, so do not let the compiler omit it. This also needs + * to go into kallsyms for correlation from e.g. bpftool, so naming + * must not change. */ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) { diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index 883f88fa5bfc..8740406df2cd 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -21,10 +21,39 @@ static const char * const func_id_str[] = { }; #undef __BPF_FUNC_STR_FN -const char *func_id_name(int id) +static const char *__func_get_name(const struct bpf_insn_cbs *cbs, + const struct bpf_insn *insn, + char *buff, size_t len) { BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID); + if (insn->src_reg != BPF_PSEUDO_CALL && + insn->imm >= 0 && insn->imm < __BPF_FUNC_MAX_ID && + func_id_str[insn->imm]) + return func_id_str[insn->imm]; + + if (cbs && cbs->cb_call) + return cbs->cb_call(cbs->private_data, insn); + + if (insn->src_reg == BPF_PSEUDO_CALL) + snprintf(buff, len, "%+d", insn->imm); + + return buff; +} + +static const char *__func_imm_name(const struct bpf_insn_cbs *cbs, + const struct bpf_insn *insn, + u64 full_imm, char *buff, size_t len) +{ + if (cbs && cbs->cb_imm) + return cbs->cb_imm(cbs->private_data, insn, full_imm); + + snprintf(buff, len, "0x%llx", (unsigned long long)full_imm); + return buff; +} + +const char *func_id_name(int id) +{ if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id]) return func_id_str[id]; else @@ -83,7 +112,7 @@ static const char *const bpf_jmp_string[16] = { [BPF_EXIT >> 4] = "exit", }; -static void print_bpf_end_insn(bpf_insn_print_cb verbose, +static void print_bpf_end_insn(bpf_insn_print_t verbose, struct bpf_verifier_env *env, const struct bpf_insn *insn) { @@ -92,9 +121,12 @@ static void print_bpf_end_insn(bpf_insn_print_cb verbose, insn->imm, insn->dst_reg); } -void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env, - const struct bpf_insn *insn, bool allow_ptr_leaks) +void print_bpf_insn(const struct bpf_insn_cbs *cbs, + struct bpf_verifier_env *env, + const struct bpf_insn *insn, + bool allow_ptr_leaks) { + const bpf_insn_print_t verbose = cbs->cb_print; u8 class = BPF_CLASS(insn->code); if (class == BPF_ALU || class == BPF_ALU64) { @@ -175,12 +207,15 @@ void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env, */ u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD; + char tmp[64]; if (map_ptr && !allow_ptr_leaks) imm = 0; - verbose(env, "(%02x) r%d = 0x%llx\n", insn->code, - insn->dst_reg, (unsigned long long)imm); + verbose(env, "(%02x) r%d = %s\n", + insn->code, insn->dst_reg, + __func_imm_name(cbs, insn, imm, + tmp, sizeof(tmp))); } else { verbose(env, "BUG_ld_%02x\n", insn->code); return; @@ -189,12 +224,20 @@ void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env, u8 opcode = BPF_OP(insn->code); if (opcode == BPF_CALL) { - if (insn->src_reg == BPF_PSEUDO_CALL) - verbose(env, "(%02x) call pc%+d\n", insn->code, - insn->imm); - else + char tmp[64]; + + if (insn->src_reg == BPF_PSEUDO_CALL) { + verbose(env, "(%02x) call pc%s\n", + insn->code, + __func_get_name(cbs, insn, + tmp, sizeof(tmp))); + } else { + strcpy(tmp, "unknown"); verbose(env, "(%02x) call %s#%d\n", insn->code, - func_id_name(insn->imm), insn->imm); + __func_get_name(cbs, insn, + tmp, sizeof(tmp)), + insn->imm); + } } else if (insn->code == (BPF_JMP | BPF_JA)) { verbose(env, "(%02x) goto pc%+d\n", insn->code, insn->off); diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h index 8de977e420b6..e0857d016f89 100644 --- a/kernel/bpf/disasm.h +++ b/kernel/bpf/disasm.h @@ -17,16 +17,35 @@ #include #include #include +#ifndef __KERNEL__ +#include +#include +#endif + +struct bpf_verifier_env; extern const char *const bpf_alu_string[16]; extern const char *const bpf_class_string[8]; const char *func_id_name(int id); -struct bpf_verifier_env; -typedef void (*bpf_insn_print_cb)(struct bpf_verifier_env *env, - const char *, ...); -void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env, - const struct bpf_insn *insn, bool allow_ptr_leaks); +typedef void (*bpf_insn_print_t)(struct bpf_verifier_env *env, + const char *, ...); +typedef const char *(*bpf_insn_revmap_call_t)(void *private_data, + const struct bpf_insn *insn); +typedef const char *(*bpf_insn_print_imm_t)(void *private_data, + const struct bpf_insn *insn, + __u64 full_imm); + +struct bpf_insn_cbs { + bpf_insn_print_t cb_print; + bpf_insn_revmap_call_t cb_call; + bpf_insn_print_imm_t cb_imm; + void *private_data; +}; +void print_bpf_insn(const struct bpf_insn_cbs *cbs, + struct bpf_verifier_env *env, + const struct bpf_insn *insn, + bool allow_ptr_leaks); #endif diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 30e728dcd35d..007802c5ca7d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1558,6 +1558,67 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr) return fd; } +static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog, + unsigned long addr) +{ + int i; + + for (i = 0; i < prog->aux->used_map_cnt; i++) + if (prog->aux->used_maps[i] == (void *)addr) + return prog->aux->used_maps[i]; + return NULL; +} + +static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog) +{ + const struct bpf_map *map; + struct bpf_insn *insns; + u64 imm; + int i; + + insns = kmemdup(prog->insnsi, bpf_prog_insn_size(prog), + GFP_USER); + if (!insns) + return insns; + + for (i = 0; i < prog->len; i++) { + if (insns[i].code == (BPF_JMP | BPF_TAIL_CALL)) { + insns[i].code = BPF_JMP | BPF_CALL; + insns[i].imm = BPF_FUNC_tail_call; + /* fall-through */ + } + if (insns[i].code == (BPF_JMP | BPF_CALL) || + insns[i].code == (BPF_JMP | BPF_CALL_ARGS)) { + if (insns[i].code == (BPF_JMP | BPF_CALL_ARGS)) + insns[i].code = BPF_JMP | BPF_CALL; + if (!bpf_dump_raw_ok()) + insns[i].imm = 0; + continue; + } + + if (insns[i].code != (BPF_LD | BPF_IMM | BPF_DW)) + continue; + + imm = ((u64)insns[i + 1].imm << 32) | (u32)insns[i].imm; + map = bpf_map_from_imm(prog, imm); + if (map) { + insns[i].src_reg = BPF_PSEUDO_MAP_FD; + insns[i].imm = map->id; + insns[i + 1].imm = 0; + continue; + } + + if (!bpf_dump_raw_ok() && + imm == (unsigned long)prog->aux) { + insns[i].imm = 0; + insns[i + 1].imm = 0; + continue; + } + } + + return insns; +} + static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, const union bpf_attr *attr, union bpf_attr __user *uattr) @@ -1608,18 +1669,34 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, ulen = info.jited_prog_len; info.jited_prog_len = prog->jited_len; if (info.jited_prog_len && ulen) { - uinsns = u64_to_user_ptr(info.jited_prog_insns); - ulen = min_t(u32, info.jited_prog_len, ulen); - if (copy_to_user(uinsns, prog->bpf_func, ulen)) - return -EFAULT; + if (bpf_dump_raw_ok()) { + uinsns = u64_to_user_ptr(info.jited_prog_insns); + ulen = min_t(u32, info.jited_prog_len, ulen); + if (copy_to_user(uinsns, prog->bpf_func, ulen)) + return -EFAULT; + } else { + info.jited_prog_insns = 0; + } } ulen = info.xlated_prog_len; info.xlated_prog_len = bpf_prog_insn_size(prog); if (info.xlated_prog_len && ulen) { + struct bpf_insn *insns_sanitized; + bool fault; + + if (prog->blinded && !bpf_dump_raw_ok()) { + info.xlated_prog_insns = 0; + goto done; + } + insns_sanitized = bpf_insn_prepare_dump(prog); + if (!insns_sanitized) + return -ENOMEM; uinsns = u64_to_user_ptr(info.xlated_prog_insns); ulen = min_t(u32, info.xlated_prog_len, ulen); - if (copy_to_user(uinsns, prog->insnsi, ulen)) + fault = copy_to_user(uinsns, insns_sanitized, ulen); + kfree(insns_sanitized); + if (fault) return -EFAULT; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3c3eec58b3e8..4ae46b2cba88 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4427,9 +4427,12 @@ static int do_check(struct bpf_verifier_env *env) } if (env->log.level) { + const struct bpf_insn_cbs cbs = { + .cb_print = verbose, + }; + verbose(env, "%d: ", insn_idx); - print_bpf_insn(verbose, env, insn, - env->allow_ptr_leaks); + print_bpf_insn(&cbs, env, insn, env->allow_ptr_leaks); } err = ext_analyzer_insn_hook(env, insn_idx, prev_insn_idx); @@ -5017,14 +5020,14 @@ static int jit_subprogs(struct bpf_verifier_env *env) { struct bpf_prog *prog = env->prog, **func, *tmp; int i, j, subprog_start, subprog_end = 0, len, subprog; - struct bpf_insn *insn = prog->insnsi; + struct bpf_insn *insn; void *old_bpf_func; int err = -ENOMEM; if (env->subprog_cnt == 0) return 0; - for (i = 0; i < prog->len; i++, insn++) { + for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { if (insn->code != (BPF_JMP | BPF_CALL) || insn->src_reg != BPF_PSEUDO_CALL) continue; @@ -5116,6 +5119,25 @@ static int jit_subprogs(struct bpf_verifier_env *env) bpf_prog_lock_ro(func[i]); bpf_prog_kallsyms_add(func[i]); } + + /* Last step: make now unused interpreter insns from main + * prog consistent for later dump requests, so they can + * later look the same as if they were interpreted only. + */ + for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { + unsigned long addr; + + if (insn->code != (BPF_JMP | BPF_CALL) || + insn->src_reg != BPF_PSEUDO_CALL) + continue; + insn->off = env->insn_aux_data[i].call_imm; + subprog = find_subprog(env, i + insn->off + 1); + addr = (unsigned long)func[subprog + 1]->bpf_func; + addr &= PAGE_MASK; + insn->imm = (u64 (*)(u64, u64, u64, u64, u64)) + addr - __bpf_call_base; + } + prog->jited = 1; prog->bpf_func = func[0]->bpf_func; prog->aux->func = func; -- cgit v1.2.2 From fd05e57bb35ad5eb7e261b64e5cf46445250f842 Mon Sep 17 00:00:00 2001 From: Gianluca Borello Date: Sat, 23 Dec 2017 10:09:55 +0000 Subject: bpf: fix stacksafe exploration when comparing states Commit cc2b14d51053 ("bpf: teach verifier to recognize zero initialized stack") introduced a very relaxed check when comparing stacks of different states, effectively returning a positive result in many cases where it shouldn't. This can create problems in cases such as this following C pseudocode: long var; long *x = bpf_map_lookup(...); if (!x) return; if (*x != 0xbeef) var = 0; else var = 1; /* This is the key part, calling a helper causes an explored state * to be saved with the information that "var" is on the stack as * STACK_ZERO, since the helper is first met by the verifier after * the "var = 0" assignment. This state will however be wrongly used * also for the "var = 1" case, so the verifier assumes "var" is always * 0 and will replace the NULL assignment with nops, because the * search pruning prevents it from exploring the faulty branch. */ bpf_ktime_get_ns(); if (var) *(long *)0 = 0xbeef; Fix the issue by making sure that the stack is fully explored before returning a positive comparison result. Also attach a couple tests that highlight the bad behavior. In the first test, without this fix instructions 16 and 17 are replaced with nops instead of being rejected by the verifier. The second test, instead, allows a program to make a potentially illegal read from the stack. Fixes: cc2b14d51053 ("bpf: teach verifier to recognize zero initialized stack") Signed-off-by: Gianluca Borello Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4ae46b2cba88..82ae580440b8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4107,7 +4107,7 @@ static bool stacksafe(struct bpf_func_state *old, if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ)) /* explored state didn't use this */ - return true; + continue; if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID) continue; -- cgit v1.2.2 From 70a87ffea8acc390ae315fa1930e849f656bdb88 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 25 Dec 2017 13:15:40 -0800 Subject: bpf: fix maximum stack depth tracking logic Instead of computing max stack depth for current call chain during the main verifier pass track stack depth of each function independently and after do_check() is done do another pass over all instructions analyzing depth of all possible call stacks. Fixes: f4d7e40a5b71 ("bpf: introduce function calls (verification)") Reported-by: Jann Horn Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 82 +++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 66 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 82ae580440b8..738e919efdf0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1430,33 +1430,80 @@ static int update_stack_depth(struct bpf_verifier_env *env, const struct bpf_func_state *func, int off) { - u16 stack = env->subprog_stack_depth[func->subprogno], total = 0; - struct bpf_verifier_state *cur = env->cur_state; - int i; + u16 stack = env->subprog_stack_depth[func->subprogno]; if (stack >= -off) return 0; /* update known max for given subprogram */ env->subprog_stack_depth[func->subprogno] = -off; + return 0; +} - /* compute the total for current call chain */ - for (i = 0; i <= cur->curframe; i++) { - u32 depth = env->subprog_stack_depth[cur->frame[i]->subprogno]; - - /* round up to 32-bytes, since this is granularity - * of interpreter stack sizes - */ - depth = round_up(depth, 32); - total += depth; - } +/* starting from main bpf function walk all instructions of the function + * and recursively walk all callees that given function can call. + * Ignore jump and exit insns. + * Since recursion is prevented by check_cfg() this algorithm + * only needs a local stack of MAX_CALL_FRAMES to remember callsites + */ +static int check_max_stack_depth(struct bpf_verifier_env *env) +{ + int depth = 0, frame = 0, subprog = 0, i = 0, subprog_end; + struct bpf_insn *insn = env->prog->insnsi; + int insn_cnt = env->prog->len; + int ret_insn[MAX_CALL_FRAMES]; + int ret_prog[MAX_CALL_FRAMES]; - if (total > MAX_BPF_STACK) { +process_func: + /* round up to 32-bytes, since this is granularity + * of interpreter stack size + */ + depth += round_up(max_t(u32, env->subprog_stack_depth[subprog], 1), 32); + if (depth > MAX_BPF_STACK) { verbose(env, "combined stack size of %d calls is %d. Too large\n", - cur->curframe, total); + frame + 1, depth); return -EACCES; } - return 0; +continue_func: + if (env->subprog_cnt == subprog) + subprog_end = insn_cnt; + else + subprog_end = env->subprog_starts[subprog]; + for (; i < subprog_end; i++) { + if (insn[i].code != (BPF_JMP | BPF_CALL)) + continue; + if (insn[i].src_reg != BPF_PSEUDO_CALL) + continue; + /* remember insn and function to return to */ + ret_insn[frame] = i + 1; + ret_prog[frame] = subprog; + + /* find the callee */ + i = i + insn[i].imm + 1; + subprog = find_subprog(env, i); + if (subprog < 0) { + WARN_ONCE(1, "verifier bug. No program starts at insn %d\n", + i); + return -EFAULT; + } + subprog++; + frame++; + if (frame >= MAX_CALL_FRAMES) { + WARN_ONCE(1, "verifier bug. Call stack is too deep\n"); + return -EFAULT; + } + goto process_func; + } + /* end of for() loop means the last insn of the 'subprog' + * was reached. Doesn't matter whether it was JA or EXIT + */ + if (frame == 0) + return 0; + depth -= round_up(max_t(u32, env->subprog_stack_depth[subprog], 1), 32); + frame--; + i = ret_insn[frame]; + subprog = ret_prog[frame]; + goto continue_func; } static int get_callee_stack_depth(struct bpf_verifier_env *env, @@ -5403,6 +5450,9 @@ skip_full_check: if (ret == 0) sanitize_dead_code(env); + if (ret == 0) + ret = check_max_stack_depth(env); + if (ret == 0) /* program is valid, convert *(u32*)(ctx + off) accesses */ ret = convert_ctx_accesses(env); -- cgit v1.2.2 From aada9ce644e53410954daa6beb1f7c4ca158abd7 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 25 Dec 2017 13:15:42 -0800 Subject: bpf: fix max call depth check fix off by one error in max call depth check and add a test Fixes: f4d7e40a5b71 ("bpf: introduce function calls (verification)") Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 738e919efdf0..52ad60b3b8be 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2126,9 +2126,9 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, struct bpf_func_state *caller, *callee; int i, subprog, target_insn; - if (state->curframe >= MAX_CALL_FRAMES) { + if (state->curframe + 1 >= MAX_CALL_FRAMES) { verbose(env, "the call stack of %d frames is too deep\n", - state->curframe); + state->curframe + 2); return -E2BIG; } -- cgit v1.2.2 From e0d3974ac77b0d581b92affe5851fc40ad2f42a4 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 27 Dec 2017 18:39:03 -0800 Subject: bpf: offload: don't require rtnl for dev list manipulation We don't need the RTNL lock for all operations on offload state. We only need to hold it around ndo calls. The device offload initialization doesn't require it. The soon-to-come querying of the offload info will only need it partially. We will also be able to remove the waitqueue in following patches. Use struct rw_semaphore because map offload will require sleeping with the semaphore held for read. Suggested-by: Kirill Tkhai Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- kernel/bpf/offload.c | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 8455b89d1bbf..032079754d88 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -20,13 +20,16 @@ #include #include #include +#include -/* protected by RTNL */ +/* Protects bpf_prog_offload_devs and offload members of all progs. + * RTNL lock cannot be taken when holding this lock. + */ +static DECLARE_RWSEM(bpf_devs_lock); static LIST_HEAD(bpf_prog_offload_devs); int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) { - struct net *net = current->nsproxy->net_ns; struct bpf_dev_offload *offload; if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS && @@ -43,19 +46,26 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) offload->prog = prog; init_waitqueue_head(&offload->verifier_done); - rtnl_lock(); - offload->netdev = __dev_get_by_index(net, attr->prog_ifindex); - if (!offload->netdev) { - rtnl_unlock(); - kfree(offload); - return -EINVAL; - } + offload->netdev = dev_get_by_index(current->nsproxy->net_ns, + attr->prog_ifindex); + if (!offload->netdev) + goto err_free; + down_write(&bpf_devs_lock); + if (offload->netdev->reg_state != NETREG_REGISTERED) + goto err_unlock; prog->aux->offload = offload; list_add_tail(&offload->offloads, &bpf_prog_offload_devs); - rtnl_unlock(); + dev_put(offload->netdev); + up_write(&bpf_devs_lock); return 0; +err_unlock: + up_write(&bpf_devs_lock); + dev_put(offload->netdev); +err_free: + kfree(offload); + return -EINVAL; } static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd, @@ -126,7 +136,9 @@ void bpf_prog_offload_destroy(struct bpf_prog *prog) wake_up(&offload->verifier_done); rtnl_lock(); + down_write(&bpf_devs_lock); __bpf_prog_offload_destroy(prog); + up_write(&bpf_devs_lock); rtnl_unlock(); kfree(offload); @@ -181,11 +193,13 @@ static int bpf_offload_notification(struct notifier_block *notifier, if (netdev->reg_state != NETREG_UNREGISTERING) break; + down_write(&bpf_devs_lock); list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs, offloads) { if (offload->netdev == netdev) __bpf_prog_offload_destroy(offload->prog); } + up_write(&bpf_devs_lock); break; default: break; -- cgit v1.2.2 From 9a18eedb145d080d542766af1d7513ebfccd1604 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 27 Dec 2017 18:39:04 -0800 Subject: bpf: offload: don't use prog->aux->offload as boolean We currently use aux->offload to indicate that program is bound to a specific device. This forces us to keep the offload structure around even after the device is gone. Add a bool member to struct bpf_prog_aux to indicate if offload was requested. Suggested-by: Alexei Starovoitov Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/syscall.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 007802c5ca7d..e0afc2e39fd5 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1157,6 +1157,8 @@ static int bpf_prog_load(union bpf_attr *attr) if (!prog) return -ENOMEM; + prog->aux->offload_requested = !!attr->prog_ifindex; + err = security_bpf_prog_alloc(prog->aux); if (err) goto free_prog_nouncharge; @@ -1178,7 +1180,7 @@ static int bpf_prog_load(union bpf_attr *attr) atomic_set(&prog->aux->refcnt, 1); prog->gpl_compatible = is_gpl ? 1 : 0; - if (attr->prog_ifindex) { + if (bpf_prog_is_dev_bound(prog->aux)) { err = bpf_prog_offload_init(prog, attr); if (err) goto free_prog; -- cgit v1.2.2 From cae1927c0b4a93ae15de824faca1f6f611a44fcd Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 27 Dec 2017 18:39:05 -0800 Subject: bpf: offload: allow netdev to disappear while verifier is running To allow verifier instruction callbacks without any extra locking NETDEV_UNREGISTER notification would wait on a waitqueue for verifier to finish. This design decision was made when rtnl lock was providing all the locking. Use the read/write lock instead and remove the workqueue. Verifier will now call into the offload code, so dev_ops are moved to offload structure. Since verifier calls are all under bpf_prog_is_dev_bound() we no longer need static inline implementations to please builds with CONFIG_NET=n. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/offload.c | 30 ++++++++++++++++-------------- kernel/bpf/verifier.c | 20 +++++++------------- 2 files changed, 23 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 032079754d88..69ddc3899bab 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -44,7 +44,6 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) return -ENOMEM; offload->prog = prog; - init_waitqueue_head(&offload->verifier_done); offload->netdev = dev_get_by_index(current->nsproxy->net_ns, attr->prog_ifindex); @@ -97,15 +96,28 @@ int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env) if (err) goto exit_unlock; - env->dev_ops = data.verifier.ops; - + env->prog->aux->offload->dev_ops = data.verifier.ops; env->prog->aux->offload->dev_state = true; - env->prog->aux->offload->verifier_running = true; exit_unlock: rtnl_unlock(); return err; } +int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env, + int insn_idx, int prev_insn_idx) +{ + struct bpf_dev_offload *offload; + int ret = -ENODEV; + + down_read(&bpf_devs_lock); + offload = env->prog->aux->offload; + if (offload->netdev) + ret = offload->dev_ops->insn_hook(env, insn_idx, prev_insn_idx); + up_read(&bpf_devs_lock); + + return ret; +} + static void __bpf_prog_offload_destroy(struct bpf_prog *prog) { struct bpf_dev_offload *offload = prog->aux->offload; @@ -117,9 +129,6 @@ static void __bpf_prog_offload_destroy(struct bpf_prog *prog) data.offload.prog = prog; - if (offload->verifier_running) - wait_event(offload->verifier_done, !offload->verifier_running); - if (offload->dev_state) WARN_ON(__bpf_offload_ndo(prog, BPF_OFFLOAD_DESTROY, &data)); @@ -132,9 +141,6 @@ void bpf_prog_offload_destroy(struct bpf_prog *prog) { struct bpf_dev_offload *offload = prog->aux->offload; - offload->verifier_running = false; - wake_up(&offload->verifier_done); - rtnl_lock(); down_write(&bpf_devs_lock); __bpf_prog_offload_destroy(prog); @@ -146,15 +152,11 @@ void bpf_prog_offload_destroy(struct bpf_prog *prog) static int bpf_prog_offload_translate(struct bpf_prog *prog) { - struct bpf_dev_offload *offload = prog->aux->offload; struct netdev_bpf data = {}; int ret; data.offload.prog = prog; - offload->verifier_running = false; - wake_up(&offload->verifier_done); - rtnl_lock(); ret = __bpf_offload_ndo(prog, BPF_OFFLOAD_TRANSLATE, &data); rtnl_unlock(); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 98d8637cf70d..a2b211262c25 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4438,15 +4438,6 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) return 0; } -static int ext_analyzer_insn_hook(struct bpf_verifier_env *env, - int insn_idx, int prev_insn_idx) -{ - if (env->dev_ops && env->dev_ops->insn_hook) - return env->dev_ops->insn_hook(env, insn_idx, prev_insn_idx); - - return 0; -} - static int do_check(struct bpf_verifier_env *env) { struct bpf_verifier_state *state; @@ -4531,9 +4522,12 @@ static int do_check(struct bpf_verifier_env *env) print_bpf_insn(&cbs, env, insn, env->allow_ptr_leaks); } - err = ext_analyzer_insn_hook(env, insn_idx, prev_insn_idx); - if (err) - return err; + if (bpf_prog_is_dev_bound(env->prog->aux)) { + err = bpf_prog_offload_verify_insn(env, insn_idx, + prev_insn_idx); + if (err) + return err; + } regs = cur_regs(env); env->insn_aux_data[insn_idx].seen = true; @@ -5463,7 +5457,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) env->strict_alignment = true; - if (env->prog->aux->offload) { + if (bpf_prog_is_dev_bound(env->prog->aux)) { ret = bpf_prog_offload_verifier_prep(env); if (ret) goto err_unlock; -- cgit v1.2.2 From ce3b9db4db0e0e2b9761c56d08615ea0159e4a1b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 27 Dec 2017 18:39:06 -0800 Subject: bpf: offload: free prog->aux->offload when device disappears All bpf offload operations should now be under bpf_devs_lock, it's safe to free and clear the entire offload structure, not only the netdev pointer. __bpf_prog_offload_destroy() will no longer be called multiple times. Suggested-by: Alexei Starovoitov Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/offload.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 69ddc3899bab..3126e1a842e6 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -70,12 +70,14 @@ err_free: static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd, struct netdev_bpf *data) { - struct net_device *netdev = prog->aux->offload->netdev; + struct bpf_dev_offload *offload = prog->aux->offload; + struct net_device *netdev; ASSERT_RTNL(); - if (!netdev) + if (!offload) return -ENODEV; + netdev = offload->netdev; if (!netdev->netdev_ops->ndo_bpf) return -EOPNOTSUPP; @@ -111,7 +113,7 @@ int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env, down_read(&bpf_devs_lock); offload = env->prog->aux->offload; - if (offload->netdev) + if (offload) ret = offload->dev_ops->insn_hook(env, insn_idx, prev_insn_idx); up_read(&bpf_devs_lock); @@ -123,31 +125,24 @@ static void __bpf_prog_offload_destroy(struct bpf_prog *prog) struct bpf_dev_offload *offload = prog->aux->offload; struct netdev_bpf data = {}; - /* Caution - if netdev is destroyed before the program, this function - * will be called twice. - */ - data.offload.prog = prog; if (offload->dev_state) WARN_ON(__bpf_offload_ndo(prog, BPF_OFFLOAD_DESTROY, &data)); - offload->dev_state = false; list_del_init(&offload->offloads); - offload->netdev = NULL; + kfree(offload); + prog->aux->offload = NULL; } void bpf_prog_offload_destroy(struct bpf_prog *prog) { - struct bpf_dev_offload *offload = prog->aux->offload; - rtnl_lock(); down_write(&bpf_devs_lock); - __bpf_prog_offload_destroy(prog); + if (prog->aux->offload) + __bpf_prog_offload_destroy(prog); up_write(&bpf_devs_lock); rtnl_unlock(); - - kfree(offload); } static int bpf_prog_offload_translate(struct bpf_prog *prog) -- cgit v1.2.2 From ad8ad79f4f6078f456792f7f8d344da2be9bc74f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 27 Dec 2017 18:39:07 -0800 Subject: bpf: offload: free program id when device disappears Bound programs are quite useless after their device disappears. They are simply waiting for reference count to go to zero, don't list them in BPF_PROG_GET_NEXT_ID by freeing their ID early. Note that orphaned offload programs will return -ENODEV on BPF_OBJ_GET_INFO_BY_FD so user will never see ID 0. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/offload.c | 3 +++ kernel/bpf/syscall.c | 9 +++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 3126e1a842e6..e4f1668a021c 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -130,6 +130,9 @@ static void __bpf_prog_offload_destroy(struct bpf_prog *prog) if (offload->dev_state) WARN_ON(__bpf_offload_ndo(prog, BPF_OFFLOAD_DESTROY, &data)); + /* Make sure BPF_PROG_GET_NEXT_ID can't find this dead program */ + bpf_prog_free_id(prog, true); + list_del_init(&offload->offloads); kfree(offload); prog->aux->offload = NULL; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e0afc2e39fd5..e02dafa6f402 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -905,9 +905,13 @@ static int bpf_prog_alloc_id(struct bpf_prog *prog) return id > 0 ? 0 : id; } -static void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock) +void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock) { - /* cBPF to eBPF migrations are currently not in the idr store. */ + /* cBPF to eBPF migrations are currently not in the idr store. + * Offloaded programs are removed from the store when their device + * disappears - even if someone grabs an fd to them they are unusable, + * simply waiting for refcnt to drop to be freed. + */ if (!prog->aux->id) return; @@ -917,6 +921,7 @@ static void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock) __acquire(&prog_idr_lock); idr_remove(&prog_idr, prog->aux->id); + prog->aux->id = 0; if (do_idr_lock) spin_unlock_bh(&prog_idr_lock); -- cgit v1.2.2 From 675fc275a3a2d905535207237402c6d8dcb5fa4b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 27 Dec 2017 18:39:09 -0800 Subject: bpf: offload: report device information for offloaded programs Report to the user ifindex and namespace information of offloaded programs. If device has disappeared return -ENODEV. Specify the namespace using dev/inode combination. CC: Eric W. Biederman Signed-off-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- kernel/bpf/offload.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 6 ++++++ 2 files changed, 65 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index e4f1668a021c..040d4e0edf3f 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -16,9 +16,11 @@ #include #include #include +#include #include #include #include +#include #include #include @@ -176,6 +178,63 @@ int bpf_prog_offload_compile(struct bpf_prog *prog) return bpf_prog_offload_translate(prog); } +struct ns_get_path_bpf_prog_args { + struct bpf_prog *prog; + struct bpf_prog_info *info; +}; + +static struct ns_common *bpf_prog_offload_info_fill_ns(void *private_data) +{ + struct ns_get_path_bpf_prog_args *args = private_data; + struct bpf_prog_aux *aux = args->prog->aux; + struct ns_common *ns; + struct net *net; + + rtnl_lock(); + down_read(&bpf_devs_lock); + + if (aux->offload) { + args->info->ifindex = aux->offload->netdev->ifindex; + net = dev_net(aux->offload->netdev); + get_net(net); + ns = &net->ns; + } else { + args->info->ifindex = 0; + ns = NULL; + } + + up_read(&bpf_devs_lock); + rtnl_unlock(); + + return ns; +} + +int bpf_prog_offload_info_fill(struct bpf_prog_info *info, + struct bpf_prog *prog) +{ + struct ns_get_path_bpf_prog_args args = { + .prog = prog, + .info = info, + }; + struct inode *ns_inode; + struct path ns_path; + void *res; + + res = ns_get_path_cb(&ns_path, bpf_prog_offload_info_fill_ns, &args); + if (IS_ERR(res)) { + if (!info->ifindex) + return -ENODEV; + return PTR_ERR(res); + } + + ns_inode = ns_path.dentry->d_inode; + info->netns_dev = new_encode_dev(ns_inode->i_sb->s_dev); + info->netns_ino = ns_inode->i_ino; + path_put(&ns_path); + + return 0; +} + const struct bpf_prog_ops bpf_offload_prog_ops = { }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e02dafa6f402..ebf0fb23e237 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1707,6 +1707,12 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, return -EFAULT; } + if (bpf_prog_is_dev_bound(prog->aux)) { + err = bpf_prog_offload_info_fill(&info, prog); + if (err) + return err; + } + done: if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) -- cgit v1.2.2 From 82975c46da8275a2a212cd94049bbef9bb961da2 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Tue, 2 Jan 2018 11:25:26 +0000 Subject: perf: Export perf_event_update_userpage Export perf_event_update_userpage() so that PMU driver using them, can be built as modules. Acked-by: Peter Zilstra Signed-off-by: Suzuki K Poulose Signed-off-by: Will Deacon --- kernel/events/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 4df5b695bf0d..64ec2c9d2c44 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4904,6 +4904,7 @@ void perf_event_update_userpage(struct perf_event *event) unlock: rcu_read_unlock(); } +EXPORT_SYMBOL_GPL(perf_event_update_userpage); static int perf_mmap_fault(struct vm_fault *vmf) { -- cgit v1.2.2 From 0b44bf9a6f5cde099ae21b4aa94553484203769a Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 17 Aug 2017 15:45:38 -0500 Subject: signal: Simplify and fix kdb_send_sig - Rename from kdb_send_sig_info to kdb_send_sig As there is no meaningful siginfo sent - Use SEND_SIG_PRIV instead of generating a siginfo for a kdb signal. The generated siginfo had a bogus rationale and was not correct in the face of pid namespaces. SEND_SIG_PRIV is simpler and actually correct. - As the code grabs siglock just send the signal with siglock held instead of dropping siglock and attempting to grab it again. - Move the sig_valid test into kdb_kill where it can generate a good error message. Signed-off-by: Eric W. Biederman --- kernel/debug/kdb/kdb_main.c | 10 ++-------- kernel/debug/kdb/kdb_private.h | 2 +- kernel/signal.c | 14 +++++++------- 3 files changed, 10 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index c8146d53ca67..dbb0781a0533 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2441,7 +2441,6 @@ static int kdb_kill(int argc, const char **argv) long sig, pid; char *endp; struct task_struct *p; - struct siginfo info; if (argc != 2) return KDB_ARGCOUNT; @@ -2449,7 +2448,7 @@ static int kdb_kill(int argc, const char **argv) sig = simple_strtol(argv[1], &endp, 0); if (*endp) return KDB_BADINT; - if (sig >= 0) { + if ((sig >= 0) || !valid_signal(-sig)) { kdb_printf("Invalid signal parameter.<-signal>\n"); return 0; } @@ -2470,12 +2469,7 @@ static int kdb_kill(int argc, const char **argv) return 0; } p = p->group_leader; - info.si_signo = sig; - info.si_errno = 0; - info.si_code = SI_USER; - info.si_pid = pid; /* same capabilities as process being signalled */ - info.si_uid = 0; /* kdb has root authority */ - kdb_send_sig_info(p, &info); + kdb_send_sig(p, sig); return 0; } diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index fc224fbcf954..1e5a502ba4a7 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -208,7 +208,7 @@ extern unsigned long kdb_task_state(const struct task_struct *p, extern void kdb_ps_suppressed(void); extern void kdb_ps1(const struct task_struct *p); extern void kdb_print_nameval(const char *name, unsigned long val); -extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info); +extern void kdb_send_sig(struct task_struct *p, int sig); extern void kdb_meminfo_proc_show(void); extern char *kdb_getstr(char *, size_t, const char *); extern void kdb_gdb_state_pass(char *buf); diff --git a/kernel/signal.c b/kernel/signal.c index 9558664bd9ec..c894162ec96c 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3684,26 +3684,25 @@ void __init signals_init(void) #ifdef CONFIG_KGDB_KDB #include /* - * kdb_send_sig_info - Allows kdb to send signals without exposing + * kdb_send_sig - Allows kdb to send signals without exposing * signal internals. This function checks if the required locks are * available before calling the main signal code, to avoid kdb * deadlocks. */ -void -kdb_send_sig_info(struct task_struct *t, struct siginfo *info) +void kdb_send_sig(struct task_struct *t, int sig) { static struct task_struct *kdb_prev_t; - int sig, new_t; + int new_t, ret; if (!spin_trylock(&t->sighand->siglock)) { kdb_printf("Can't do kill command now.\n" "The sigmask lock is held somewhere else in " "kernel, try again later\n"); return; } - spin_unlock(&t->sighand->siglock); new_t = kdb_prev_t != t; kdb_prev_t = t; if (t->state != TASK_RUNNING && new_t) { + spin_unlock(&t->sighand->siglock); kdb_printf("Process is not RUNNING, sending a signal from " "kdb risks deadlock\n" "on the run queue locks. " @@ -3712,8 +3711,9 @@ kdb_send_sig_info(struct task_struct *t, struct siginfo *info) "the deadlock.\n"); return; } - sig = info->si_signo; - if (send_sig_info(sig, info, t)) + ret = send_signal(sig, SEND_SIG_PRIV, t, false); + spin_unlock(&t->sighand->siglock); + if (ret) kdb_printf("Fail to deliver Signal %d to process %d.\n", sig, t->pid); else -- cgit v1.2.2 From cca10d58d25d271f05e1115132b4c2d913bb652e Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Thu, 21 Dec 2017 14:41:49 +0900 Subject: printk: add console_msg_format command line option 0day and kernelCI automatically parse kernel log - basically some sort of grepping using the pre-defined text patterns - in order to detect and report regressions/errors. There are several sources they get the kernel logs from: a) dmesg or /proc/ksmg This is the preferred way. Because `dmesg --raw' (see later Note) and /proc/kmsg output contains facility and log level, which greatly simplifies grepping for EMERG/ALERT/CRIT/ERR messages. b) serial consoles This option is harder to maintain, because serial console messages don't contain facility and log level. This patch introduces a `console_msg_format=' command line option, to switch between different message formatting on serial consoles. For the time being we have just two options - default and syslog. The "default" option just keeps the existing format. While the "syslog" option makes serial console messages to appear in syslog format [syslog() syscall], matching the `dmesg -S --raw' and `cat /proc/kmsg' output formats: - facility and log level - time stamp (depends on printk_time/PRINTK_TIME) - message <%u>[time stamp] text\n NOTE: while Kevin and Fengguang talk about "dmesg --raw", it's actually "dmesg -S --raw" that always prints messages in syslog format [per Petr Mladek]. Running "dmesg --raw" may produce output in non-syslog format sometimes. console_msg_format=syslog enables syslog format, thus in documentation we mention "dmesg -S --raw", not "dmesg --raw". Per Kevin Hilman: : Right now we can get this info from a "dmesg --raw" after bootup, : but it would be really nice in certain automation frameworks to : have a kernel command-line option to enable printing of loglevels : in default boot log. : : This is especially useful when ingesting kernel logs into advanced : search/analytics frameworks (I'm playing with and ELK stack: Elastic : Search, Logstash, Kibana). : : The other important reason for having this on the command line is that : for testing linux-next (and other bleeding edge developer branches), : it's common that we never make it to userspace, so can't even run : "dmesg --raw" (or equivalent.) So we really want this on the primary : boot (serial) console. Per Fengguang Wu, 0day scripts should quickly benefit from that feature, because they will be able to switch to a more reliable parsing, based on messages' facility and log levels [1]: `#{grep} -a -E -e '^<[0123]>' -e '^kern :(err |crit |alert |emerg )' instead of doing text pattern matching `#{grep} -a -F -f /lkp/printk-error-messages #{kmsg_file} | grep -a -v -E -f #{LKP_SRC}/etc/oops-pattern | grep -a -v -F -f #{LKP_SRC}/etc/kmsg-blacklist` [1] https://github.com/fengguang/lkp-tests/blob/master/lib/dmesg.rb Link: http://lkml.kernel.org/r/20171221054149.4398-1-sergey.senozhatsky@gmail.com To: Steven Rostedt Cc: Linus Torvalds Cc: Fengguang Wu Cc: Kevin Hilman Cc: Mark Brown Cc: Greg Kroah-Hartman Cc: Andrew Morton Cc: LKML Signed-off-by: Sergey Senozhatsky Reviewed-by: Fengguang Wu Reviewed-by: Kevin Hilman Tested-by: Kevin Hilman Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 5d81206a572d..568729e0dc2c 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -277,6 +277,13 @@ EXPORT_SYMBOL(console_set_on_cmdline); /* Flag: console code may call schedule() */ static int console_may_schedule; +enum con_msg_format_flags { + MSG_FORMAT_DEFAULT = 0, + MSG_FORMAT_SYSLOG = (1 << 0), +}; + +static int console_msg_format = MSG_FORMAT_DEFAULT; + /* * The printk log buffer consists of a chain of concatenated variable * length records. Every record starts with a record header, containing @@ -1913,6 +1920,17 @@ static int __add_preferred_console(char *name, int idx, char *options, c->index = idx; return 0; } + +static int __init console_msg_format_setup(char *str) +{ + if (!strcmp(str, "syslog")) + console_msg_format = MSG_FORMAT_SYSLOG; + if (!strcmp(str, "default")) + console_msg_format = MSG_FORMAT_DEFAULT; + return 1; +} +__setup("console_msg_format=", console_msg_format_setup); + /* * Set up a console. Called via do_early_param() in init/main.c * for each "console=" parameter in the boot command line. @@ -2215,7 +2233,10 @@ skip: goto skip; } - len += msg_print_text(msg, false, text + len, sizeof(text) - len); + len += msg_print_text(msg, + console_msg_format & MSG_FORMAT_SYSLOG, + text + len, + sizeof(text) - len); if (nr_ext_console_drivers) { ext_len = msg_print_ext_header(ext_text, sizeof(ext_text), -- cgit v1.2.2 From c20a71a7a3841a6198721a0b4276f3298b38bbcf Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 3 Jan 2018 17:57:39 -0800 Subject: bpf: sockmap remove unused function This was added for some work that was eventually factored out but the helper call was missed. Remove it now and add it back later if needed. Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- kernel/bpf/sockmap.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 5ee2e41893d9..3f662ee23a34 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -96,14 +96,6 @@ static inline struct smap_psock *smap_psock_sk(const struct sock *sk) return rcu_dereference_sk_user_data(sk); } -/* compute the linear packet data range [data, data_end) for skb when - * sk_skb type programs are in use. - */ -static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb) -{ - TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb); -} - enum __sk_action { __SK_DROP = 0, __SK_PASS, -- cgit v1.2.2 From 5f103c5d4dbadec0f2cacd39b6429e1b8a8cf983 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 3 Jan 2018 17:57:56 -0800 Subject: bpf: only build sockmap with CONFIG_INET The sockmap infrastructure is only aware of TCP sockets at the moment. In the future we plan to add UDP. In both cases CONFIG_NET should be built-in. So lets only build sockmap if CONFIG_INET is enabled. Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- kernel/bpf/Makefile | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index e691da0b3bab..a713fd23ec88 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -9,9 +9,11 @@ obj-$(CONFIG_BPF_SYSCALL) += devmap.o obj-$(CONFIG_BPF_SYSCALL) += cpumap.o obj-$(CONFIG_BPF_SYSCALL) += offload.o ifeq ($(CONFIG_STREAM_PARSER),y) +ifeq ($(CONFIG_INET),y) obj-$(CONFIG_BPF_SYSCALL) += sockmap.o endif endif +endif ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o endif -- cgit v1.2.2 From 08b21fbf4bc4befbd77cd19422a81149ba8cda00 Mon Sep 17 00:00:00 2001 From: Cheah Kok Cheong Date: Thu, 21 Dec 2017 19:35:30 +0800 Subject: padata: add SPDX identifier Add SPDX license identifier according to the type of license text found in the file. Cc: Philippe Ombredanne Signed-off-by: Cheah Kok Cheong Acked-by: Steffen Klassert Signed-off-by: Herbert Xu --- kernel/padata.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index 57c0074d50cc..d568cc56405f 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * padata.c - generic interface to process data streams in parallel * -- cgit v1.2.2 From 3b14f08d169400a5a635b299a273db23d2be8e49 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 12 Dec 2017 16:34:53 +0900 Subject: irq debug: do not use print_symbol() print_symbol() is a very old API that has been obsoleted by %pS format specifier in a normal printk() call. Replace print_symbol() with a direct printk("%pS") call and avoid using continuous lines. Link: http://lkml.kernel.org/r/20171212073453.21455-1-sergey.senozhatsky@gmail.com To: Andrew Morton To: Russell King To: Catalin Marinas To: Mark Salter To: Tony Luck To: David Howells To: Yoshinori Sato To: Guan Xuetao To: Borislav Petkov To: Greg Kroah-Hartman To: Thomas Gleixner To: Peter Zijlstra To: Vineet Gupta To: Fengguang Wu To: David Laight Cc: Steven Rostedt Cc: Petr Mladek Cc: LKML Cc: linux-arm-kernel@lists.infradead.org Cc: linux-c6x-dev@linux-c6x.org Cc: linux-ia64@vger.kernel.org Cc: linux-am33-list@redhat.com Cc: linux-sh@vger.kernel.org Cc: linux-edac@vger.kernel.org Cc: x86@kernel.org Cc: linux-snps-arc@lists.infradead.org Signed-off-by: Sergey Senozhatsky [pmladek@suse.com: updated commit message] Signed-off-by: Petr Mladek --- kernel/irq/debug.h | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h index 17f05ef8f575..7e06dd275c17 100644 --- a/kernel/irq/debug.h +++ b/kernel/irq/debug.h @@ -3,8 +3,6 @@ * Debugging printout: */ -#include - #define ___P(f) if (desc->status_use_accessors & f) printk("%14s set\n", #f) #define ___PS(f) if (desc->istate & f) printk("%14s set\n", #f) /* FIXME */ @@ -14,14 +12,14 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) { printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n", irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled); - printk("->handle_irq(): %p, ", desc->handle_irq); - print_symbol("%s\n", (unsigned long)desc->handle_irq); - printk("->irq_data.chip(): %p, ", desc->irq_data.chip); - print_symbol("%s\n", (unsigned long)desc->irq_data.chip); + printk("->handle_irq(): %p, %pS\n", + desc->handle_irq, desc->handle_irq); + printk("->irq_data.chip(): %p, %pS\n", + desc->irq_data.chip, desc->irq_data.chip); printk("->action(): %p\n", desc->action); if (desc->action) { - printk("->action->handler(): %p, ", desc->action->handler); - print_symbol("%s\n", (unsigned long)desc->action->handler); + printk("->action->handler(): %p, %pS\n", + desc->action->handler, desc->action->handler); } ___P(IRQ_LEVEL); -- cgit v1.2.2 From a4a0683fd5e64e029421a465525352f01d57f27a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 1 Dec 2017 17:22:19 -0500 Subject: bpf_obj_do_pin(): switch to vfs_mkobj(), quit abusing ->mknod() Signed-off-by: Al Viro --- kernel/bpf/inode.c | 50 ++++++++++++++++++++++---------------------------- 1 file changed, 22 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 01aaef1a77c5..2b75faccc771 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -150,39 +150,29 @@ static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) return 0; } -static int bpf_mkobj_ops(struct inode *dir, struct dentry *dentry, - umode_t mode, const struct inode_operations *iops) +static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw, + const struct inode_operations *iops) { - struct inode *inode; - - inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFREG); + struct inode *dir = dentry->d_parent->d_inode; + struct inode *inode = bpf_get_inode(dir->i_sb, dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); inode->i_op = iops; - inode->i_private = dentry->d_fsdata; + inode->i_private = raw; bpf_dentry_finalize(dentry, inode, dir); return 0; } -static int bpf_mkobj(struct inode *dir, struct dentry *dentry, umode_t mode, - dev_t devt) +static int bpf_mkprog(struct dentry *dentry, umode_t mode, void *arg) { - enum bpf_type type = MINOR(devt); - - if (MAJOR(devt) != UNNAMED_MAJOR || !S_ISREG(mode) || - dentry->d_fsdata == NULL) - return -EPERM; + return bpf_mkobj_ops(dentry, mode, arg, &bpf_prog_iops); +} - switch (type) { - case BPF_TYPE_PROG: - return bpf_mkobj_ops(dir, dentry, mode, &bpf_prog_iops); - case BPF_TYPE_MAP: - return bpf_mkobj_ops(dir, dentry, mode, &bpf_map_iops); - default: - return -EPERM; - } +static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg) +{ + return bpf_mkobj_ops(dentry, mode, arg, &bpf_map_iops); } static struct dentry * @@ -218,7 +208,6 @@ static int bpf_symlink(struct inode *dir, struct dentry *dentry, static const struct inode_operations bpf_dir_iops = { .lookup = bpf_lookup, - .mknod = bpf_mkobj, .mkdir = bpf_mkdir, .symlink = bpf_symlink, .rmdir = simple_rmdir, @@ -234,7 +223,6 @@ static int bpf_obj_do_pin(const struct filename *pathname, void *raw, struct inode *dir; struct path path; umode_t mode; - dev_t devt; int ret; dentry = kern_path_create(AT_FDCWD, pathname->name, &path, 0); @@ -242,9 +230,8 @@ static int bpf_obj_do_pin(const struct filename *pathname, void *raw, return PTR_ERR(dentry); mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask()); - devt = MKDEV(UNNAMED_MAJOR, type); - ret = security_path_mknod(&path, dentry, mode, devt); + ret = security_path_mknod(&path, dentry, mode, 0); if (ret) goto out; @@ -254,9 +241,16 @@ static int bpf_obj_do_pin(const struct filename *pathname, void *raw, goto out; } - dentry->d_fsdata = raw; - ret = vfs_mknod(dir, dentry, mode, devt); - dentry->d_fsdata = NULL; + switch (type) { + case BPF_TYPE_PROG: + ret = vfs_mkobj(dentry, mode, bpf_mkprog, raw); + break; + case BPF_TYPE_MAP: + ret = vfs_mkobj(dentry, mode, bpf_mkmap, raw); + break; + default: + ret = -EPERM; + } out: done_path_create(&path, dentry); return ret; -- cgit v1.2.2 From 16f07c551e3ac433874031433231140d38e69ccd Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 4 Jan 2018 13:55:03 -0800 Subject: bpf: implement syscall command BPF_MAP_GET_NEXT_KEY for stacktrace map Currently, bpf syscall command BPF_MAP_GET_NEXT_KEY is not supported for stacktrace map. However, there are use cases where user space wants to enumerate all stacktrace map entries where BPF_MAP_GET_NEXT_KEY command will be really helpful. In addition, if user space wants to delete all map entries in order to save memory and does not want to close the map file descriptor, BPF_MAP_GET_NEXT_KEY may help improve performance if map entries are sparsely populated. The implementation has similar behavior for BPF_MAP_GET_NEXT_KEY implementation in hashtab. If user provides a NULL key pointer or an invalid key, the first key is returned. Otherwise, the first valid key after the input parameter "key" is returned, or -ENOENT if no valid key can be found. Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann --- kernel/bpf/stackmap.c | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index a15bc636cc98..6c63c2222ea8 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -226,9 +226,33 @@ int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) return 0; } -static int stack_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +static int stack_map_get_next_key(struct bpf_map *map, void *key, + void *next_key) { - return -EINVAL; + struct bpf_stack_map *smap = container_of(map, + struct bpf_stack_map, map); + u32 id; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + if (!key) { + id = 0; + } else { + id = *(u32 *)key; + if (id >= smap->n_buckets || !smap->buckets[id]) + id = 0; + else + id++; + } + + while (id < smap->n_buckets && !smap->buckets[id]) + id++; + + if (id >= smap->n_buckets) + return -ENOENT; + + *(u32 *)next_key = id; + return 0; } static int stack_map_update_elem(struct bpf_map *map, void *key, void *value, -- cgit v1.2.2 From 983c751532738b83c4c5b51192ebc6fbfe9330f7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 8 Jan 2018 05:38:32 -0800 Subject: workqueue: separate out init_rescuer() Separate out init_rescuer() from __alloc_workqueue_key() to prepare for early init support for WQ_MEM_RECLAIM. This patch doesn't introduce any functional changes. Signed-off-by: Tejun Heo Cc: Paul McKenney --- kernel/workqueue.c | 56 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 33 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 43d18cb46308..5baac7c8a5f9 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3939,6 +3939,37 @@ static int wq_clamp_max_active(int max_active, unsigned int flags, return clamp_val(max_active, 1, lim); } +/* + * Workqueues which may be used during memory reclaim should have a rescuer + * to guarantee forward progress. + */ +static int init_rescuer(struct workqueue_struct *wq) +{ + struct worker *rescuer; + int ret; + + if (!(wq->flags & WQ_MEM_RECLAIM)) + return 0; + + rescuer = alloc_worker(NUMA_NO_NODE); + if (!rescuer) + return -ENOMEM; + + rescuer->rescue_wq = wq; + rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", wq->name); + ret = PTR_ERR_OR_ZERO(rescuer->task); + if (ret) { + kfree(rescuer); + return ret; + } + + wq->rescuer = rescuer; + kthread_bind_mask(rescuer->task, cpu_possible_mask); + wake_up_process(rescuer->task); + + return 0; +} + struct workqueue_struct *__alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active, @@ -4001,29 +4032,8 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, if (alloc_and_link_pwqs(wq) < 0) goto err_free_wq; - /* - * Workqueues which may be used during memory reclaim should - * have a rescuer to guarantee forward progress. - */ - if (flags & WQ_MEM_RECLAIM) { - struct worker *rescuer; - - rescuer = alloc_worker(NUMA_NO_NODE); - if (!rescuer) - goto err_destroy; - - rescuer->rescue_wq = wq; - rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", - wq->name); - if (IS_ERR(rescuer->task)) { - kfree(rescuer); - goto err_destroy; - } - - wq->rescuer = rescuer; - kthread_bind_mask(rescuer->task, cpu_possible_mask); - wake_up_process(rescuer->task); - } + if (init_rescuer(wq) < 0) + goto err_destroy; if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq)) goto err_destroy; -- cgit v1.2.2 From 40c17f75dfa9ec163478efd3f7443fd6af9992c9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 8 Jan 2018 05:38:37 -0800 Subject: workqueue: allow WQ_MEM_RECLAIM on early init workqueues Workqueues can be created early during boot before workqueue subsystem in fully online - work items are queued waiting for later full initialization. However, early init wasn't supported for WQ_MEM_RECLAIM workqueues causing unnecessary annoyances for a subset of users. Expand early init support to include WQ_MEM_RECLAIM workqueues. Signed-off-by: Tejun Heo Cc: Paul McKenney --- kernel/workqueue.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 5baac7c8a5f9..c86cc1ed678b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4032,7 +4032,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, if (alloc_and_link_pwqs(wq) < 0) goto err_free_wq; - if (init_rescuer(wq) < 0) + if (wq_online && init_rescuer(wq) < 0) goto err_destroy; if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq)) @@ -5639,6 +5639,8 @@ int __init workqueue_init(void) * archs such as power and arm64. As per-cpu pools created * previously could be missing node hint and unbound pools NUMA * affinity, fix them up. + * + * Also, while iterating workqueues, create rescuers if requested. */ wq_numa_init(); @@ -5650,8 +5652,12 @@ int __init workqueue_init(void) } } - list_for_each_entry(wq, &workqueues, list) + list_for_each_entry(wq, &workqueues, list) { wq_update_unbound_numa(wq, smp_processor_id(), true); + WARN(init_rescuer(wq), + "workqueue: failed to create early rescuer for %s", + wq->name); + } mutex_unlock(&wq_pool_mutex); -- cgit v1.2.2 From 5896351ea9360072f8bdd9eee186861a9d13db6d Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 8 Jan 2018 07:51:17 -0800 Subject: bpf: fix verifier GPF in kmalloc failure path syzbot reported the following panic in the verifier triggered by kmalloc error injection: kasan: GPF could be caused by NULL-ptr deref or user memory access RIP: 0010:copy_func_state kernel/bpf/verifier.c:403 [inline] RIP: 0010:copy_verifier_state+0x364/0x590 kernel/bpf/verifier.c:431 Call Trace: pop_stack+0x8c/0x270 kernel/bpf/verifier.c:449 push_stack kernel/bpf/verifier.c:491 [inline] check_cond_jmp_op kernel/bpf/verifier.c:3598 [inline] do_check+0x4b60/0xa050 kernel/bpf/verifier.c:4731 bpf_check+0x3296/0x58c0 kernel/bpf/verifier.c:5489 bpf_prog_load+0xa2a/0x1b00 kernel/bpf/syscall.c:1198 SYSC_bpf kernel/bpf/syscall.c:1807 [inline] SyS_bpf+0x1044/0x4420 kernel/bpf/syscall.c:1769 when copy_verifier_state() aborts in the middle due to kmalloc failure some of the frames could have been partially copied while current free_verifier_state() loop for (i = 0; i <= state->curframe; i++) assumed that all frames are non-null. Simply fix it by adding 'if (!state)' to free_func_state(). Also avoid stressing copy frame logic more if kzalloc fails in push_stack() free env->cur_state right away. Fixes: f4d7e40a5b71 ("bpf: introduce function calls (verification)") Reported-by: syzbot+32ac5a3e473f2e01cfc7@syzkaller.appspotmail.com Reported-by: syzbot+fa99e24f3c29d269a7d5@syzkaller.appspotmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a2b211262c25..d921ab387b0b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -375,6 +375,8 @@ static int realloc_func_state(struct bpf_func_state *state, int size, static void free_func_state(struct bpf_func_state *state) { + if (!state) + return; kfree(state->stack); kfree(state); } @@ -487,6 +489,8 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, } return &elem->st; err: + free_verifier_state(env->cur_state, true); + env->cur_state = NULL; /* pop all elements and return */ while (!pop_stack(env, NULL, NULL)); return NULL; -- cgit v1.2.2 From 24e6d5a59ac7d31adc0322de2d0117dfa370936f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 29 Dec 2017 08:53:53 +0100 Subject: mm: pass the vmem_altmap to arch_add_memory and __add_pages We can just pass this on instead of having to do a radix tree lookup without proper locking 2 levels into the callchain. Signed-off-by: Christoph Hellwig Signed-off-by: Dan Williams --- kernel/memremap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 403ab9cdb949..8488cdeead16 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -382,6 +382,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, if (altmap) { memcpy(&page_map->altmap, altmap, sizeof(*altmap)); pgmap->altmap = &page_map->altmap; + altmap = pgmap->altmap; } pgmap->ref = ref; pgmap->res = &page_map->res; @@ -427,7 +428,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, goto err_pfn_remap; mem_hotplug_begin(); - error = arch_add_memory(nid, align_start, align_size, false); + error = arch_add_memory(nid, align_start, align_size, altmap, false); if (!error) move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], align_start >> PAGE_SHIFT, -- cgit v1.2.2 From da024512a1fa5c979257e442130ee1d468285057 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 29 Dec 2017 08:53:55 +0100 Subject: mm: pass the vmem_altmap to arch_remove_memory and __remove_pages We can just pass this on instead of having to do a radix tree lookup without proper locking 2 levels into the callchain. Signed-off-by: Christoph Hellwig Signed-off-by: Dan Williams --- kernel/memremap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 8488cdeead16..380fca1c4a02 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -304,7 +304,7 @@ static void devm_memremap_pages_release(struct device *dev, void *data) align_size = ALIGN(resource_size(res), SECTION_SIZE); mem_hotplug_begin(); - arch_remove_memory(align_start, align_size); + arch_remove_memory(align_start, align_size, pgmap->altmap); mem_hotplug_done(); untrack_pfn(NULL, PHYS_PFN(align_start), align_size); -- cgit v1.2.2 From a99583e780c751003ac9c0105eec9a3b23ec3bc4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 29 Dec 2017 08:53:57 +0100 Subject: mm: pass the vmem_altmap to memmap_init_zone Pass the vmem_altmap two levels down instead of needing a lookup. Signed-off-by: Christoph Hellwig Signed-off-by: Dan Williams --- kernel/memremap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 380fca1c4a02..64b12c806cc5 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -432,7 +432,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, if (!error) move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], align_start >> PAGE_SHIFT, - align_size >> PAGE_SHIFT); + align_size >> PAGE_SHIFT, altmap); mem_hotplug_done(); if (error) goto err_add_memory; -- cgit v1.2.2 From 0822acb86cf340cd45b3af6436cec7e3bb24ebd2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 29 Dec 2017 08:54:00 +0100 Subject: mm: move get_dev_pagemap out of line MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a pretty big function, which should be out of line in general, and a no-op stub if CONFIG_ZONE_DEVICЕ is not set. Signed-off-by: Christoph Hellwig Reviewed-by: Logan Gunthorpe Signed-off-by: Dan Williams --- kernel/memremap.c | 36 ++++++++++++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 64b12c806cc5..3df6cd4ffb40 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -314,7 +314,7 @@ static void devm_memremap_pages_release(struct device *dev, void *data) } /* assumes rcu_read_lock() held at entry */ -struct dev_pagemap *find_dev_pagemap(resource_size_t phys) +static struct dev_pagemap *find_dev_pagemap(resource_size_t phys) { struct page_map *page_map; @@ -501,8 +501,40 @@ struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) return pgmap ? pgmap->altmap : NULL; } -#endif /* CONFIG_ZONE_DEVICE */ +/** + * get_dev_pagemap() - take a new live reference on the dev_pagemap for @pfn + * @pfn: page frame number to lookup page_map + * @pgmap: optional known pgmap that already has a reference + * + * @pgmap allows the overhead of a lookup to be bypassed when @pfn lands in the + * same mapping. + */ +struct dev_pagemap *get_dev_pagemap(unsigned long pfn, + struct dev_pagemap *pgmap) +{ + const struct resource *res = pgmap ? pgmap->res : NULL; + resource_size_t phys = PFN_PHYS(pfn); + + /* + * In the cached case we're already holding a live reference so + * we can simply do a blind increment + */ + if (res && phys >= res->start && phys <= res->end) { + percpu_ref_get(pgmap->ref); + return pgmap; + } + + /* fall back to slow path lookup */ + rcu_read_lock(); + pgmap = find_dev_pagemap(phys); + if (pgmap && !percpu_ref_tryget_live(pgmap->ref)) + pgmap = NULL; + rcu_read_unlock(); + + return pgmap; +} +#endif /* CONFIG_ZONE_DEVICE */ #if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC) void put_zone_device_private_or_public_page(struct page *page) -- cgit v1.2.2 From 832d7aa051106c927cae05ced29d3fd31459ed21 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 29 Dec 2017 08:54:01 +0100 Subject: mm: optimize dev_pagemap reference counting around get_dev_pagemap Change the calling convention so that get_dev_pagemap always consumes the previous reference instead of doing this using an explicit earlier call to put_dev_pagemap in the callers. The callers will still need to put the final reference after finishing the loop over the pages. Signed-off-by: Christoph Hellwig Reviewed-by: Logan Gunthorpe Signed-off-by: Dan Williams --- kernel/memremap.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 3df6cd4ffb40..891c77487a6a 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -507,22 +507,23 @@ struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) * @pfn: page frame number to lookup page_map * @pgmap: optional known pgmap that already has a reference * - * @pgmap allows the overhead of a lookup to be bypassed when @pfn lands in the - * same mapping. + * If @pgmap is non-NULL and covers @pfn it will be returned as-is. If @pgmap + * is non-NULL but does not cover @pfn the reference to it will be released. */ struct dev_pagemap *get_dev_pagemap(unsigned long pfn, struct dev_pagemap *pgmap) { - const struct resource *res = pgmap ? pgmap->res : NULL; resource_size_t phys = PFN_PHYS(pfn); /* - * In the cached case we're already holding a live reference so - * we can simply do a blind increment + * In the cached case we're already holding a live reference. */ - if (res && phys >= res->start && phys <= res->end) { - percpu_ref_get(pgmap->ref); - return pgmap; + if (pgmap) { + const struct resource *res = pgmap ? pgmap->res : NULL; + + if (res && phys >= res->start && phys <= res->end) + return pgmap; + put_dev_pagemap(pgmap); } /* fall back to slow path lookup */ -- cgit v1.2.2 From 0628b8c650718f4dfedfcdc9ed136bf7e394aae7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 29 Dec 2017 08:54:02 +0100 Subject: memremap: remove to_vmem_altmap All callers are gone now. Signed-off-by: Christoph Hellwig Signed-off-by: Dan Williams --- kernel/memremap.c | 26 -------------------------- 1 file changed, 26 deletions(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 891c77487a6a..b09517439dec 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -476,32 +476,6 @@ void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns) altmap->alloc -= nr_pfns; } -struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) -{ - /* - * 'memmap_start' is the virtual address for the first "struct - * page" in this range of the vmemmap array. In the case of - * CONFIG_SPARSEMEM_VMEMMAP a page_to_pfn conversion is simple - * pointer arithmetic, so we can perform this to_vmem_altmap() - * conversion without concern for the initialization state of - * the struct page fields. - */ - struct page *page = (struct page *) memmap_start; - struct dev_pagemap *pgmap; - - /* - * Unconditionally retrieve a dev_pagemap associated with the - * given physical address, this is only for use in the - * arch_{add|remove}_memory() for setting up and tearing down - * the memmap. - */ - rcu_read_lock(); - pgmap = find_dev_pagemap(__pfn_to_phys(page_to_pfn(page))); - rcu_read_unlock(); - - return pgmap ? pgmap->altmap : NULL; -} - /** * get_dev_pagemap() - take a new live reference on the dev_pagemap for @pfn * @pfn: page frame number to lookup page_map -- cgit v1.2.2 From 7003e3b1f64d0195ea9d31aed0b096ad38f3cb54 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 29 Dec 2017 08:54:03 +0100 Subject: memremap: simplify duplicate region handling in devm_memremap_pages __radix_tree_insert already checks for duplicates and returns -EEXIST in that case, so remove the duplicate (and racy) duplicates check. Signed-off-by: Christoph Hellwig Reviewed-by: Logan Gunthorpe Signed-off-by: Dan Williams --- kernel/memremap.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index b09517439dec..12e78528fea4 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -396,17 +396,6 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, align_end = align_start + align_size - 1; foreach_order_pgoff(res, order, pgoff) { - struct dev_pagemap *dup; - - rcu_read_lock(); - dup = find_dev_pagemap(res->start + PFN_PHYS(pgoff)); - rcu_read_unlock(); - if (dup) { - dev_err(dev, "%s: %pr collides with mapping for %s\n", - __func__, res, dev_name(dup->dev)); - error = -EBUSY; - break; - } error = __radix_tree_insert(&pgmap_radix, PHYS_PFN(res->start) + pgoff, order, page_map); if (error) { -- cgit v1.2.2 From e7744aa25cffe26d3767c9ffcf4e130cca1dff00 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Fri, 29 Dec 2017 08:54:04 +0100 Subject: memremap: drop private struct page_map 'struct page_map' is a private structure of 'struct dev_pagemap' but the latter replicates all the same fields as the former so there isn't much value in it. Thus drop it in favour of a completely public struct. This is a clean up in preperation for a more generally useful 'devm_memeremap_pages' interface. Signed-off-by: Logan Gunthorpe Signed-off-by: Christoph Hellwig Signed-off-by: Dan Williams --- kernel/memremap.c | 68 +++++++++++++++++++++---------------------------------- 1 file changed, 26 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 12e78528fea4..9207c44cce20 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -188,13 +188,6 @@ static RADIX_TREE(pgmap_radix, GFP_KERNEL); #define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1) #define SECTION_SIZE (1UL << PA_SECTION_SHIFT) -struct page_map { - struct resource res; - struct percpu_ref *ref; - struct dev_pagemap pgmap; - struct vmem_altmap altmap; -}; - static unsigned long order_at(struct resource *res, unsigned long pgoff) { unsigned long phys_pgoff = PHYS_PFN(res->start) + pgoff; @@ -260,22 +253,21 @@ static void pgmap_radix_release(struct resource *res) synchronize_rcu(); } -static unsigned long pfn_first(struct page_map *page_map) +static unsigned long pfn_first(struct dev_pagemap *pgmap) { - struct dev_pagemap *pgmap = &page_map->pgmap; - const struct resource *res = &page_map->res; - struct vmem_altmap *altmap = pgmap->altmap; + const struct resource *res = &pgmap->res; + struct vmem_altmap *altmap = &pgmap->altmap; unsigned long pfn; pfn = res->start >> PAGE_SHIFT; - if (altmap) + if (pgmap->altmap_valid) pfn += vmem_altmap_offset(altmap); return pfn; } -static unsigned long pfn_end(struct page_map *page_map) +static unsigned long pfn_end(struct dev_pagemap *pgmap) { - const struct resource *res = &page_map->res; + const struct resource *res = &pgmap->res; return (res->start + resource_size(res)) >> PAGE_SHIFT; } @@ -285,13 +277,12 @@ static unsigned long pfn_end(struct page_map *page_map) static void devm_memremap_pages_release(struct device *dev, void *data) { - struct page_map *page_map = data; - struct resource *res = &page_map->res; + struct dev_pagemap *pgmap = data; + struct resource *res = &pgmap->res; resource_size_t align_start, align_size; - struct dev_pagemap *pgmap = &page_map->pgmap; unsigned long pfn; - for_each_device_pfn(pfn, page_map) + for_each_device_pfn(pfn, pgmap) put_page(pfn_to_page(pfn)); if (percpu_ref_tryget_live(pgmap->ref)) { @@ -304,24 +295,22 @@ static void devm_memremap_pages_release(struct device *dev, void *data) align_size = ALIGN(resource_size(res), SECTION_SIZE); mem_hotplug_begin(); - arch_remove_memory(align_start, align_size, pgmap->altmap); + arch_remove_memory(align_start, align_size, pgmap->altmap_valid ? + &pgmap->altmap : NULL); mem_hotplug_done(); untrack_pfn(NULL, PHYS_PFN(align_start), align_size); pgmap_radix_release(res); - dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc, - "%s: failed to free all reserved pages\n", __func__); + dev_WARN_ONCE(dev, pgmap->altmap.alloc, + "%s: failed to free all reserved pages\n", __func__); } /* assumes rcu_read_lock() held at entry */ static struct dev_pagemap *find_dev_pagemap(resource_size_t phys) { - struct page_map *page_map; - WARN_ON_ONCE(!rcu_read_lock_held()); - page_map = radix_tree_lookup(&pgmap_radix, PHYS_PFN(phys)); - return page_map ? &page_map->pgmap : NULL; + return radix_tree_lookup(&pgmap_radix, PHYS_PFN(phys)); } /** @@ -349,7 +338,6 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, unsigned long pfn, pgoff, order; pgprot_t pgprot = PAGE_KERNEL; struct dev_pagemap *pgmap; - struct page_map *page_map; int error, nid, is_ram, i = 0; align_start = res->start & ~(SECTION_SIZE - 1); @@ -370,22 +358,20 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, if (!ref) return ERR_PTR(-EINVAL); - page_map = devres_alloc_node(devm_memremap_pages_release, - sizeof(*page_map), GFP_KERNEL, dev_to_node(dev)); - if (!page_map) + pgmap = devres_alloc_node(devm_memremap_pages_release, + sizeof(*pgmap), GFP_KERNEL, dev_to_node(dev)); + if (!pgmap) return ERR_PTR(-ENOMEM); - pgmap = &page_map->pgmap; - memcpy(&page_map->res, res, sizeof(*res)); + memcpy(&pgmap->res, res, sizeof(*res)); pgmap->dev = dev; if (altmap) { - memcpy(&page_map->altmap, altmap, sizeof(*altmap)); - pgmap->altmap = &page_map->altmap; - altmap = pgmap->altmap; + memcpy(&pgmap->altmap, altmap, sizeof(*altmap)); + pgmap->altmap_valid = true; + altmap = &pgmap->altmap; } pgmap->ref = ref; - pgmap->res = &page_map->res; pgmap->type = MEMORY_DEVICE_HOST; pgmap->page_fault = NULL; pgmap->page_free = NULL; @@ -397,7 +383,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, foreach_order_pgoff(res, order, pgoff) { error = __radix_tree_insert(&pgmap_radix, - PHYS_PFN(res->start) + pgoff, order, page_map); + PHYS_PFN(res->start) + pgoff, order, pgmap); if (error) { dev_err(dev, "%s: failed: %d\n", __func__, error); break; @@ -426,7 +412,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, if (error) goto err_add_memory; - for_each_device_pfn(pfn, page_map) { + for_each_device_pfn(pfn, pgmap) { struct page *page = pfn_to_page(pfn); /* @@ -441,7 +427,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, if (!(++i % 1024)) cond_resched(); } - devres_add(dev, page_map); + devres_add(dev, pgmap); return __va(res->start); err_add_memory: @@ -449,7 +435,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, err_pfn_remap: err_radix: pgmap_radix_release(res); - devres_free(page_map); + devres_free(pgmap); return ERR_PTR(error); } EXPORT_SYMBOL(devm_memremap_pages); @@ -482,9 +468,7 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn, * In the cached case we're already holding a live reference. */ if (pgmap) { - const struct resource *res = pgmap ? pgmap->res : NULL; - - if (res && phys >= res->start && phys <= res->end) + if (phys >= pgmap->res.start && phys <= pgmap->res.end) return pgmap; put_dev_pagemap(pgmap); } -- cgit v1.2.2 From e8d5134833006a46fcbefc5f4a84d0b62bd520e7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 29 Dec 2017 08:54:05 +0100 Subject: memremap: change devm_memremap_pages interface to use struct dev_pagemap This new interface is similar to how struct device (and many others) work. The caller initializes a 'struct dev_pagemap' as required and calls 'devm_memremap_pages'. This allows the pagemap structure to be embedded in another structure and thus container_of can be used. In this way application specific members can be stored in a containing struct. This will be used by the P2P infrastructure and HMM could probably be cleaned up to use it as well (instead of having it's own, similar 'hmm_devmem_pages_create' function). Signed-off-by: Logan Gunthorpe Signed-off-by: Christoph Hellwig Signed-off-by: Dan Williams --- kernel/memremap.c | 51 +++++++++++++++++++++------------------------------ 1 file changed, 21 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 9207c44cce20..a9a948cd3d7f 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -275,9 +275,10 @@ static unsigned long pfn_end(struct dev_pagemap *pgmap) #define for_each_device_pfn(pfn, map) \ for (pfn = pfn_first(map); pfn < pfn_end(map); pfn++) -static void devm_memremap_pages_release(struct device *dev, void *data) +static void devm_memremap_pages_release(void *data) { struct dev_pagemap *pgmap = data; + struct device *dev = pgmap->dev; struct resource *res = &pgmap->res; resource_size_t align_start, align_size; unsigned long pfn; @@ -316,29 +317,34 @@ static struct dev_pagemap *find_dev_pagemap(resource_size_t phys) /** * devm_memremap_pages - remap and provide memmap backing for the given resource * @dev: hosting device for @res - * @res: "host memory" address range - * @ref: a live per-cpu reference count - * @altmap: optional descriptor for allocating the memmap from @res + * @pgmap: pointer to a struct dev_pgmap * * Notes: - * 1/ @ref must be 'live' on entry and 'dead' before devm_memunmap_pages() time - * (or devm release event). The expected order of events is that @ref has + * 1/ At a minimum the res, ref and type members of @pgmap must be initialized + * by the caller before passing it to this function + * + * 2/ The altmap field may optionally be initialized, in which case altmap_valid + * must be set to true + * + * 3/ pgmap.ref must be 'live' on entry and 'dead' before devm_memunmap_pages() + * time (or devm release event). The expected order of events is that ref has * been through percpu_ref_kill() before devm_memremap_pages_release(). The * wait for the completion of all references being dropped and * percpu_ref_exit() must occur after devm_memremap_pages_release(). * - * 2/ @res is expected to be a host memory range that could feasibly be + * 4/ res is expected to be a host memory range that could feasibly be * treated as a "System RAM" range, i.e. not a device mmio range, but * this is not enforced. */ -void *devm_memremap_pages(struct device *dev, struct resource *res, - struct percpu_ref *ref, struct vmem_altmap *altmap) +void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) { resource_size_t align_start, align_size, align_end; + struct vmem_altmap *altmap = pgmap->altmap_valid ? + &pgmap->altmap : NULL; unsigned long pfn, pgoff, order; pgprot_t pgprot = PAGE_KERNEL; - struct dev_pagemap *pgmap; int error, nid, is_ram, i = 0; + struct resource *res = &pgmap->res; align_start = res->start & ~(SECTION_SIZE - 1); align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) @@ -355,27 +361,10 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, if (is_ram == REGION_INTERSECTS) return __va(res->start); - if (!ref) + if (!pgmap->ref) return ERR_PTR(-EINVAL); - pgmap = devres_alloc_node(devm_memremap_pages_release, - sizeof(*pgmap), GFP_KERNEL, dev_to_node(dev)); - if (!pgmap) - return ERR_PTR(-ENOMEM); - - memcpy(&pgmap->res, res, sizeof(*res)); - pgmap->dev = dev; - if (altmap) { - memcpy(&pgmap->altmap, altmap, sizeof(*altmap)); - pgmap->altmap_valid = true; - altmap = &pgmap->altmap; - } - pgmap->ref = ref; - pgmap->type = MEMORY_DEVICE_HOST; - pgmap->page_fault = NULL; - pgmap->page_free = NULL; - pgmap->data = NULL; mutex_lock(&pgmap_lock); error = 0; @@ -423,11 +412,13 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, */ list_del(&page->lru); page->pgmap = pgmap; - percpu_ref_get(ref); + percpu_ref_get(pgmap->ref); if (!(++i % 1024)) cond_resched(); } - devres_add(dev, pgmap); + + devm_add_action(dev, devm_memremap_pages_release, pgmap); + return __va(res->start); err_add_memory: -- cgit v1.2.2 From e697c5b90e97792187e45f8d78fb2bfa62eb0496 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 29 Dec 2017 08:54:06 +0100 Subject: memremap: merge find_dev_pagemap into get_dev_pagemap There is only one caller of the trivial function find_dev_pagemap left, so just merge it into the caller. Signed-off-by: Christoph Hellwig Signed-off-by: Dan Williams --- kernel/memremap.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index a9a948cd3d7f..ada31b0d76d4 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -306,14 +306,6 @@ static void devm_memremap_pages_release(void *data) "%s: failed to free all reserved pages\n", __func__); } -/* assumes rcu_read_lock() held at entry */ -static struct dev_pagemap *find_dev_pagemap(resource_size_t phys) -{ - WARN_ON_ONCE(!rcu_read_lock_held()); - - return radix_tree_lookup(&pgmap_radix, PHYS_PFN(phys)); -} - /** * devm_memremap_pages - remap and provide memmap backing for the given resource * @dev: hosting device for @res @@ -466,7 +458,7 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn, /* fall back to slow path lookup */ rcu_read_lock(); - pgmap = find_dev_pagemap(phys); + pgmap = radix_tree_lookup(&pgmap_radix, PHYS_PFN(phys)); if (pgmap && !percpu_ref_tryget_live(pgmap->ref)) pgmap = NULL; rcu_read_unlock(); -- cgit v1.2.2 From b865ea64304ed591b7ab92d74efb12eff5ff4cbb Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Fri, 10 Nov 2017 08:48:25 +0900 Subject: sections: split dereference_function_descriptor() There are two format specifiers to print out a pointer in symbolic format: '%pS/%ps' and '%pF/%pf'. On most architectures, the two mean exactly the same thing, but some architectures (ia64, ppc64, parisc64) use an indirect pointer for C function pointers, where the function pointer points to a function descriptor (which in turn contains the actual pointer to the code). The '%pF/%pf, when used appropriately, automatically does the appropriate function descriptor dereference on such architectures. The "when used appropriately" part is tricky. Basically this is a subtle ABI detail, specific to some platforms, that made it to the API level and people can be unaware of it and miss the whole "we need to dereference the function" business out. [1] proves that point (note that it fixes only '%pF' and '%pS', there might be '%pf' and '%ps' cases as well). It appears that we can handle everything within the affected arches and make '%pS/%ps' smart enough to retire '%pF/%pf'. Function descriptors live in .opd elf section and all affected arches (ia64, ppc64, parisc64) handle it properly for kernel and modules. So we, technically, can decide if the dereference is needed by simply looking at the pointer: if it belongs to .opd section then we need to dereference it. The kernel and modules have their own .opd sections, obviously, that's why we need to split dereference_function_descriptor() and use separate kernel and module dereference arch callbacks. This patch does the first step, it a) adds dereference_kernel_function_descriptor() function. b) adds a weak alias to dereference_module_function_descriptor() function. So, for the time being, we will have: 1) dereference_function_descriptor() A generic function, that simply dereferences the pointer. There is bunch of places that call it: kgdbts, init/main.c, extable, etc. 2) dereference_kernel_function_descriptor() A function to call on kernel symbols that does kernel .opd section address range test. 3) dereference_module_function_descriptor() A function to call on modules' symbols that does modules' .opd section address range test. [1] https://marc.info/?l=linux-kernel&m=150472969730573 Link: http://lkml.kernel.org/r/20171109234830.5067-2-sergey.senozhatsky@gmail.com To: Fenghua Yu To: Benjamin Herrenschmidt To: Paul Mackerras To: Michael Ellerman To: James Bottomley Cc: Andrew Morton Cc: Jessica Yu Cc: Steven Rostedt Cc: linux-ia64@vger.kernel.org Cc: linux-parisc@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Sergey Senozhatsky Tested-by: Tony Luck #ia64 Tested-by: Santosh Sivaraj #powerpc Tested-by: Helge Deller #parisc64 Signed-off-by: Petr Mladek --- kernel/module.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index f0411a271765..65f6561d70e1 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3938,6 +3938,12 @@ static const char *get_ksymbol(struct module *mod, return symname(kallsyms, best); } +void * __weak dereference_module_function_descriptor(struct module *mod, + void *ptr) +{ + return ptr; +} + /* For kallsyms to ask for address resolution. NULL means not found. Careful * not to lock to avoid deadlock on oopses, simply disable preemption. */ const char *module_address_lookup(unsigned long addr, -- cgit v1.2.2 From 04b8eb7a4ccd9ef9343e2720ccf2a5db8cfe2f67 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 6 Dec 2017 13:36:49 +0900 Subject: symbol lookup: introduce dereference_symbol_descriptor() dereference_symbol_descriptor() invokes appropriate ARCH specific function descriptor dereference callbacks: - dereference_kernel_function_descriptor() if the pointer is a kernel symbol; - dereference_module_function_descriptor() if the pointer is a module symbol. This is the last step needed to make '%pS/%ps' smart enough to handle function descriptor dereference on affected ARCHs and to retire '%pF/%pf'. To refresh it: Some architectures (ia64, ppc64, parisc64) use an indirect pointer for C function pointers - the function pointer points to a function descriptor and we need to dereference it to get the actual function pointer. Function descriptors live in .opd elf section and all affected ARCHs (ia64, ppc64, parisc64) handle it properly for kernel and modules. So we, technically, can decide if the dereference is needed by simply looking at the pointer: if it belongs to .opd section then we need to dereference it. The kernel and modules have their own .opd sections, obviously, that's why we need to split dereference_function_descriptor() and use separate kernel and module dereference arch callbacks. Link: http://lkml.kernel.org/r/20171206043649.GB15885@jagdpanzerIV Cc: Fenghua Yu Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: James Bottomley Cc: Andrew Morton Cc: Jessica Yu Cc: Steven Rostedt Cc: linux-ia64@vger.kernel.org Cc: linux-parisc@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Sergey Senozhatsky Tested-by: Tony Luck #ia64 Tested-by: Santosh Sivaraj #powerpc Tested-by: Helge Deller #parisc64 Signed-off-by: Petr Mladek --- kernel/kallsyms.c | 35 ----------------------------------- 1 file changed, 35 deletions(-) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 531ffa984bc2..0e4c0922908a 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -12,7 +12,6 @@ * compression (see scripts/kallsyms.c for a more complete description) */ #include -#include #include #include #include @@ -20,15 +19,12 @@ #include #include #include /* for cond_resched */ -#include #include #include #include #include #include -#include - /* * These will be re-linked against their real values * during the second link stage. @@ -52,37 +48,6 @@ extern const u16 kallsyms_token_index[] __weak; extern const unsigned long kallsyms_markers[] __weak; -static inline int is_kernel_inittext(unsigned long addr) -{ - if (addr >= (unsigned long)_sinittext - && addr <= (unsigned long)_einittext) - return 1; - return 0; -} - -static inline int is_kernel_text(unsigned long addr) -{ - if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) || - arch_is_kernel_text(addr)) - return 1; - return in_gate_area_no_mm(addr); -} - -static inline int is_kernel(unsigned long addr) -{ - if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end) - return 1; - return in_gate_area_no_mm(addr); -} - -static int is_ksym_addr(unsigned long addr) -{ - if (IS_ENABLED(CONFIG_KALLSYMS_ALL)) - return is_kernel(addr); - - return is_kernel_text(addr) || is_kernel_inittext(addr); -} - /* * Expand a compressed symbol data into the resulting uncompressed string, * if uncompressed string is too long (>= maxlen), it will be truncated, -- cgit v1.2.2 From 430e68d10baf55e4c40d4dd1de8201c1caf5dddd Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Wed, 10 Jan 2018 12:26:06 +0000 Subject: bpf: export function to write into verifier log buffer Rename the BPF verifier `verbose()` to `bpf_verifier_log_write()` and export it, so that other components (in particular, drivers for BPF offload) can reuse the user buffer log to dump error messages at verification time. Renaming `verbose()` was necessary in order to avoid a name so generic to be exported to the global namespace. However to prevent too much pain for backports, the calls to `verbose()` in the kernel BPF verifier were not changed. Instead, use function aliasing to make `verbose` point to `bpf_verifier_log_write`. Another solution could consist in making a wrapper around `verbose()`, but since it is a variadic function, I don't see a clean way without creating two identical wrappers, one for the verifier and one to export. Signed-off-by: Quentin Monnet Reviewed-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d921ab387b0b..3b2b47666180 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -169,11 +169,11 @@ struct bpf_call_arg_meta { static DEFINE_MUTEX(bpf_verifier_lock); /* log_level controls verbosity level of eBPF verifier. - * verbose() is used to dump the verification trace to the log, so the user - * can figure out what's wrong with the program + * bpf_verifier_log_write() is used to dump the verification trace to the log, + * so the user can figure out what's wrong with the program */ -static __printf(2, 3) void verbose(struct bpf_verifier_env *env, - const char *fmt, ...) +__printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, + const char *fmt, ...) { struct bpf_verifer_log *log = &env->log; unsigned int n; @@ -197,6 +197,14 @@ static __printf(2, 3) void verbose(struct bpf_verifier_env *env, else log->ubuf = NULL; } +EXPORT_SYMBOL_GPL(bpf_verifier_log_write); +/* Historically bpf_verifier_log_write was called verbose, but the name was too + * generic for symbol export. The function was renamed, but not the calls in + * the verifier to avoid complicating backports. Hence the alias below. + */ +static __printf(2, 3) void verbose(struct bpf_verifier_env *env, + const char *fmt, ...) + __attribute__((alias("bpf_verifier_log_write"))); static bool type_is_pkt_pointer(enum bpf_reg_type type) { -- cgit v1.2.2 From d0807da78e11d46f18399cbf8c4028c731346766 Mon Sep 17 00:00:00 2001 From: Miroslav Benes Date: Wed, 10 Jan 2018 11:01:28 +0100 Subject: livepatch: Remove immediate feature Immediate flag has been used to disable per-task consistency and patch all tasks immediately. It could be useful if the patch doesn't change any function or data semantics. However, it causes problems on its own. The consistency problem is currently broken with respect to immediate patches. func a patches 1i 2i 3 When the patch 3 is applied, only 2i function is checked (by stack checking facility). There might be a task sleeping in 1i though. Such task is migrated to 3, because we do not check 1i in klp_check_stack_func() at all. Coming atomic replace feature would be easier to implement and more reliable without immediate. Thus, remove immediate feature completely and save us from the problems. Note that force feature has the similar problem. However it is considered as a last resort. If used, administrator should not apply any new live patches and should plan for reboot into an updated kernel. The architectures would now need to provide HAVE_RELIABLE_STACKTRACE to fully support livepatch. Signed-off-by: Miroslav Benes Acked-by: Josh Poimboeuf Signed-off-by: Jiri Kosina --- kernel/livepatch/core.c | 12 +---------- kernel/livepatch/transition.c | 49 +++++-------------------------------------- 2 files changed, 6 insertions(+), 55 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 1c3c9b27c916..41be6061b92f 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -366,11 +366,6 @@ static int __klp_enable_patch(struct klp_patch *patch) /* * A reference is taken on the patch module to prevent it from being * unloaded. - * - * Note: For immediate (no consistency model) patches we don't allow - * patch modules to unload since there is no safe/sane method to - * determine if a thread is still running in the patched code contained - * in the patch module once the ftrace registration is successful. */ if (!try_module_get(patch->mod)) return -ENODEV; @@ -890,12 +885,7 @@ int klp_register_patch(struct klp_patch *patch) if (!klp_initialized()) return -ENODEV; - /* - * Architectures without reliable stack traces have to set - * patch->immediate because there's currently no way to patch kthreads - * with the consistency model. - */ - if (!klp_have_reliable_stack() && !patch->immediate) { + if (!klp_have_reliable_stack()) { pr_err("This architecture doesn't have support for the livepatch consistency model.\n"); return -ENOSYS; } diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index be5bfa533ee8..7c6631e693bc 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -82,7 +82,6 @@ static void klp_complete_transition(void) struct klp_func *func; struct task_struct *g, *task; unsigned int cpu; - bool immediate_func = false; pr_debug("'%s': completing %s transition\n", klp_transition_patch->mod->name, @@ -104,16 +103,9 @@ static void klp_complete_transition(void) klp_synchronize_transition(); } - if (klp_transition_patch->immediate) - goto done; - - klp_for_each_object(klp_transition_patch, obj) { - klp_for_each_func(obj, func) { + klp_for_each_object(klp_transition_patch, obj) + klp_for_each_func(obj, func) func->transition = false; - if (func->immediate) - immediate_func = true; - } - } /* Prevent klp_ftrace_handler() from seeing KLP_UNDEFINED state */ if (klp_target_state == KLP_PATCHED) @@ -132,7 +124,6 @@ static void klp_complete_transition(void) task->patch_state = KLP_UNDEFINED; } -done: klp_for_each_object(klp_transition_patch, obj) { if (!klp_is_object_loaded(obj)) continue; @@ -146,16 +137,11 @@ done: klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); /* - * See complementary comment in __klp_enable_patch() for why we - * keep the module reference for immediate patches. - * - * klp_forced or immediate_func set implies unbounded increase of - * module's ref count if the module is disabled/enabled in a loop. + * klp_forced set implies unbounded increase of module's ref count if + * the module is disabled/enabled in a loop. */ - if (!klp_forced && !klp_transition_patch->immediate && - !immediate_func && klp_target_state == KLP_UNPATCHED) { + if (!klp_forced && klp_target_state == KLP_UNPATCHED) module_put(klp_transition_patch->mod); - } klp_target_state = KLP_UNDEFINED; klp_transition_patch = NULL; @@ -223,9 +209,6 @@ static int klp_check_stack_func(struct klp_func *func, struct klp_ops *ops; int i; - if (func->immediate) - return 0; - for (i = 0; i < trace->nr_entries; i++) { address = trace->entries[i]; @@ -387,13 +370,6 @@ void klp_try_complete_transition(void) WARN_ON_ONCE(klp_target_state == KLP_UNDEFINED); - /* - * If the patch can be applied or reverted immediately, skip the - * per-task transitions. - */ - if (klp_transition_patch->immediate) - goto success; - /* * Try to switch the tasks to the target patch state by walking their * stacks and looking for any to-be-patched or to-be-unpatched @@ -437,7 +413,6 @@ void klp_try_complete_transition(void) return; } -success: /* we're done, now cleanup the data structures */ klp_complete_transition(); } @@ -457,13 +432,6 @@ void klp_start_transition(void) klp_transition_patch->mod->name, klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); - /* - * If the patch can be applied or reverted immediately, skip the - * per-task transitions. - */ - if (klp_transition_patch->immediate) - return; - /* * Mark all normal tasks as needing a patch state update. They'll * switch either in klp_try_complete_transition() or as they exit the @@ -513,13 +481,6 @@ void klp_init_transition(struct klp_patch *patch, int state) pr_debug("'%s': initializing %s transition\n", patch->mod->name, klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); - /* - * If the patch can be applied or reverted immediately, skip the - * per-task transitions. - */ - if (patch->immediate) - return; - /* * Initialize all tasks to the initial patch state to prepare them for * switching to the target state. -- cgit v1.2.2 From 8869016d3a58cbe7c31c70f4f008a92122b271c7 Mon Sep 17 00:00:00 2001 From: Miroslav Benes Date: Thu, 21 Dec 2017 14:40:43 +0100 Subject: livepatch: add locking to force and signal functions klp_send_signals() and klp_force_transition() do not acquire klp_mutex, because it seemed to be superfluous. A potential race in klp_send_signals() was harmless and there was nothing in klp_force_transition() which needed to be synchronized. That changed with the addition of klp_forced variable during the review process. There is a small window now, when klp_complete_transition() does not see klp_forced set to true while all tasks have been already transitioned to the target state. module_put() is called and the module can be removed. Acquire klp_mutex in sysfs callback to prevent it. Do the same for the signal sending just to be sure. There is no real downside to that. Fixes: c99a2be790b07 ("livepatch: force transition to finish") Fixes: 43347d56c8d9d ("livepatch: send a fake signal to all blocking tasks") Reported-by: Jason Baron Signed-off-by: Miroslav Benes Reviewed-by: Petr Mladek Acked-by: Josh Poimboeuf Signed-off-by: Jiri Kosina --- kernel/livepatch/core.c | 52 ++++++++++++++++++++++++++----------------------- 1 file changed, 28 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 1c3c9b27c916..8fd8e8f126da 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -537,22 +537,24 @@ static ssize_t signal_store(struct kobject *kobj, struct kobj_attribute *attr, int ret; bool val; - patch = container_of(kobj, struct klp_patch, kobj); - - /* - * klp_mutex lock is not grabbed here intentionally. It is not really - * needed. The race window is harmless and grabbing the lock would only - * hold the action back. - */ - if (patch != klp_transition_patch) - return -EINVAL; - ret = kstrtobool(buf, &val); if (ret) return ret; - if (val) - klp_send_signals(); + if (!val) + return count; + + mutex_lock(&klp_mutex); + + patch = container_of(kobj, struct klp_patch, kobj); + if (patch != klp_transition_patch) { + mutex_unlock(&klp_mutex); + return -EINVAL; + } + + klp_send_signals(); + + mutex_unlock(&klp_mutex); return count; } @@ -564,22 +566,24 @@ static ssize_t force_store(struct kobject *kobj, struct kobj_attribute *attr, int ret; bool val; - patch = container_of(kobj, struct klp_patch, kobj); - - /* - * klp_mutex lock is not grabbed here intentionally. It is not really - * needed. The race window is harmless and grabbing the lock would only - * hold the action back. - */ - if (patch != klp_transition_patch) - return -EINVAL; - ret = kstrtobool(buf, &val); if (ret) return ret; - if (val) - klp_force_transition(); + if (!val) + return count; + + mutex_lock(&klp_mutex); + + patch = container_of(kobj, struct klp_patch, kobj); + if (patch != klp_transition_patch) { + mutex_unlock(&klp_mutex); + return -EINVAL; + } + + klp_force_transition(); + + mutex_unlock(&klp_mutex); return count; } -- cgit v1.2.2 From 526c3ddb6aa270fe6f71d227eac1e189746cc257 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 3 Jan 2018 18:07:12 -0600 Subject: signal/arm64: Document conflicts with SI_USER and SIGFPE,SIGTRAP,SIGBUS Setting si_code to 0 results in a userspace seeing an si_code of 0. This is the same si_code as SI_USER. Posix and common sense requires that SI_USER not be a signal specific si_code. As such this use of 0 for the si_code is a pretty horribly broken ABI. Further use of si_code == 0 guaranteed that copy_siginfo_to_user saw a value of __SI_KILL and now sees a value of SIL_KILL with the result that uid and pid fields are copied and which might copying the si_addr field by accident but certainly not by design. Making this a very flakey implementation. Utilizing FPE_FIXME, BUS_FIXME, TRAP_FIXME siginfo_layout will now return SIL_FAULT and the appropriate fields will be reliably copied. But folks this is a new and unique kind of bad. This is massively untested code bad. This is inventing new and unique was to get siginfo wrong bad. This is don't even think about Posix or what siginfo means bad. This is lots of eyeballs all missing the fact that the code does the wrong thing bad. This is getting stuck and keep making the same mistake bad. I really hope we can find a non userspace breaking fix for this on a port as new as arm64. Possible ABI fixes include: - Send the signal without siginfo - Don't generate a signal - Possibly assign and use an appropriate si_code - Don't handle cases which can't happen Cc: Catalin Marinas Cc: Will Deacon Cc: Tyler Baicar Cc: James Morse Cc: Tony Lindgren Cc: Nicolas Pitre Cc: Olof Johansson Cc: Santosh Shilimkar Cc: Arnd Bergmann Cc: linux-arm-kernel@lists.infradead.org Ref: 53631b54c870 ("arm64: Floating point and SIMD") Ref: 32015c235603 ("arm64: exception: handle Synchronous External Abort") Ref: 1d18c47c735e ("arm64: MMU fault handling and page table management") Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index c894162ec96c..fd182a845490 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2711,6 +2711,10 @@ enum siginfo_layout siginfo_layout(int sig, int si_code) #ifdef FPE_FIXME if ((sig == SIGFPE) && (si_code == FPE_FIXME)) layout = SIL_FAULT; +#endif +#ifdef BUS_FIXME + if ((sig == SIGBUS) && (si_code == BUS_FIXME)) + layout = SIL_FAULT; #endif } return layout; -- cgit v1.2.2 From faf1f22b61f2715224ba9b579e2a983e99b86823 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 5 Jan 2018 17:27:42 -0600 Subject: signal: Ensure generic siginfos the kernel sends have all bits initialized Call clear_siginfo to ensure stack allocated siginfos are fully initialized before being passed to the signal sending functions. This ensures that if there is the kind of confusion documented by TRAP_FIXME, FPE_FIXME, or BUS_FIXME the kernel won't send unitialized data to userspace when the kernel generates a signal with SI_USER but the copy to userspace assumes it is a different kind of signal, and different fields are initialized. This also prepares the way for turning copy_siginfo_to_user into a copy_to_user, by removing the need in many cases to perform a field by field copy simply to skip the uninitialized fields. Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index fd182a845490..241d54958bbb 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -549,6 +549,7 @@ still_pending: * a fast-pathed signal or we must have been * out of queue space. So zero out the info. */ + clear_siginfo(info); info->si_signo = sig; info->si_errno = 0; info->si_code = SI_USER; @@ -1043,6 +1044,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, list_add_tail(&q->list, &pending->list); switch ((unsigned long) info) { case (unsigned long) SEND_SIG_NOINFO: + clear_siginfo(&q->info); q->info.si_signo = sig; q->info.si_errno = 0; q->info.si_code = SI_USER; @@ -1051,6 +1053,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, q->info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); break; case (unsigned long) SEND_SIG_PRIV: + clear_siginfo(&q->info); q->info.si_signo = sig; q->info.si_errno = 0; q->info.si_code = SI_KERNEL; @@ -1623,6 +1626,7 @@ bool do_notify_parent(struct task_struct *tsk, int sig) sig = SIGCHLD; } + clear_siginfo(&info); info.si_signo = sig; info.si_errno = 0; /* @@ -1717,6 +1721,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, parent = tsk->real_parent; } + clear_siginfo(&info); info.si_signo = SIGCHLD; info.si_errno = 0; /* @@ -1929,7 +1934,7 @@ static void ptrace_do_notify(int signr, int exit_code, int why) { siginfo_t info; - memset(&info, 0, sizeof info); + clear_siginfo(&info); info.si_signo = signr; info.si_code = exit_code; info.si_pid = task_pid_vnr(current); @@ -2136,6 +2141,7 @@ static int ptrace_signal(int signr, siginfo_t *info) * have updated *info via PTRACE_SETSIGINFO. */ if (signr != info->si_signo) { + clear_siginfo(info); info->si_signo = signr; info->si_errno = 0; info->si_code = SI_USER; @@ -2941,6 +2947,7 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) { struct siginfo info; + clear_siginfo(&info); info.si_signo = sig; info.si_errno = 0; info.si_code = SI_USER; -- cgit v1.2.2 From aba1be2f8c26ea322dd9c43047c1aff90528f032 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 19 Jul 2017 21:23:15 -0500 Subject: signal: Ensure no siginfo union member increases the size of struct siginfo Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 241d54958bbb..b9e5d825ee46 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3688,6 +3688,7 @@ void __init signals_init(void) /* If this check fails, the __ARCH_SI_PREAMBLE_SIZE value is wrong! */ BUILD_BUG_ON(__ARCH_SI_PREAMBLE_SIZE != offsetof(struct siginfo, _sifields._pad)); + BUILD_BUG_ON(sizeof(struct siginfo) != SI_MAX_SIZE); sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC); } -- cgit v1.2.2 From 9943d3accb4ec5a85e72c65a9959257fa7d12500 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 24 Jul 2017 14:53:03 -0500 Subject: signal: Clear si_sys_private before copying siginfo to userspace In preparation for unconditionally copying the whole of siginfo to userspace clear si_sys_private. So this kernel internal value is guaranteed not to make it to userspace. Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index b9e5d825ee46..18aa55c1bb4f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -643,6 +643,9 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) spin_unlock(&tsk->sighand->siglock); posixtimer_rearm(info); spin_lock(&tsk->sighand->siglock); + + /* Don't expose the si_sys_private value to userspace */ + info->si_sys_private = 0; } #endif return signr; -- cgit v1.2.2 From 30073566ca64cbc005f4fbcc21f0af7db83940a2 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 31 Jul 2017 15:47:40 -0500 Subject: signal/ia64: switch the last arch-specific copy_siginfo_to_user() to generic version Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 18aa55c1bb4f..62c642899290 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2729,8 +2729,6 @@ enum siginfo_layout siginfo_layout(int sig, int si_code) return layout; } -#ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER - int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) { int err; @@ -2769,6 +2767,11 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) #ifdef __ARCH_SI_TRAPNO err |= __put_user(from->si_trapno, &to->si_trapno); #endif +#ifdef __ia64__ + err |= __put_user(from->si_imm, &to->si_imm); + err |= __put_user(from->si_flags, &to->si_flags); + err |= __put_user(from->si_isr, &to->si_isr); +#endif #ifdef BUS_MCEERR_AO /* * Other callers might not initialize the si_lsb field, @@ -2812,8 +2815,6 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) return err; } -#endif - /** * do_sigtimedwait - wait for queued signals specified in @which * @which: queued signals to wait for -- cgit v1.2.2 From 0326e7ef057d214ed43a77557078c24e98b84af9 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 27 Jul 2017 11:59:46 -0500 Subject: signal: Remove unnecessary ifdefs now that there is only one struct siginfo Remove HAVE_ARCH_SIGINFO_T Remove __ARCH_SIGSYS Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 62c642899290..47c87b1d0b8a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2697,9 +2697,7 @@ enum siginfo_layout siginfo_layout(int sig, int si_code) #endif [SIGCHLD] = { NSIGCHLD, SIL_CHLD }, [SIGPOLL] = { NSIGPOLL, SIL_POLL }, -#ifdef __ARCH_SIGSYS [SIGSYS] = { NSIGSYS, SIL_SYS }, -#endif }; if ((sig < ARRAY_SIZE(filter)) && (si_code <= filter[sig].limit)) layout = filter[sig].layout; @@ -2804,13 +2802,11 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) err |= __put_user(from->si_uid, &to->si_uid); err |= __put_user(from->si_ptr, &to->si_ptr); break; -#ifdef __ARCH_SIGSYS case SIL_SYS: err |= __put_user(from->si_call_addr, &to->si_call_addr); err |= __put_user(from->si_syscall, &to->si_syscall); err |= __put_user(from->si_arch, &to->si_arch); break; -#endif } return err; } -- cgit v1.2.2 From b4da3340eae2c3932144be3e81ccfd4e424d87b7 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 13 Jan 2018 02:54:04 +0900 Subject: tracing/kprobe: bpf: Check error injectable event is on function entry Check whether error injectable event is on function entry or not. Currently it checks the event is ftrace-based kprobes or not, but that is wrong. It should check if the event is on the entry of target function. Since error injection will override a function to just return with modified return value, that operation must be done before the target function starts making stackframe. As a side effect, bpf error injection is no need to depend on function-tracer. It can work with sw-breakpoint based kprobe events too. Signed-off-by: Masami Hiramatsu Reviewed-by: Josef Bacik Signed-off-by: Alexei Starovoitov --- kernel/trace/Kconfig | 2 -- kernel/trace/bpf_trace.c | 8 ++++---- kernel/trace/trace_kprobe.c | 9 ++++++--- kernel/trace/trace_probe.h | 12 ++++++------ 4 files changed, 16 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index ae3a2d519e50..6400e1bf97c5 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -533,9 +533,7 @@ config FUNCTION_PROFILER config BPF_KPROBE_OVERRIDE bool "Enable BPF programs to override a kprobed function" depends on BPF_EVENTS - depends on KPROBES_ON_FTRACE depends on HAVE_KPROBE_OVERRIDE - depends on DYNAMIC_FTRACE_WITH_REGS default n help Allows BPF to override the execution of a probed function and diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index f6d2327ecb59..1966ad3bf3e0 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -85,7 +85,7 @@ BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc) { __this_cpu_write(bpf_kprobe_override, 1); regs_set_return_value(regs, rc); - arch_ftrace_kprobe_override_function(regs); + arch_kprobe_override_function(regs); return 0; } @@ -800,11 +800,11 @@ int perf_event_attach_bpf_prog(struct perf_event *event, int ret = -EEXIST; /* - * Kprobe override only works for ftrace based kprobes, and only if they - * are on the opt-in list. + * Kprobe override only works if they are on the function entry, + * and only if they are on the opt-in list. */ if (prog->kprobe_override && - (!trace_kprobe_ftrace(event->tp_event) || + (!trace_kprobe_on_func_entry(event->tp_event) || !trace_kprobe_error_injectable(event->tp_event))) return -EINVAL; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 91f4b57dab82..3c8deb977a8b 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -88,13 +88,16 @@ static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk) return nhit; } -int trace_kprobe_ftrace(struct trace_event_call *call) +bool trace_kprobe_on_func_entry(struct trace_event_call *call) { struct trace_kprobe *tk = (struct trace_kprobe *)call->data; - return kprobe_ftrace(&tk->rp.kp); + + return kprobe_on_func_entry(tk->rp.kp.addr, + tk->rp.kp.addr ? NULL : tk->rp.kp.symbol_name, + tk->rp.kp.addr ? 0 : tk->rp.kp.offset); } -int trace_kprobe_error_injectable(struct trace_event_call *call) +bool trace_kprobe_error_injectable(struct trace_event_call *call) { struct trace_kprobe *tk = (struct trace_kprobe *)call->data; unsigned long addr; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 5e54d748c84c..e101c5bb9eda 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -252,8 +252,8 @@ struct symbol_cache; unsigned long update_symbol_cache(struct symbol_cache *sc); void free_symbol_cache(struct symbol_cache *sc); struct symbol_cache *alloc_symbol_cache(const char *sym, long offset); -int trace_kprobe_ftrace(struct trace_event_call *call); -int trace_kprobe_error_injectable(struct trace_event_call *call); +bool trace_kprobe_on_func_entry(struct trace_event_call *call); +bool trace_kprobe_error_injectable(struct trace_event_call *call); #else /* uprobes do not support symbol fetch methods */ #define fetch_symbol_u8 NULL @@ -280,14 +280,14 @@ alloc_symbol_cache(const char *sym, long offset) return NULL; } -static inline int trace_kprobe_ftrace(struct trace_event_call *call) +static inline bool trace_kprobe_on_func_entry(struct trace_event_call *call) { - return 0; + return false; } -static inline int trace_kprobe_error_injectable(struct trace_event_call *call) +static inline bool trace_kprobe_error_injectable(struct trace_event_call *call) { - return 0; + return false; } #endif /* CONFIG_KPROBE_EVENTS */ -- cgit v1.2.2 From 66665ad2f1023d3ffb0c12eea9e0a6d0b613ecb3 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 13 Jan 2018 02:54:33 +0900 Subject: tracing/kprobe: bpf: Compare instruction pointer with original one Compare instruction pointer with original one on the stack instead using per-cpu bpf_kprobe_override flag. This patch also consolidates reset_current_kprobe() and preempt_enable_no_resched() blocks. Those can be done in one place. Signed-off-by: Masami Hiramatsu Reviewed-by: Josef Bacik Signed-off-by: Alexei Starovoitov --- kernel/trace/bpf_trace.c | 1 - kernel/trace/trace_kprobe.c | 21 +++++++-------------- 2 files changed, 7 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 1966ad3bf3e0..24ed6363e00f 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -83,7 +83,6 @@ EXPORT_SYMBOL_GPL(trace_call_bpf); #ifdef CONFIG_BPF_KPROBE_OVERRIDE BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc) { - __this_cpu_write(bpf_kprobe_override, 1); regs_set_return_value(regs, rc); arch_kprobe_override_function(regs); return 0; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 3c8deb977a8b..b8c90441bc87 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -42,8 +42,6 @@ struct trace_kprobe { (offsetof(struct trace_kprobe, tp.args) + \ (sizeof(struct probe_arg) * (n))) -DEFINE_PER_CPU(int, bpf_kprobe_override); - static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk) { return tk->rp.handler != NULL; @@ -1205,6 +1203,7 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) int rctx; if (bpf_prog_array_valid(call)) { + unsigned long orig_ip = instruction_pointer(regs); int ret; ret = trace_call_bpf(call, regs); @@ -1212,12 +1211,13 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) /* * We need to check and see if we modified the pc of the * pt_regs, and if so clear the kprobe and return 1 so that we - * don't do the instruction skipping. Also reset our state so - * we are clean the next pass through. + * don't do the single stepping. + * The ftrace kprobe handler leaves it up to us to re-enable + * preemption here before returning if we've modified the ip. */ - if (__this_cpu_read(bpf_kprobe_override)) { - __this_cpu_write(bpf_kprobe_override, 0); + if (orig_ip != instruction_pointer(regs)) { reset_current_kprobe(); + preempt_enable_no_resched(); return 1; } if (!ret) @@ -1325,15 +1325,8 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) if (tk->tp.flags & TP_FLAG_TRACE) kprobe_trace_func(tk, regs); #ifdef CONFIG_PERF_EVENTS - if (tk->tp.flags & TP_FLAG_PROFILE) { + if (tk->tp.flags & TP_FLAG_PROFILE) ret = kprobe_perf_func(tk, regs); - /* - * The ftrace kprobe handler leaves it up to us to re-enable - * preemption here before returning if we've modified the ip. - */ - if (ret) - preempt_enable_no_resched(); - } #endif return ret; } -- cgit v1.2.2 From 540adea3809f61115d2a1ea4ed6e627613452ba1 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 13 Jan 2018 02:55:03 +0900 Subject: error-injection: Separate error-injection from kprobe Since error-injection framework is not limited to be used by kprobes, nor bpf. Other kernel subsystems can use it freely for checking safeness of error-injection, e.g. livepatch, ftrace etc. So this separate error-injection framework from kprobes. Some differences has been made: - "kprobe" word is removed from any APIs/structures. - BPF_ALLOW_ERROR_INJECTION() is renamed to ALLOW_ERROR_INJECTION() since it is not limited for BPF too. - CONFIG_FUNCTION_ERROR_INJECTION is the config item of this feature. It is automatically enabled if the arch supports error injection feature for kprobe or ftrace etc. Signed-off-by: Masami Hiramatsu Reviewed-by: Josef Bacik Signed-off-by: Alexei Starovoitov --- kernel/kprobes.c | 163 -------------------------------------------- kernel/module.c | 8 +-- kernel/trace/Kconfig | 2 +- kernel/trace/bpf_trace.c | 4 +- kernel/trace/trace_kprobe.c | 3 +- 5 files changed, 9 insertions(+), 171 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index b4aab48ad258..da2ccf142358 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -83,16 +83,6 @@ static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) return &(kretprobe_table_locks[hash].lock); } -/* List of symbols that can be overriden for error injection. */ -static LIST_HEAD(kprobe_error_injection_list); -static DEFINE_MUTEX(kprobe_ei_mutex); -struct kprobe_ei_entry { - struct list_head list; - unsigned long start_addr; - unsigned long end_addr; - void *priv; -}; - /* Blacklist -- list of struct kprobe_blacklist_entry */ static LIST_HEAD(kprobe_blacklist); @@ -1404,17 +1394,6 @@ bool within_kprobe_blacklist(unsigned long addr) return false; } -bool within_kprobe_error_injection_list(unsigned long addr) -{ - struct kprobe_ei_entry *ent; - - list_for_each_entry(ent, &kprobe_error_injection_list, list) { - if (addr >= ent->start_addr && addr < ent->end_addr) - return true; - } - return false; -} - /* * If we have a symbol_name argument, look it up and add the offset field * to it. This way, we can specify a relative address to a symbol. @@ -2189,86 +2168,6 @@ static int __init populate_kprobe_blacklist(unsigned long *start, return 0; } -#ifdef CONFIG_BPF_KPROBE_OVERRIDE -/* Markers of the _kprobe_error_inject_list section */ -extern unsigned long __start_kprobe_error_inject_list[]; -extern unsigned long __stop_kprobe_error_inject_list[]; - -/* - * Lookup and populate the kprobe_error_injection_list. - * - * For safety reasons we only allow certain functions to be overriden with - * bpf_error_injection, so we need to populate the list of the symbols that have - * been marked as safe for overriding. - */ -static void populate_kprobe_error_injection_list(unsigned long *start, - unsigned long *end, - void *priv) -{ - unsigned long *iter; - struct kprobe_ei_entry *ent; - unsigned long entry, offset = 0, size = 0; - - mutex_lock(&kprobe_ei_mutex); - for (iter = start; iter < end; iter++) { - entry = arch_deref_entry_point((void *)*iter); - - if (!kernel_text_address(entry) || - !kallsyms_lookup_size_offset(entry, &size, &offset)) { - pr_err("Failed to find error inject entry at %p\n", - (void *)entry); - continue; - } - - ent = kmalloc(sizeof(*ent), GFP_KERNEL); - if (!ent) - break; - ent->start_addr = entry; - ent->end_addr = entry + size; - ent->priv = priv; - INIT_LIST_HEAD(&ent->list); - list_add_tail(&ent->list, &kprobe_error_injection_list); - } - mutex_unlock(&kprobe_ei_mutex); -} - -static void __init populate_kernel_kprobe_ei_list(void) -{ - populate_kprobe_error_injection_list(__start_kprobe_error_inject_list, - __stop_kprobe_error_inject_list, - NULL); -} - -static void module_load_kprobe_ei_list(struct module *mod) -{ - if (!mod->num_kprobe_ei_funcs) - return; - populate_kprobe_error_injection_list(mod->kprobe_ei_funcs, - mod->kprobe_ei_funcs + - mod->num_kprobe_ei_funcs, mod); -} - -static void module_unload_kprobe_ei_list(struct module *mod) -{ - struct kprobe_ei_entry *ent, *n; - if (!mod->num_kprobe_ei_funcs) - return; - - mutex_lock(&kprobe_ei_mutex); - list_for_each_entry_safe(ent, n, &kprobe_error_injection_list, list) { - if (ent->priv == mod) { - list_del_init(&ent->list); - kfree(ent); - } - } - mutex_unlock(&kprobe_ei_mutex); -} -#else -static inline void __init populate_kernel_kprobe_ei_list(void) {} -static inline void module_load_kprobe_ei_list(struct module *m) {} -static inline void module_unload_kprobe_ei_list(struct module *m) {} -#endif - /* Module notifier call back, checking kprobes on the module */ static int kprobes_module_callback(struct notifier_block *nb, unsigned long val, void *data) @@ -2279,11 +2178,6 @@ static int kprobes_module_callback(struct notifier_block *nb, unsigned int i; int checkcore = (val == MODULE_STATE_GOING); - if (val == MODULE_STATE_COMING) - module_load_kprobe_ei_list(mod); - else if (val == MODULE_STATE_GOING) - module_unload_kprobe_ei_list(mod); - if (val != MODULE_STATE_GOING && val != MODULE_STATE_LIVE) return NOTIFY_DONE; @@ -2346,8 +2240,6 @@ static int __init init_kprobes(void) pr_err("Please take care of using kprobes.\n"); } - populate_kernel_kprobe_ei_list(); - if (kretprobe_blacklist_size) { /* lookup the function address from its name */ for (i = 0; kretprobe_blacklist[i].name != NULL; i++) { @@ -2515,56 +2407,6 @@ static const struct file_operations debugfs_kprobe_blacklist_ops = { .release = seq_release, }; -/* - * kprobes/error_injection_list -- shows which functions can be overriden for - * error injection. - * */ -static void *kprobe_ei_seq_start(struct seq_file *m, loff_t *pos) -{ - mutex_lock(&kprobe_ei_mutex); - return seq_list_start(&kprobe_error_injection_list, *pos); -} - -static void kprobe_ei_seq_stop(struct seq_file *m, void *v) -{ - mutex_unlock(&kprobe_ei_mutex); -} - -static void *kprobe_ei_seq_next(struct seq_file *m, void *v, loff_t *pos) -{ - return seq_list_next(v, &kprobe_error_injection_list, pos); -} - -static int kprobe_ei_seq_show(struct seq_file *m, void *v) -{ - char buffer[KSYM_SYMBOL_LEN]; - struct kprobe_ei_entry *ent = - list_entry(v, struct kprobe_ei_entry, list); - - sprint_symbol(buffer, ent->start_addr); - seq_printf(m, "%s\n", buffer); - return 0; -} - -static const struct seq_operations kprobe_ei_seq_ops = { - .start = kprobe_ei_seq_start, - .next = kprobe_ei_seq_next, - .stop = kprobe_ei_seq_stop, - .show = kprobe_ei_seq_show, -}; - -static int kprobe_ei_open(struct inode *inode, struct file *filp) -{ - return seq_open(filp, &kprobe_ei_seq_ops); -} - -static const struct file_operations debugfs_kprobe_ei_ops = { - .open = kprobe_ei_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - static void arm_all_kprobes(void) { struct hlist_head *head; @@ -2706,11 +2548,6 @@ static int __init debugfs_kprobe_init(void) if (!file) goto error; - file = debugfs_create_file("error_injection_list", 0444, dir, NULL, - &debugfs_kprobe_ei_ops); - if (!file) - goto error; - return 0; error: diff --git a/kernel/module.c b/kernel/module.c index bd695bfdc5c4..601494d4b7ea 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3118,10 +3118,10 @@ static int find_module_sections(struct module *mod, struct load_info *info) sizeof(*mod->ftrace_callsites), &mod->num_ftrace_callsites); #endif -#ifdef CONFIG_BPF_KPROBE_OVERRIDE - mod->kprobe_ei_funcs = section_objs(info, "_kprobe_error_inject_list", - sizeof(*mod->kprobe_ei_funcs), - &mod->num_kprobe_ei_funcs); +#ifdef CONFIG_FUNCTION_ERROR_INJECTION + mod->ei_funcs = section_objs(info, "_error_injection_whitelist", + sizeof(*mod->ei_funcs), + &mod->num_ei_funcs); #endif mod->extable = section_objs(info, "__ex_table", sizeof(*mod->extable), &mod->num_exentries); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 6400e1bf97c5..7114c885a78a 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -533,7 +533,7 @@ config FUNCTION_PROFILER config BPF_KPROBE_OVERRIDE bool "Enable BPF programs to override a kprobed function" depends on BPF_EVENTS - depends on HAVE_KPROBE_OVERRIDE + depends on FUNCTION_ERROR_INJECTION default n help Allows BPF to override the execution of a probed function and diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 24ed6363e00f..f274468cbc45 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include "trace_probe.h" #include "trace.h" @@ -84,7 +84,7 @@ EXPORT_SYMBOL_GPL(trace_call_bpf); BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc) { regs_set_return_value(regs, rc); - arch_kprobe_override_function(regs); + override_function_with_return(regs); return 0; } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index b8c90441bc87..1fad24acd444 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "trace_probe.h" @@ -107,7 +108,7 @@ bool trace_kprobe_error_injectable(struct trace_event_call *call) } else { addr = (unsigned long)tk->rp.kp.addr; } - return within_kprobe_error_injection_list(addr); + return within_error_injection_list(addr); } static int register_kprobe_event(struct trace_kprobe *tk); -- cgit v1.2.2 From 4b1a29a7f5425d32640b34b8a755f34e02f64d0f Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 13 Jan 2018 02:56:03 +0900 Subject: error-injection: Support fault injection framework Support in-kernel fault-injection framework via debugfs. This allows you to inject a conditional error to specified function using debugfs interfaces. Here is the result of test script described in Documentation/fault-injection/fault-injection.txt =========== # ./test_fail_function.sh 1+0 records in 1+0 records out 1048576 bytes (1.0 MB, 1.0 MiB) copied, 0.0227404 s, 46.1 MB/s btrfs-progs v4.4 See http://btrfs.wiki.kernel.org for more information. Label: (null) UUID: bfa96010-12e9-4360-aed0-42eec7af5798 Node size: 16384 Sector size: 4096 Filesystem size: 1001.00MiB Block group profiles: Data: single 8.00MiB Metadata: DUP 58.00MiB System: DUP 12.00MiB SSD detected: no Incompat features: extref, skinny-metadata Number of devices: 1 Devices: ID SIZE PATH 1 1001.00MiB /dev/loop2 mount: mount /dev/loop2 on /opt/tmpmnt failed: Cannot allocate memory SUCCESS! =========== Signed-off-by: Masami Hiramatsu Reviewed-by: Josef Bacik Signed-off-by: Alexei Starovoitov --- kernel/Makefile | 1 + kernel/fail_function.c | 349 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 350 insertions(+) create mode 100644 kernel/fail_function.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 172d151d429c..f85ae5dfa474 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -81,6 +81,7 @@ obj-$(CONFIG_AUDIT_TREE) += audit_tree.o obj-$(CONFIG_GCOV_KERNEL) += gcov/ obj-$(CONFIG_KCOV) += kcov.o obj-$(CONFIG_KPROBES) += kprobes.o +obj-$(CONFIG_FAIL_FUNCTION) += fail_function.o obj-$(CONFIG_KGDB) += debug/ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o diff --git a/kernel/fail_function.c b/kernel/fail_function.c new file mode 100644 index 000000000000..21b0122cb39c --- /dev/null +++ b/kernel/fail_function.c @@ -0,0 +1,349 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fail_function.c: Function-based error injection + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int fei_kprobe_handler(struct kprobe *kp, struct pt_regs *regs); + +struct fei_attr { + struct list_head list; + struct kprobe kp; + unsigned long retval; +}; +static DEFINE_MUTEX(fei_lock); +static LIST_HEAD(fei_attr_list); +static DECLARE_FAULT_ATTR(fei_fault_attr); +static struct dentry *fei_debugfs_dir; + +static unsigned long adjust_error_retval(unsigned long addr, unsigned long retv) +{ + switch (get_injectable_error_type(addr)) { + case EI_ETYPE_NULL: + if (retv != 0) + return 0; + break; + case EI_ETYPE_ERRNO: + if (retv < (unsigned long)-MAX_ERRNO) + return (unsigned long)-EINVAL; + break; + case EI_ETYPE_ERRNO_NULL: + if (retv != 0 && retv < (unsigned long)-MAX_ERRNO) + return (unsigned long)-EINVAL; + break; + } + + return retv; +} + +static struct fei_attr *fei_attr_new(const char *sym, unsigned long addr) +{ + struct fei_attr *attr; + + attr = kzalloc(sizeof(*attr), GFP_KERNEL); + if (attr) { + attr->kp.symbol_name = kstrdup(sym, GFP_KERNEL); + if (!attr->kp.symbol_name) { + kfree(attr); + return NULL; + } + attr->kp.pre_handler = fei_kprobe_handler; + attr->retval = adjust_error_retval(addr, 0); + INIT_LIST_HEAD(&attr->list); + } + return attr; +} + +static void fei_attr_free(struct fei_attr *attr) +{ + if (attr) { + kfree(attr->kp.symbol_name); + kfree(attr); + } +} + +static struct fei_attr *fei_attr_lookup(const char *sym) +{ + struct fei_attr *attr; + + list_for_each_entry(attr, &fei_attr_list, list) { + if (!strcmp(attr->kp.symbol_name, sym)) + return attr; + } + + return NULL; +} + +static bool fei_attr_is_valid(struct fei_attr *_attr) +{ + struct fei_attr *attr; + + list_for_each_entry(attr, &fei_attr_list, list) { + if (attr == _attr) + return true; + } + + return false; +} + +static int fei_retval_set(void *data, u64 val) +{ + struct fei_attr *attr = data; + unsigned long retv = (unsigned long)val; + int err = 0; + + mutex_lock(&fei_lock); + /* + * Since this operation can be done after retval file is removed, + * It is safer to check the attr is still valid before accessing + * its member. + */ + if (!fei_attr_is_valid(attr)) { + err = -ENOENT; + goto out; + } + + if (attr->kp.addr) { + if (adjust_error_retval((unsigned long)attr->kp.addr, + val) != retv) + err = -EINVAL; + } + if (!err) + attr->retval = val; +out: + mutex_unlock(&fei_lock); + + return err; +} + +static int fei_retval_get(void *data, u64 *val) +{ + struct fei_attr *attr = data; + int err = 0; + + mutex_lock(&fei_lock); + /* Here we also validate @attr to ensure it still exists. */ + if (!fei_attr_is_valid(attr)) + err = -ENOENT; + else + *val = attr->retval; + mutex_unlock(&fei_lock); + + return err; +} +DEFINE_DEBUGFS_ATTRIBUTE(fei_retval_ops, fei_retval_get, fei_retval_set, + "%llx\n"); + +static int fei_debugfs_add_attr(struct fei_attr *attr) +{ + struct dentry *dir; + + dir = debugfs_create_dir(attr->kp.symbol_name, fei_debugfs_dir); + if (!dir) + return -ENOMEM; + + if (!debugfs_create_file("retval", 0600, dir, attr, &fei_retval_ops)) { + debugfs_remove_recursive(dir); + return -ENOMEM; + } + + return 0; +} + +static void fei_debugfs_remove_attr(struct fei_attr *attr) +{ + struct dentry *dir; + + dir = debugfs_lookup(attr->kp.symbol_name, fei_debugfs_dir); + if (dir) + debugfs_remove_recursive(dir); +} + +static int fei_kprobe_handler(struct kprobe *kp, struct pt_regs *regs) +{ + struct fei_attr *attr = container_of(kp, struct fei_attr, kp); + + if (should_fail(&fei_fault_attr, 1)) { + regs_set_return_value(regs, attr->retval); + override_function_with_return(regs); + /* Kprobe specific fixup */ + reset_current_kprobe(); + preempt_enable_no_resched(); + return 1; + } + + return 0; +} +NOKPROBE_SYMBOL(fei_kprobe_handler) + +static void *fei_seq_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&fei_lock); + return seq_list_start(&fei_attr_list, *pos); +} + +static void fei_seq_stop(struct seq_file *m, void *v) +{ + mutex_unlock(&fei_lock); +} + +static void *fei_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + return seq_list_next(v, &fei_attr_list, pos); +} + +static int fei_seq_show(struct seq_file *m, void *v) +{ + struct fei_attr *attr = list_entry(v, struct fei_attr, list); + + seq_printf(m, "%pf\n", attr->kp.addr); + return 0; +} + +static const struct seq_operations fei_seq_ops = { + .start = fei_seq_start, + .next = fei_seq_next, + .stop = fei_seq_stop, + .show = fei_seq_show, +}; + +static int fei_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &fei_seq_ops); +} + +static void fei_attr_remove(struct fei_attr *attr) +{ + fei_debugfs_remove_attr(attr); + unregister_kprobe(&attr->kp); + list_del(&attr->list); + fei_attr_free(attr); +} + +static void fei_attr_remove_all(void) +{ + struct fei_attr *attr, *n; + + list_for_each_entry_safe(attr, n, &fei_attr_list, list) { + fei_attr_remove(attr); + } +} + +static ssize_t fei_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + struct fei_attr *attr; + unsigned long addr; + char *buf, *sym; + int ret; + + /* cut off if it is too long */ + if (count > KSYM_NAME_LEN) + count = KSYM_NAME_LEN; + buf = kmalloc(sizeof(char) * (count + 1), GFP_KERNEL); + if (!buf) + return -ENOMEM; + + if (copy_from_user(buf, buffer, count)) { + ret = -EFAULT; + goto out; + } + buf[count] = '\0'; + sym = strstrip(buf); + + mutex_lock(&fei_lock); + + /* Writing just spaces will remove all injection points */ + if (sym[0] == '\0') { + fei_attr_remove_all(); + ret = count; + goto out; + } + /* Writing !function will remove one injection point */ + if (sym[0] == '!') { + attr = fei_attr_lookup(sym + 1); + if (!attr) { + ret = -ENOENT; + goto out; + } + fei_attr_remove(attr); + ret = count; + goto out; + } + + addr = kallsyms_lookup_name(sym); + if (!addr) { + ret = -EINVAL; + goto out; + } + if (!within_error_injection_list(addr)) { + ret = -ERANGE; + goto out; + } + if (fei_attr_lookup(sym)) { + ret = -EBUSY; + goto out; + } + attr = fei_attr_new(sym, addr); + if (!attr) { + ret = -ENOMEM; + goto out; + } + + ret = register_kprobe(&attr->kp); + if (!ret) + ret = fei_debugfs_add_attr(attr); + if (ret < 0) + fei_attr_remove(attr); + else { + list_add_tail(&attr->list, &fei_attr_list); + ret = count; + } +out: + kfree(buf); + mutex_unlock(&fei_lock); + return ret; +} + +static const struct file_operations fei_ops = { + .open = fei_open, + .read = seq_read, + .write = fei_write, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init fei_debugfs_init(void) +{ + struct dentry *dir; + + dir = fault_create_debugfs_attr("fail_function", NULL, + &fei_fault_attr); + if (IS_ERR(dir)) + return PTR_ERR(dir); + + /* injectable attribute is just a symlink of error_inject/list */ + if (!debugfs_create_symlink("injectable", dir, + "../error_injection/list")) + goto error; + + if (!debugfs_create_file("inject", 0600, dir, NULL, &fei_ops)) + goto error; + + fei_debugfs_dir = dir; + + return 0; +error: + debugfs_remove_recursive(dir); + return -ENOMEM; +} + +late_initcall(fei_debugfs_init); -- cgit v1.2.2 From 1110f3a9bcf394c06b81a98206aee9b6860653c8 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 11 Jan 2018 20:29:03 -0800 Subject: bpf: add map_alloc_check callback .map_alloc callbacks contain a number of checks validating user- -provided map attributes against constraints of a particular map type. For offloaded maps we will need to check map attributes without actually allocating any memory on the host. Add a new callback for validating attributes before any memory is allocated. This callback can be selectively implemented by map types for sharing code with offloads, or simply to separate the logical steps of validation and allocation. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- kernel/bpf/syscall.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2bac0dc8baba..c0ac03a04880 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -96,16 +96,25 @@ static int check_uarg_tail_zero(void __user *uaddr, static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) { + const struct bpf_map_ops *ops; struct bpf_map *map; + int err; - if (attr->map_type >= ARRAY_SIZE(bpf_map_types) || - !bpf_map_types[attr->map_type]) + if (attr->map_type >= ARRAY_SIZE(bpf_map_types)) + return ERR_PTR(-EINVAL); + ops = bpf_map_types[attr->map_type]; + if (!ops) return ERR_PTR(-EINVAL); - map = bpf_map_types[attr->map_type]->map_alloc(attr); + if (ops->map_alloc_check) { + err = ops->map_alloc_check(attr); + if (err) + return ERR_PTR(err); + } + map = ops->map_alloc(attr); if (IS_ERR(map)) return map; - map->ops = bpf_map_types[attr->map_type]; + map->ops = ops; map->map_type = attr->map_type; return map; } -- cgit v1.2.2 From daffc5a2e6f4bf4f99b00e183117920e321b6763 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 11 Jan 2018 20:29:04 -0800 Subject: bpf: hashtab: move attribute validation before allocation Number of attribute checks are currently performed after hashtab is already allocated. Move them to be able to split them out to the check function later on. Checks have to now be performed on the attr union directly instead of the members of bpf_map, since bpf_map will be allocated later. No functional changes. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- kernel/bpf/hashtab.c | 47 +++++++++++++++++++++++------------------------ 1 file changed, 23 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 3905d4bc5b80..b80f42adf068 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -269,6 +269,28 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru)) return ERR_PTR(-EINVAL); + /* check sanity of attributes. + * value_size == 0 may be allowed in the future to use map as a set + */ + if (attr->max_entries == 0 || attr->key_size == 0 || + attr->value_size == 0) + return ERR_PTR(-EINVAL); + + if (attr->key_size > MAX_BPF_STACK) + /* eBPF programs initialize keys on stack, so they cannot be + * larger than max stack size + */ + return ERR_PTR(-E2BIG); + + if (attr->value_size >= KMALLOC_MAX_SIZE - + MAX_BPF_STACK - sizeof(struct htab_elem)) + /* if value_size is bigger, the user space won't be able to + * access the elements via bpf syscall. This check also makes + * sure that the elem_size doesn't overflow and it's + * kmalloc-able later in htab_map_update_elem() + */ + return ERR_PTR(-E2BIG); + htab = kzalloc(sizeof(*htab), GFP_USER); if (!htab) return ERR_PTR(-ENOMEM); @@ -281,14 +303,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) htab->map.map_flags = attr->map_flags; htab->map.numa_node = numa_node; - /* check sanity of attributes. - * value_size == 0 may be allowed in the future to use map as a set - */ - err = -EINVAL; - if (htab->map.max_entries == 0 || htab->map.key_size == 0 || - htab->map.value_size == 0) - goto free_htab; - if (percpu_lru) { /* ensure each CPU's lru list has >=1 elements. * since we are at it, make each lru list has the same @@ -304,22 +318,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) /* hash table size must be power of 2 */ htab->n_buckets = roundup_pow_of_two(htab->map.max_entries); - err = -E2BIG; - if (htab->map.key_size > MAX_BPF_STACK) - /* eBPF programs initialize keys on stack, so they cannot be - * larger than max stack size - */ - goto free_htab; - - if (htab->map.value_size >= KMALLOC_MAX_SIZE - - MAX_BPF_STACK - sizeof(struct htab_elem)) - /* if value_size is bigger, the user space won't be able to - * access the elements via bpf syscall. This check also makes - * sure that the elem_size doesn't overflow and it's - * kmalloc-able later in htab_map_update_elem() - */ - goto free_htab; - htab->elem_size = sizeof(struct htab_elem) + round_up(htab->map.key_size, 8); if (percpu) @@ -327,6 +325,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) else htab->elem_size += round_up(htab->map.value_size, 8); + err = -E2BIG; /* prevent zero size kmalloc and check for u32 overflow */ if (htab->n_buckets == 0 || htab->n_buckets > U32_MAX / sizeof(struct bucket)) -- cgit v1.2.2 From 9328e0d1bc09e96bd7dc85374f5c2a1e0e04e539 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 11 Jan 2018 20:29:05 -0800 Subject: bpf: hashtab: move checks out of alloc function Use the new callback to perform allocation checks for hash maps. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- kernel/bpf/hashtab.c | 55 +++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 39 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index b80f42adf068..7fd6519444d3 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -227,7 +227,7 @@ static int alloc_extra_elems(struct bpf_htab *htab) } /* Called from syscall */ -static struct bpf_map *htab_map_alloc(union bpf_attr *attr) +static int htab_map_alloc_check(union bpf_attr *attr) { bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH || attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); @@ -241,9 +241,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); int numa_node = bpf_map_attr_numa_node(attr); - struct bpf_htab *htab; - int err, i; - u64 cost; BUILD_BUG_ON(offsetof(struct htab_elem, htab) != offsetof(struct htab_elem, hash_node.pprev)); @@ -254,33 +251,33 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) /* LRU implementation is much complicated than other * maps. Hence, limit to CAP_SYS_ADMIN for now. */ - return ERR_PTR(-EPERM); + return -EPERM; if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK) /* reserved bits should not be used */ - return ERR_PTR(-EINVAL); + return -EINVAL; if (!lru && percpu_lru) - return ERR_PTR(-EINVAL); + return -EINVAL; if (lru && !prealloc) - return ERR_PTR(-ENOTSUPP); + return -ENOTSUPP; if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru)) - return ERR_PTR(-EINVAL); + return -EINVAL; /* check sanity of attributes. * value_size == 0 may be allowed in the future to use map as a set */ if (attr->max_entries == 0 || attr->key_size == 0 || attr->value_size == 0) - return ERR_PTR(-EINVAL); + return -EINVAL; if (attr->key_size > MAX_BPF_STACK) /* eBPF programs initialize keys on stack, so they cannot be * larger than max stack size */ - return ERR_PTR(-E2BIG); + return -E2BIG; if (attr->value_size >= KMALLOC_MAX_SIZE - MAX_BPF_STACK - sizeof(struct htab_elem)) @@ -289,7 +286,28 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) * sure that the elem_size doesn't overflow and it's * kmalloc-able later in htab_map_update_elem() */ - return ERR_PTR(-E2BIG); + return -E2BIG; + + return 0; +} + +static struct bpf_map *htab_map_alloc(union bpf_attr *attr) +{ + bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH || + attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); + bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH || + attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); + /* percpu_lru means each cpu has its own LRU list. + * it is different from BPF_MAP_TYPE_PERCPU_HASH where + * the map's value itself is percpu. percpu_lru has + * nothing to do with the map's value. + */ + bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); + bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); + int numa_node = bpf_map_attr_numa_node(attr); + struct bpf_htab *htab; + int err, i; + u64 cost; htab = kzalloc(sizeof(*htab), GFP_USER); if (!htab) @@ -1142,6 +1160,7 @@ static void htab_map_free(struct bpf_map *map) } const struct bpf_map_ops htab_map_ops = { + .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, @@ -1152,6 +1171,7 @@ const struct bpf_map_ops htab_map_ops = { }; const struct bpf_map_ops htab_lru_map_ops = { + .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, @@ -1235,6 +1255,7 @@ int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, } const struct bpf_map_ops htab_percpu_map_ops = { + .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, @@ -1244,6 +1265,7 @@ const struct bpf_map_ops htab_percpu_map_ops = { }; const struct bpf_map_ops htab_lru_percpu_map_ops = { + .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, @@ -1252,11 +1274,11 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = { .map_delete_elem = htab_lru_map_delete_elem, }; -static struct bpf_map *fd_htab_map_alloc(union bpf_attr *attr) +static int fd_htab_map_alloc_check(union bpf_attr *attr) { if (attr->value_size != sizeof(u32)) - return ERR_PTR(-EINVAL); - return htab_map_alloc(attr); + return -EINVAL; + return htab_map_alloc_check(attr); } static void fd_htab_map_free(struct bpf_map *map) @@ -1327,7 +1349,7 @@ static struct bpf_map *htab_of_map_alloc(union bpf_attr *attr) if (IS_ERR(inner_map_meta)) return inner_map_meta; - map = fd_htab_map_alloc(attr); + map = htab_map_alloc(attr); if (IS_ERR(map)) { bpf_map_meta_free(inner_map_meta); return map; @@ -1371,6 +1393,7 @@ static void htab_of_map_free(struct bpf_map *map) } const struct bpf_map_ops htab_of_maps_map_ops = { + .map_alloc_check = fd_htab_map_alloc_check, .map_alloc = htab_of_map_alloc, .map_free = htab_of_map_free, .map_get_next_key = htab_map_get_next_key, -- cgit v1.2.2 From bd475643d74e8ed78bfd36d941053b0e45974e8e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 11 Jan 2018 20:29:06 -0800 Subject: bpf: add helper for copying attrs to struct bpf_map All map types reimplement the field-by-field copy of union bpf_attr members into struct bpf_map. Add a helper to perform this operation. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/cpumap.c | 8 +------- kernel/bpf/devmap.c | 8 +------- kernel/bpf/hashtab.c | 9 +-------- kernel/bpf/lpm_trie.c | 7 +------ kernel/bpf/sockmap.c | 8 +------- kernel/bpf/stackmap.c | 6 +----- kernel/bpf/syscall.c | 10 ++++++++++ 7 files changed, 16 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index ce5b669003b2..192151ec9d12 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -94,13 +94,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) if (!cmap) return ERR_PTR(-ENOMEM); - /* mandatory map attributes */ - cmap->map.map_type = attr->map_type; - cmap->map.key_size = attr->key_size; - cmap->map.value_size = attr->value_size; - cmap->map.max_entries = attr->max_entries; - cmap->map.map_flags = attr->map_flags; - cmap->map.numa_node = bpf_map_attr_numa_node(attr); + bpf_map_init_from_attr(&cmap->map, attr); /* Pre-limit array size based on NR_CPUS, not final CPU check */ if (cmap->map.max_entries > NR_CPUS) { diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index ebdef54bf7df..565f9ece9115 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -93,13 +93,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) if (!dtab) return ERR_PTR(-ENOMEM); - /* mandatory map attributes */ - dtab->map.map_type = attr->map_type; - dtab->map.key_size = attr->key_size; - dtab->map.value_size = attr->value_size; - dtab->map.max_entries = attr->max_entries; - dtab->map.map_flags = attr->map_flags; - dtab->map.numa_node = bpf_map_attr_numa_node(attr); + bpf_map_init_from_attr(&dtab->map, attr); /* make sure page count doesn't overflow */ cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 7fd6519444d3..b76828f23b49 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -304,7 +304,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) */ bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); - int numa_node = bpf_map_attr_numa_node(attr); struct bpf_htab *htab; int err, i; u64 cost; @@ -313,13 +312,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) if (!htab) return ERR_PTR(-ENOMEM); - /* mandatory map attributes */ - htab->map.map_type = attr->map_type; - htab->map.key_size = attr->key_size; - htab->map.value_size = attr->value_size; - htab->map.max_entries = attr->max_entries; - htab->map.map_flags = attr->map_flags; - htab->map.numa_node = numa_node; + bpf_map_init_from_attr(&htab->map, attr); if (percpu_lru) { /* ensure each CPU's lru list has >=1 elements. diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 885e45479680..584e02227671 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -522,12 +522,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) return ERR_PTR(-ENOMEM); /* copy mandatory map attributes */ - trie->map.map_type = attr->map_type; - trie->map.key_size = attr->key_size; - trie->map.value_size = attr->value_size; - trie->map.max_entries = attr->max_entries; - trie->map.map_flags = attr->map_flags; - trie->map.numa_node = bpf_map_attr_numa_node(attr); + bpf_map_init_from_attr(&trie->map, attr); trie->data_size = attr->key_size - offsetof(struct bpf_lpm_trie_key, data); trie->max_prefixlen = trie->data_size * 8; diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 079968680bc3..0314d1783d77 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -513,13 +513,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) if (!stab) return ERR_PTR(-ENOMEM); - /* mandatory map attributes */ - stab->map.map_type = attr->map_type; - stab->map.key_size = attr->key_size; - stab->map.value_size = attr->value_size; - stab->map.max_entries = attr->max_entries; - stab->map.map_flags = attr->map_flags; - stab->map.numa_node = bpf_map_attr_numa_node(attr); + bpf_map_init_from_attr(&stab->map, attr); /* make sure page count doesn't overflow */ cost = (u64) stab->map.max_entries * sizeof(struct sock *); diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 6c63c2222ea8..b0ecf43f5894 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -88,14 +88,10 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) if (cost >= U32_MAX - PAGE_SIZE) goto free_smap; - smap->map.map_type = attr->map_type; - smap->map.key_size = attr->key_size; + bpf_map_init_from_attr(&smap->map, attr); smap->map.value_size = value_size; - smap->map.max_entries = attr->max_entries; - smap->map.map_flags = attr->map_flags; smap->n_buckets = n_buckets; smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - smap->map.numa_node = bpf_map_attr_numa_node(attr); err = bpf_map_precharge_memlock(smap->map.pages); if (err) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c0ac03a04880..a3f726bb42ea 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -143,6 +143,16 @@ void bpf_map_area_free(void *area) kvfree(area); } +void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr) +{ + map->map_type = attr->map_type; + map->key_size = attr->key_size; + map->value_size = attr->value_size; + map->max_entries = attr->max_entries; + map->map_flags = attr->map_flags; + map->numa_node = bpf_map_attr_numa_node(attr); +} + int bpf_map_precharge_memlock(u32 pages) { struct user_struct *user = get_current_user(); -- cgit v1.2.2 From 0a9c1991f285f829fd786fa2a9c824c2a3f87bc6 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 11 Jan 2018 20:29:07 -0800 Subject: bpf: rename bpf_dev_offload -> bpf_prog_offload With map offload coming, we need to call program offload structure something less ambiguous. Pure rename, no functional changes. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- kernel/bpf/offload.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 040d4e0edf3f..001ddfde7874 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -32,7 +32,7 @@ static LIST_HEAD(bpf_prog_offload_devs); int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) { - struct bpf_dev_offload *offload; + struct bpf_prog_offload *offload; if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS && attr->prog_type != BPF_PROG_TYPE_XDP) @@ -72,7 +72,7 @@ err_free: static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd, struct netdev_bpf *data) { - struct bpf_dev_offload *offload = prog->aux->offload; + struct bpf_prog_offload *offload = prog->aux->offload; struct net_device *netdev; ASSERT_RTNL(); @@ -110,7 +110,7 @@ exit_unlock: int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx) { - struct bpf_dev_offload *offload; + struct bpf_prog_offload *offload; int ret = -ENODEV; down_read(&bpf_devs_lock); @@ -124,7 +124,7 @@ int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env, static void __bpf_prog_offload_destroy(struct bpf_prog *prog) { - struct bpf_dev_offload *offload = prog->aux->offload; + struct bpf_prog_offload *offload = prog->aux->offload; struct netdev_bpf data = {}; data.offload.prog = prog; @@ -242,7 +242,7 @@ static int bpf_offload_notification(struct notifier_block *notifier, ulong event, void *ptr) { struct net_device *netdev = netdev_notifier_info_to_dev(ptr); - struct bpf_dev_offload *offload, *tmp; + struct bpf_prog_offload *offload, *tmp; ASSERT_RTNL(); -- cgit v1.2.2 From 5bc2d55c6a69ef9efd11740359974b08ea11f1d7 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 11 Jan 2018 20:29:08 -0800 Subject: bpf: offload: factor out netdev checking at allocation time Add a helper to check if netdev could be found and whether it has .ndo_bpf callback. There is no need to check the callback every time it's invoked, ndos can't reasonably be swapped for a set without .ndp_bpf while program is loaded. bpf_dev_offload_check() will also be used by map offload. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- kernel/bpf/offload.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 001ddfde7874..cdd1e19a668b 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -30,9 +30,19 @@ static DECLARE_RWSEM(bpf_devs_lock); static LIST_HEAD(bpf_prog_offload_devs); +static int bpf_dev_offload_check(struct net_device *netdev) +{ + if (!netdev) + return -EINVAL; + if (!netdev->netdev_ops->ndo_bpf) + return -EOPNOTSUPP; + return 0; +} + int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) { struct bpf_prog_offload *offload; + int err; if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS && attr->prog_type != BPF_PROG_TYPE_XDP) @@ -49,12 +59,15 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) offload->netdev = dev_get_by_index(current->nsproxy->net_ns, attr->prog_ifindex); - if (!offload->netdev) - goto err_free; + err = bpf_dev_offload_check(offload->netdev); + if (err) + goto err_maybe_put; down_write(&bpf_devs_lock); - if (offload->netdev->reg_state != NETREG_REGISTERED) + if (offload->netdev->reg_state != NETREG_REGISTERED) { + err = -EINVAL; goto err_unlock; + } prog->aux->offload = offload; list_add_tail(&offload->offloads, &bpf_prog_offload_devs); dev_put(offload->netdev); @@ -63,10 +76,11 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) return 0; err_unlock: up_write(&bpf_devs_lock); - dev_put(offload->netdev); -err_free: +err_maybe_put: + if (offload->netdev) + dev_put(offload->netdev); kfree(offload); - return -EINVAL; + return err; } static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd, @@ -80,8 +94,6 @@ static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd, if (!offload) return -ENODEV; netdev = offload->netdev; - if (!netdev->netdev_ops->ndo_bpf) - return -EOPNOTSUPP; data->command = cmd; -- cgit v1.2.2 From a38845729ea3985db5d2544ec3ef3dc8f6313a27 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 11 Jan 2018 20:29:09 -0800 Subject: bpf: offload: add map offload infrastructure BPF map offload follow similar path to program offload. At creation time users may specify ifindex of the device on which they want to create the map. Map will be validated by the kernel's .map_alloc_check callback and device driver will be called for the actual allocation. Map will have an empty set of operations associated with it (save for alloc and free callbacks). The real device callbacks are kept in map->offload->dev_ops because they have slightly different signatures. Map operations are called in process context so the driver may communicate with HW freely, msleep(), wait() etc. Map alloc and free callbacks are muxed via existing .ndo_bpf, and are always called with rtnl lock held. Maps and programs are guaranteed to be destroyed before .ndo_uninit (i.e. before unregister_netdev() returns). Map callbacks are invoked with bpf_devs_lock *read* locked, drivers must take care of exclusive locking if necessary. All offload-specific branches are marked with unlikely() (through bpf_map_is_dev_bound()), given that branch penalty will be negligible compared to IO anyway, and we don't want to penalize SW path unnecessarily. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- kernel/bpf/offload.c | 188 ++++++++++++++++++++++++++++++++++++++++++++++++-- kernel/bpf/syscall.c | 44 ++++++++++-- kernel/bpf/verifier.c | 7 ++ 3 files changed, 226 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index cdd1e19a668b..453785fa1881 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -24,11 +24,13 @@ #include #include -/* Protects bpf_prog_offload_devs and offload members of all progs. +/* Protects bpf_prog_offload_devs, bpf_map_offload_devs and offload members + * of all progs. * RTNL lock cannot be taken when holding this lock. */ static DECLARE_RWSEM(bpf_devs_lock); static LIST_HEAD(bpf_prog_offload_devs); +static LIST_HEAD(bpf_map_offload_devs); static int bpf_dev_offload_check(struct net_device *netdev) { @@ -250,11 +252,186 @@ int bpf_prog_offload_info_fill(struct bpf_prog_info *info, const struct bpf_prog_ops bpf_offload_prog_ops = { }; +static int bpf_map_offload_ndo(struct bpf_offloaded_map *offmap, + enum bpf_netdev_command cmd) +{ + struct netdev_bpf data = {}; + struct net_device *netdev; + + ASSERT_RTNL(); + + data.command = cmd; + data.offmap = offmap; + /* Caller must make sure netdev is valid */ + netdev = offmap->netdev; + + return netdev->netdev_ops->ndo_bpf(netdev, &data); +} + +struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) +{ + struct net *net = current->nsproxy->net_ns; + struct bpf_offloaded_map *offmap; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + if (attr->map_type != BPF_MAP_TYPE_HASH) + return ERR_PTR(-EINVAL); + + offmap = kzalloc(sizeof(*offmap), GFP_USER); + if (!offmap) + return ERR_PTR(-ENOMEM); + + bpf_map_init_from_attr(&offmap->map, attr); + + rtnl_lock(); + down_write(&bpf_devs_lock); + offmap->netdev = __dev_get_by_index(net, attr->map_ifindex); + err = bpf_dev_offload_check(offmap->netdev); + if (err) + goto err_unlock; + + err = bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_ALLOC); + if (err) + goto err_unlock; + + list_add_tail(&offmap->offloads, &bpf_map_offload_devs); + up_write(&bpf_devs_lock); + rtnl_unlock(); + + return &offmap->map; + +err_unlock: + up_write(&bpf_devs_lock); + rtnl_unlock(); + kfree(offmap); + return ERR_PTR(err); +} + +static void __bpf_map_offload_destroy(struct bpf_offloaded_map *offmap) +{ + WARN_ON(bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_FREE)); + /* Make sure BPF_MAP_GET_NEXT_ID can't find this dead map */ + bpf_map_free_id(&offmap->map, true); + list_del_init(&offmap->offloads); + offmap->netdev = NULL; +} + +void bpf_map_offload_map_free(struct bpf_map *map) +{ + struct bpf_offloaded_map *offmap = map_to_offmap(map); + + rtnl_lock(); + down_write(&bpf_devs_lock); + if (offmap->netdev) + __bpf_map_offload_destroy(offmap); + up_write(&bpf_devs_lock); + rtnl_unlock(); + + kfree(offmap); +} + +int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value) +{ + struct bpf_offloaded_map *offmap = map_to_offmap(map); + int ret = -ENODEV; + + down_read(&bpf_devs_lock); + if (offmap->netdev) + ret = offmap->dev_ops->map_lookup_elem(offmap, key, value); + up_read(&bpf_devs_lock); + + return ret; +} + +int bpf_map_offload_update_elem(struct bpf_map *map, + void *key, void *value, u64 flags) +{ + struct bpf_offloaded_map *offmap = map_to_offmap(map); + int ret = -ENODEV; + + if (unlikely(flags > BPF_EXIST)) + return -EINVAL; + + down_read(&bpf_devs_lock); + if (offmap->netdev) + ret = offmap->dev_ops->map_update_elem(offmap, key, value, + flags); + up_read(&bpf_devs_lock); + + return ret; +} + +int bpf_map_offload_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_offloaded_map *offmap = map_to_offmap(map); + int ret = -ENODEV; + + down_read(&bpf_devs_lock); + if (offmap->netdev) + ret = offmap->dev_ops->map_delete_elem(offmap, key); + up_read(&bpf_devs_lock); + + return ret; +} + +int bpf_map_offload_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + struct bpf_offloaded_map *offmap = map_to_offmap(map); + int ret = -ENODEV; + + down_read(&bpf_devs_lock); + if (offmap->netdev) + ret = offmap->dev_ops->map_get_next_key(offmap, key, next_key); + up_read(&bpf_devs_lock); + + return ret; +} + +bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map) +{ + struct bpf_offloaded_map *offmap; + struct bpf_prog_offload *offload; + bool ret; + + if (!!bpf_prog_is_dev_bound(prog->aux) != !!bpf_map_is_dev_bound(map)) + return false; + if (!bpf_prog_is_dev_bound(prog->aux)) + return true; + + down_read(&bpf_devs_lock); + offload = prog->aux->offload; + offmap = map_to_offmap(map); + + ret = offload && offload->netdev == offmap->netdev; + up_read(&bpf_devs_lock); + + return ret; +} + +static void bpf_offload_orphan_all_progs(struct net_device *netdev) +{ + struct bpf_prog_offload *offload, *tmp; + + list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs, offloads) + if (offload->netdev == netdev) + __bpf_prog_offload_destroy(offload->prog); +} + +static void bpf_offload_orphan_all_maps(struct net_device *netdev) +{ + struct bpf_offloaded_map *offmap, *tmp; + + list_for_each_entry_safe(offmap, tmp, &bpf_map_offload_devs, offloads) + if (offmap->netdev == netdev) + __bpf_map_offload_destroy(offmap); +} + static int bpf_offload_notification(struct notifier_block *notifier, ulong event, void *ptr) { struct net_device *netdev = netdev_notifier_info_to_dev(ptr); - struct bpf_prog_offload *offload, *tmp; ASSERT_RTNL(); @@ -265,11 +442,8 @@ static int bpf_offload_notification(struct notifier_block *notifier, break; down_write(&bpf_devs_lock); - list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs, - offloads) { - if (offload->netdev == netdev) - __bpf_prog_offload_destroy(offload->prog); - } + bpf_offload_orphan_all_progs(netdev); + bpf_offload_orphan_all_maps(netdev); up_write(&bpf_devs_lock); break; default: diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a3f726bb42ea..c691b9e972e3 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -94,6 +94,11 @@ static int check_uarg_tail_zero(void __user *uaddr, return 0; } +const struct bpf_map_ops bpf_map_offload_ops = { + .map_alloc = bpf_map_offload_map_alloc, + .map_free = bpf_map_offload_map_free, +}; + static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) { const struct bpf_map_ops *ops; @@ -111,6 +116,8 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) if (err) return ERR_PTR(err); } + if (attr->map_ifindex) + ops = &bpf_map_offload_ops; map = ops->map_alloc(attr); if (IS_ERR(map)) return map; @@ -208,16 +215,25 @@ static int bpf_map_alloc_id(struct bpf_map *map) return id > 0 ? 0 : id; } -static void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock) +void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock) { unsigned long flags; + /* Offloaded maps are removed from the IDR store when their device + * disappears - even if someone holds an fd to them they are unusable, + * the memory is gone, all ops will fail; they are simply waiting for + * refcnt to drop to be freed. + */ + if (!map->id) + return; + if (do_idr_lock) spin_lock_irqsave(&map_idr_lock, flags); else __acquire(&map_idr_lock); idr_remove(&map_idr, map->id); + map->id = 0; if (do_idr_lock) spin_unlock_irqrestore(&map_idr_lock, flags); @@ -397,7 +413,7 @@ static int bpf_obj_name_cpy(char *dst, const char *src) return 0; } -#define BPF_MAP_CREATE_LAST_FIELD map_name +#define BPF_MAP_CREATE_LAST_FIELD map_ifindex /* called via syscall */ static int map_create(union bpf_attr *attr) { @@ -585,8 +601,10 @@ static int map_lookup_elem(union bpf_attr *attr) if (!value) goto free_key; - if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || - map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { + if (bpf_map_is_dev_bound(map)) { + err = bpf_map_offload_lookup_elem(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { err = bpf_percpu_hash_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_copy(map, key, value); @@ -673,7 +691,10 @@ static int map_update_elem(union bpf_attr *attr) goto free_value; /* Need to create a kthread, thus must support schedule */ - if (map->map_type == BPF_MAP_TYPE_CPUMAP) { + if (bpf_map_is_dev_bound(map)) { + err = bpf_map_offload_update_elem(map, key, value, attr->flags); + goto out; + } else if (map->map_type == BPF_MAP_TYPE_CPUMAP) { err = map->ops->map_update_elem(map, key, value, attr->flags); goto out; } @@ -750,6 +771,11 @@ static int map_delete_elem(union bpf_attr *attr) goto err_put; } + if (bpf_map_is_dev_bound(map)) { + err = bpf_map_offload_delete_elem(map, key); + goto out; + } + preempt_disable(); __this_cpu_inc(bpf_prog_active); rcu_read_lock(); @@ -757,7 +783,7 @@ static int map_delete_elem(union bpf_attr *attr) rcu_read_unlock(); __this_cpu_dec(bpf_prog_active); preempt_enable(); - +out: if (!err) trace_bpf_map_delete_elem(map, ufd, key); kfree(key); @@ -807,9 +833,15 @@ static int map_get_next_key(union bpf_attr *attr) if (!next_key) goto free_key; + if (bpf_map_is_dev_bound(map)) { + err = bpf_map_offload_get_next_key(map, key, next_key); + goto out; + } + rcu_read_lock(); err = map->ops->map_get_next_key(map, key, next_key); rcu_read_unlock(); +out: if (err) goto free_next_key; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 48b61caa94cb..ceabb394d2dc 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4816,6 +4816,13 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, return -EINVAL; } } + + if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) && + !bpf_offload_dev_match(prog, map)) { + verbose(env, "offload device mismatch between prog and map\n"); + return -EINVAL; + } + return 0; } -- cgit v1.2.2 From 6106c0f82481e686b337ee0c403821fb5c3c17ef Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 11 Jan 2018 15:06:40 +1100 Subject: staging: lustre: lnet: convert selftest to use workqueues Instead of the cfs workitem library, use workqueues. As lnet wants to provide a cpu mask of allowed cpus, it needs to be a WQ_UNBOUND work queue so that tasks can run on cpus other than where they were submitted. This patch also exported apply_workqueue_attrs() which is a documented part of the workqueue API, that isn't currently exported. lustre needs it to allow workqueue thread to be limited to a subset of CPUs. Acked-by: Tejun Heo (for export of apply_workqueue_attrs) Signed-off-by: NeilBrown Signed-off-by: Greg Kroah-Hartman --- kernel/workqueue.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 43d18cb46308..d1d460bb9b02 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3806,6 +3806,7 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, return ret; } +EXPORT_SYMBOL_GPL(apply_workqueue_attrs); /** * wq_update_unbound_numa - update NUMA affinity of a wq for CPU hot[un]plug -- cgit v1.2.2 From 07dcd7fe89938934ddad65f738bc5aac89b8e54d Mon Sep 17 00:00:00 2001 From: David Windsor Date: Tue, 15 Aug 2017 16:45:00 -0700 Subject: fork: Define usercopy region in mm_struct slab caches In support of usercopy hardening, this patch defines a region in the mm_struct slab caches in which userspace copy operations are allowed. Only the auxv field is copied to userspace. cache object allocation: kernel/fork.c: #define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL)) dup_mm(): ... mm = allocate_mm(); copy_mm(...): ... dup_mm(); copy_process(...): ... copy_mm(...) _do_fork(...): ... copy_process(...) example usage trace: fs/binfmt_elf.c: create_elf_tables(...): ... elf_info = (elf_addr_t *)current->mm->saved_auxv; ... copy_to_user(..., elf_info, ei_index * sizeof(elf_addr_t)) load_elf_binary(...): ... create_elf_tables(...); This region is known as the slab cache's usercopy region. Slab caches can now check that each dynamically sized copy operation involving cache-managed memory falls entirely within the slab's usercopy region. This patch is modified from Brad Spengler/PaX Team's PAX_USERCOPY whitelisting code in the last public patch of grsecurity/PaX based on my understanding of the code. Changes or omissions from the original code are mine and don't reflect the original grsecurity/PaX code. Signed-off-by: David Windsor [kees: adjust commit log, split patch, provide usage trace] Cc: Ingo Molnar Cc: Andrew Morton Cc: Thomas Gleixner Cc: Andy Lutomirski Signed-off-by: Kees Cook Acked-by: Rik van Riel --- kernel/fork.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 432eadf6b58c..82f2a0441d3b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2225,9 +2225,11 @@ void __init proc_caches_init(void) * maximum number of CPU's we can ever have. The cpumask_allocation * is at the end of the structure, exactly for that reason. */ - mm_cachep = kmem_cache_create("mm_struct", + mm_cachep = kmem_cache_create_usercopy("mm_struct", sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, + offsetof(struct mm_struct, saved_auxv), + sizeof_field(struct mm_struct, saved_auxv), NULL); vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT); mmap_init(); -- cgit v1.2.2 From f9d29946c56734e954459bc9a0e688a8ae9b4cbf Mon Sep 17 00:00:00 2001 From: David Windsor Date: Sat, 10 Jun 2017 22:50:41 -0400 Subject: fork: Define usercopy region in thread_stack slab caches In support of usercopy hardening, this patch defines a region in the thread_stack slab caches in which userspace copy operations are allowed. Since the entire thread_stack needs to be available to userspace, the entire slab contents are whitelisted. Note that the slab-based thread stack is only present on systems with THREAD_SIZE < PAGE_SIZE and !CONFIG_VMAP_STACK. cache object allocation: kernel/fork.c: alloc_thread_stack_node(...): return kmem_cache_alloc_node(thread_stack_cache, ...) dup_task_struct(...): ... stack = alloc_thread_stack_node(...) ... tsk->stack = stack; copy_process(...): ... dup_task_struct(...) _do_fork(...): ... copy_process(...) This region is known as the slab cache's usercopy region. Slab caches can now check that each dynamically sized copy operation involving cache-managed memory falls entirely within the slab's usercopy region. This patch is modified from Brad Spengler/PaX Team's PAX_USERCOPY whitelisting code in the last public patch of grsecurity/PaX based on my understanding of the code. Changes or omissions from the original code are mine and don't reflect the original grsecurity/PaX code. Signed-off-by: David Windsor [kees: adjust commit log, split patch, provide usage trace] Cc: Ingo Molnar Cc: Andrew Morton Cc: Thomas Gleixner Cc: Andy Lutomirski Signed-off-by: Kees Cook Acked-by: Rik van Riel --- kernel/fork.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 82f2a0441d3b..0e086af148f2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -282,8 +282,9 @@ static void free_thread_stack(struct task_struct *tsk) void thread_stack_cache_init(void) { - thread_stack_cache = kmem_cache_create("thread_stack", THREAD_SIZE, - THREAD_SIZE, 0, NULL); + thread_stack_cache = kmem_cache_create_usercopy("thread_stack", + THREAD_SIZE, THREAD_SIZE, 0, 0, + THREAD_SIZE, NULL); BUG_ON(thread_stack_cache == NULL); } # endif -- cgit v1.2.2 From 5905429ad85657c28d93ec3d826ddeea1f44c3ce Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 16 Aug 2017 13:00:58 -0700 Subject: fork: Provide usercopy whitelisting for task_struct MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While the blocked and saved_sigmask fields of task_struct are copied to userspace (via sigmask_to_save() and setup_rt_frame()), it is always copied with a static length (i.e. sizeof(sigset_t)). The only portion of task_struct that is potentially dynamically sized and may be copied to userspace is in the architecture-specific thread_struct at the end of task_struct. cache object allocation: kernel/fork.c: alloc_task_struct_node(...): return kmem_cache_alloc_node(task_struct_cachep, ...); dup_task_struct(...): ... tsk = alloc_task_struct_node(node); copy_process(...): ... dup_task_struct(...) _do_fork(...): ... copy_process(...) example usage trace: arch/x86/kernel/fpu/signal.c: __fpu__restore_sig(...): ... struct task_struct *tsk = current; struct fpu *fpu = &tsk->thread.fpu; ... __copy_from_user(&fpu->state.xsave, ..., state_size); fpu__restore_sig(...): ... return __fpu__restore_sig(...); arch/x86/kernel/signal.c: restore_sigcontext(...): ... fpu__restore_sig(...) This introduces arch_thread_struct_whitelist() to let an architecture declare specifically where the whitelist should be within thread_struct. If undefined, the entire thread_struct field is left whitelisted. Cc: Andrew Morton Cc: Nicholas Piggin Cc: Laura Abbott Cc: "Mickaël Salaün" Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Andy Lutomirski Signed-off-by: Kees Cook Acked-by: Rik van Riel --- kernel/fork.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 0e086af148f2..5977e691c754 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -458,6 +458,21 @@ static void set_max_threads(unsigned int max_threads_suggested) int arch_task_struct_size __read_mostly; #endif +static void task_struct_whitelist(unsigned long *offset, unsigned long *size) +{ + /* Fetch thread_struct whitelist for the architecture. */ + arch_thread_struct_whitelist(offset, size); + + /* + * Handle zero-sized whitelist or empty thread_struct, otherwise + * adjust offset to position of thread_struct in task_struct. + */ + if (unlikely(*size == 0)) + *offset = 0; + else + *offset += offsetof(struct task_struct, thread); +} + void __init fork_init(void) { int i; @@ -466,11 +481,14 @@ void __init fork_init(void) #define ARCH_MIN_TASKALIGN 0 #endif int align = max_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN); + unsigned long useroffset, usersize; /* create a slab on which task_structs can be allocated */ - task_struct_cachep = kmem_cache_create("task_struct", + task_struct_whitelist(&useroffset, &usersize); + task_struct_cachep = kmem_cache_create_usercopy("task_struct", arch_task_struct_size, align, - SLAB_PANIC|SLAB_ACCOUNT, NULL); + SLAB_PANIC|SLAB_ACCOUNT, + useroffset, usersize, NULL); #endif /* do the arch specific task caches init */ -- cgit v1.2.2 From 71ee78d538a7bcb7a876dcbd65eb73627425df6f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 13 Jan 2018 18:25:09 -0600 Subject: signal/blackfin: Move the blackfin specific si_codes to asm-generic/siginfo.h Having si_codes in many different files simply encourages duplicate definitions that can cause problems later. To avoid that merge the blackfin specific si_codes into uapi/asm-generic/siginfo.h Update copy_siginfo_to_user to copy with the absence of BUS_MCEERR_AR that blackfin defines to be something else. Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 47c87b1d0b8a..4c3f4448c5f1 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2770,13 +2770,16 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) err |= __put_user(from->si_flags, &to->si_flags); err |= __put_user(from->si_isr, &to->si_isr); #endif -#ifdef BUS_MCEERR_AO /* * Other callers might not initialize the si_lsb field, * so check explicitly for the right codes here. */ - if (from->si_signo == SIGBUS && - (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO)) +#ifdef BUS_MCEERR_AR + if (from->si_signo == SIGBUS && from->si_code == BUS_MCEERR_AR) + err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); +#endif +#ifdef BUS_MCEERR_AO + if (from->si_signo == SIGBUS && from->si_code == BUS_MCEERR_AO) err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); #endif #ifdef SEGV_BNDERR -- cgit v1.2.2 From 212a36a17efe4d696d1e3c31ebd79a9fb0cbb14b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 31 Jul 2017 17:15:31 -0500 Subject: signal: Unify and correct copy_siginfo_from_user32 The function copy_siginfo_from_user32 is used for two things, in ptrace since the dawn of siginfo for arbirarily modifying a signal that user space sees, and in sigqueueinfo to send a signal with arbirary siginfo data. Create a single copy of copy_siginfo_from_user32 that all architectures share, and teach it to handle all of the cases in the siginfo union. In the generic version of copy_siginfo_from_user32 ensure that all of the fields in siginfo are initialized so that the siginfo structure can be safely copied to userspace if necessary. When copying the embedded sigval union copy the si_int member. That ensures the 32bit values passes through the kernel unchanged. Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 4c3f4448c5f1..5211b1b57163 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2814,6 +2814,87 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) return err; } +#ifdef CONFIG_COMPAT +int copy_siginfo_from_user32(struct siginfo *to, + const struct compat_siginfo __user *ufrom) +{ + struct compat_siginfo from; + + if (copy_from_user(&from, ufrom, sizeof(struct compat_siginfo))) + return -EFAULT; + + clear_siginfo(to); + to->si_signo = from.si_signo; + to->si_errno = from.si_errno; + to->si_code = from.si_code; + switch(siginfo_layout(from.si_signo, from.si_code)) { + case SIL_KILL: + to->si_pid = from.si_pid; + to->si_uid = from.si_uid; + break; + case SIL_TIMER: + to->si_tid = from.si_tid; + to->si_overrun = from.si_overrun; + to->si_int = from.si_int; + break; + case SIL_POLL: + to->si_band = from.si_band; + to->si_fd = from.si_fd; + break; + case SIL_FAULT: + to->si_addr = compat_ptr(from.si_addr); +#ifdef __ARCH_SI_TRAPNO + to->si_trapno = from.si_trapno; +#endif +#ifdef BUS_MCEERR_AR + if ((from.si_signo == SIGBUS) && (from.si_code == BUS_MCEERR_AR)) + to->si_addr_lsb = from.si_addr_lsb; +#endif +#ifdef BUS_MCEER_AO + if ((from.si_signo == SIGBUS) && (from.si_code == BUS_MCEERR_AO)) + to->si_addr_lsb = from.si_addr_lsb; +#endif +#ifdef SEGV_BNDERR + if ((from.si_signo == SIGSEGV) && (from.si_code == SEGV_BNDERR)) { + to->si_lower = compat_ptr(from.si_lower); + to->si_upper = compat_ptr(from.si_upper); + } +#endif +#ifdef SEGV_PKUERR + if ((from.si_signo == SIGSEGV) && (from.si_code == SEGV_PKUERR)) + to->si_pkey = from.si_pkey; +#endif + break; + case SIL_CHLD: + to->si_pid = from.si_pid; + to->si_uid = from.si_uid; + to->si_status = from.si_status; +#ifdef CONFIG_X86_X32_ABI + if (in_x32_syscall()) { + to->si_utime = from._sifields._sigchld_x32._utime; + to->si_stime = from._sifields._sigchld_x32._stime; + } else +#endif + { + to->si_utime = from.si_utime; + to->si_stime = from.si_stime; + } + break; + case SIL_RT: + to->si_pid = from.si_pid; + to->si_uid = from.si_uid; + to->si_int = from.si_int; + break; + case SIL_SYS: + to->si_call_addr = compat_ptr(from.si_call_addr); + to->si_syscall = from.si_syscall; + to->si_arch = from.si_arch; + break; + } + return 0; +} +#endif /* CONFIG_COMPAT */ + /** * do_sigtimedwait - wait for queued signals specified in @which * @which: queued signals to wait for -- cgit v1.2.2 From eb5346c379cb272eca77f63473de09103a22ebee Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 31 Jul 2017 17:18:40 -0500 Subject: signal: Remove the code to clear siginfo before calling copy_siginfo_from_user32 The new unified copy_siginfo_from_user32 takes care of this. Signed-off-by: "Eric W. Biederman" --- kernel/ptrace.c | 1 - kernel/signal.c | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 84b1367935e4..ec4365da9be8 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -1226,7 +1226,6 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, break; case PTRACE_SETSIGINFO: - memset(&siginfo, 0, sizeof siginfo); if (copy_siginfo_from_user32( &siginfo, (struct compat_siginfo __user *) datap)) ret = -EFAULT; diff --git a/kernel/signal.c b/kernel/signal.c index 5211b1b57163..bebe44265b8b 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3155,7 +3155,7 @@ COMPAT_SYSCALL_DEFINE3(rt_sigqueueinfo, int, sig, struct compat_siginfo __user *, uinfo) { - siginfo_t info = {}; + siginfo_t info; int ret = copy_siginfo_from_user32(&info, uinfo); if (unlikely(ret)) return ret; @@ -3199,7 +3199,7 @@ COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo, int, sig, struct compat_siginfo __user *, uinfo) { - siginfo_t info = {}; + siginfo_t info; if (copy_siginfo_from_user32(&info, uinfo)) return -EFAULT; -- cgit v1.2.2 From ea64d5acc8f033cd586182ae31531246cdeaea73 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 15 Jan 2018 18:03:33 -0600 Subject: signal: Unify and correct copy_siginfo_to_user32 Among the existing architecture specific versions of copy_siginfo_to_user32 there are several different implementation problems. Some architectures fail to handle all of the cases in in the siginfo union. Some architectures perform a blind copy of the siginfo union when the si_code is negative. A blind copy suggests the data is expected to be in 32bit siginfo format, which means that receiving such a signal via signalfd won't work, or that the data is in 64bit siginfo and the code is copying nonsense to userspace. Create a single instance of copy_siginfo_to_user32 that all of the architectures can share, and teach it to handle all of the cases in the siginfo union correctly, with the assumption that siginfo is stored internally to the kernel is 64bit siginfo format. A special case is made for x86 x32 format. This is needed as presence of both x32 and ia32 on x86_64 results in two different 32bit signal formats. By allowing this small special case there winds up being exactly one code base that needs to be maintained between all of the architectures. Vastly increasing the testing base and the chances of finding bugs. As the x86 copy of copy_siginfo_to_user32 the call of the x86 signal_compat_build_tests were moved into sigaction_compat_abi, so that they will keep running. Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 90 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index bebe44265b8b..4976f05aa09b 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2815,6 +2815,96 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) } #ifdef CONFIG_COMPAT +int copy_siginfo_to_user32(struct compat_siginfo __user *to, + const struct siginfo *from) +#if defined(CONFIG_X86_X32_ABI) || defined(CONFIG_IA32_EMULATION) +{ + return __copy_siginfo_to_user32(to, from, in_x32_syscall()); +} +int __copy_siginfo_to_user32(struct compat_siginfo __user *to, + const struct siginfo *from, bool x32_ABI) +#endif +{ + struct compat_siginfo new; + memset(&new, 0, sizeof(new)); + + new.si_signo = from->si_signo; + new.si_errno = from->si_errno; + new.si_code = from->si_code; + switch(siginfo_layout(from->si_signo, from->si_code)) { + case SIL_KILL: + new.si_pid = from->si_pid; + new.si_uid = from->si_uid; + break; + case SIL_TIMER: + new.si_tid = from->si_tid; + new.si_overrun = from->si_overrun; + new.si_int = from->si_int; + break; + case SIL_POLL: + new.si_band = from->si_band; + new.si_fd = from->si_fd; + break; + case SIL_FAULT: + new.si_addr = ptr_to_compat(from->si_addr); +#ifdef __ARCH_SI_TRAPNO + new.si_trapno = from->si_trapno; +#endif +#ifdef BUS_MCEERR_AR + if ((from->si_signo == SIGBUS) && (from->si_code == BUS_MCEERR_AR)) + new.si_addr_lsb = from->si_addr_lsb; +#endif +#ifdef BUS_MCEERR_AO + if ((from->si_signo == SIGBUS) && (from->si_code == BUS_MCEERR_AO)) + new.si_addr_lsb = from->si_addr_lsb; +#endif +#ifdef SEGV_BNDERR + if ((from->si_signo == SIGSEGV) && + (from->si_code == SEGV_BNDERR)) { + new.si_lower = ptr_to_compat(from->si_lower); + new.si_upper = ptr_to_compat(from->si_upper); + } +#endif +#ifdef SEGV_PKUERR + if ((from->si_signo == SIGSEGV) && + (from->si_code == SEGV_PKUERR)) + new.si_pkey = from->si_pkey; +#endif + + break; + case SIL_CHLD: + new.si_pid = from->si_pid; + new.si_uid = from->si_uid; + new.si_status = from->si_status; +#ifdef CONFIG_X86_X32_ABI + if (x32_ABI) { + new._sifields._sigchld_x32._utime = from->si_utime; + new._sifields._sigchld_x32._stime = from->si_stime; + } else +#endif + { + new.si_utime = from->si_utime; + new.si_stime = from->si_stime; + } + break; + case SIL_RT: + new.si_pid = from->si_pid; + new.si_uid = from->si_uid; + new.si_int = from->si_int; + break; + case SIL_SYS: + new.si_call_addr = ptr_to_compat(from->si_call_addr); + new.si_syscall = from->si_syscall; + new.si_arch = from->si_arch; + break; + } + + if (copy_to_user(to, &new, sizeof(struct compat_siginfo))) + return -EFAULT; + + return 0; +} + int copy_siginfo_from_user32(struct siginfo *to, const struct compat_siginfo __user *ufrom) { -- cgit v1.2.2 From d2279c9d7f7db7f97567368bfc4539b3411adf8d Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Fri, 5 Jan 2018 19:25:38 +0900 Subject: kallsyms: remove print_symbol() function No more print_symbol()/__print_symbol() users left, remove these symbols. It was a very old API that encouraged people use continuous lines. It had been obsoleted by %pS format specifier in a normal printk() call. Link: http://lkml.kernel.org/r/20180105102538.GC471@jagdpanzerIV Cc: Andrew Morton Cc: Russell King Cc: Catalin Marinas Cc: Mark Salter Cc: Tony Luck Cc: David Howells Cc: Yoshinori Sato Cc: Guan Xuetao Cc: Borislav Petkov Cc: Greg Kroah-Hartman Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Vineet Gupta Cc: Fengguang Wu Cc: Steven Rostedt Cc: LKML Cc: linux-arm-kernel@lists.infradead.org Cc: linux-c6x-dev@linux-c6x.org Cc: linux-ia64@vger.kernel.org Cc: linux-am33-list@redhat.com Cc: linux-sh@vger.kernel.org Cc: linux-edac@vger.kernel.org Cc: x86@kernel.org Cc: linux-snps-arc@lists.infradead.org Cc: Sergey Senozhatsky Signed-off-by: Sergey Senozhatsky Suggested-by: Joe Perches [pmladek@suse.com: updated commit message] Signed-off-by: Petr Mladek --- kernel/kallsyms.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 531ffa984bc2..32ba256f0092 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -464,17 +464,6 @@ int sprint_backtrace(char *buffer, unsigned long address) return __sprint_symbol(buffer, address, -1, 1); } -/* Look up a kernel symbol and print it to the kernel messages. */ -void __print_symbol(const char *fmt, unsigned long address) -{ - char buffer[KSYM_SYMBOL_LEN]; - - sprint_symbol(buffer, address); - - printk(fmt, buffer); -} -EXPORT_SYMBOL(__print_symbol); - /* To avoid using get_symbol_offset for every symbol, we carry prefix along. */ struct kallsym_iter { loff_t pos; -- cgit v1.2.2 From dbdda842fe96f8932bae554f0adf463c27c42bc7 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 10 Jan 2018 14:24:17 +0100 Subject: printk: Add console owner and waiter logic to load balance console writes This patch implements what I discussed in Kernel Summit. I added lockdep annotation (hopefully correctly), and it hasn't had any splats (since I fixed some bugs in the first iterations). It did catch problems when I had the owner covering too much. But now that the owner is only set when actively calling the consoles, lockdep has stayed quiet. Here's the design again: I added a "console_owner" which is set to a task that is actively writing to the consoles. It is *not* the same as the owner of the console_lock. It is only set when doing the calls to the console functions. It is protected by a console_owner_lock which is a raw spin lock. There is a console_waiter. This is set when there is an active console owner that is not current, and waiter is not set. This too is protected by console_owner_lock. In printk() when it tries to write to the consoles, we have: if (console_trylock()) console_unlock(); Now I added an else, which will check if there is an active owner, and no current waiter. If that is the case, then console_waiter is set, and the task goes into a spin until it is no longer set. When the active console owner finishes writing the current message to the consoles, it grabs the console_owner_lock and sees if there is a waiter, and clears console_owner. If there is a waiter, then it breaks out of the loop, clears the waiter flag (because that will release the waiter from its spin), and exits. Note, it does *not* release the console semaphore. Because it is a semaphore, there is no owner. Another task may release it. This means that the waiter is guaranteed to be the new console owner! Which it becomes. Then the waiter calls console_unlock() and continues to write to the consoles. If another task comes along and does a printk() it too can become the new waiter, and we wash rinse and repeat! By Petr Mladek about possible new deadlocks: The thing is that we move console_sem only to printk() call that normally calls console_unlock() as well. It means that the transferred owner should not bring new type of dependencies. As Steven said somewhere: "If there is a deadlock, it was there even before." We could look at it from this side. The possible deadlock would look like: CPU0 CPU1 console_unlock() console_owner = current; spin_lockA() printk() spin = true; while (...) call_console_drivers() spin_lockA() This would be a deadlock. CPU0 would wait for the lock A. While CPU1 would own the lockA and would wait for CPU0 to finish calling the console drivers and pass the console_sem owner. But if the above is true than the following scenario was already possible before: CPU0 spin_lockA() printk() console_unlock() call_console_drivers() spin_lockA() By other words, this deadlock was there even before. Such deadlocks are prevented by using printk_deferred() in the sections guarded by the lock A. By Steven Rostedt: To demonstrate the issue, this module has been shown to lock up a system with 4 CPUs and a slow console (like a serial console). It is also able to lock up a 8 CPU system with only a fast (VGA) console, by passing in "loops=100". The changes in this commit prevent this module from locking up the system. #include #include #include #include #include #include static bool stop_testing; static unsigned int loops = 1; static void preempt_printk_workfn(struct work_struct *work) { int i; while (!READ_ONCE(stop_testing)) { for (i = 0; i < loops && !READ_ONCE(stop_testing); i++) { preempt_disable(); pr_emerg("%5d%-75s\n", smp_processor_id(), " XXX NOPREEMPT"); preempt_enable(); } msleep(1); } } static struct work_struct __percpu *works; static void finish(void) { int cpu; WRITE_ONCE(stop_testing, true); for_each_online_cpu(cpu) flush_work(per_cpu_ptr(works, cpu)); free_percpu(works); } static int __init test_init(void) { int cpu; works = alloc_percpu(struct work_struct); if (!works) return -ENOMEM; /* * This is just a test module. This will break if you * do any CPU hot plugging between loading and * unloading the module. */ for_each_online_cpu(cpu) { struct work_struct *work = per_cpu_ptr(works, cpu); INIT_WORK(work, &preempt_printk_workfn); schedule_work_on(cpu, work); } return 0; } static void __exit test_exit(void) { finish(); } module_param(loops, uint, 0); module_init(test_init); module_exit(test_exit); MODULE_LICENSE("GPL"); Link: http://lkml.kernel.org/r/20180110132418.7080-2-pmladek@suse.com Cc: akpm@linux-foundation.org Cc: linux-mm@kvack.org Cc: Cong Wang Cc: Dave Hansen Cc: Johannes Weiner Cc: Mel Gorman Cc: Michal Hocko Cc: Vlastimil Babka Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jan Kara Cc: Mathieu Desnoyers Cc: Tetsuo Handa Cc: Byungchul Park Cc: Tejun Heo Cc: Pavel Machek Cc: linux-kernel@vger.kernel.org Signed-off-by: Steven Rostedt (VMware) [pmladek@suse.com: Commit message about possible deadlocks] Acked-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 108 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 107 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 5d81206a572d..040fb948924e 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -86,8 +86,15 @@ EXPORT_SYMBOL_GPL(console_drivers); static struct lockdep_map console_lock_dep_map = { .name = "console_lock" }; +static struct lockdep_map console_owner_dep_map = { + .name = "console_owner" +}; #endif +static DEFINE_RAW_SPINLOCK(console_owner_lock); +static struct task_struct *console_owner; +static bool console_waiter; + enum devkmsg_log_bits { __DEVKMSG_LOG_BIT_ON = 0, __DEVKMSG_LOG_BIT_OFF, @@ -1753,8 +1760,56 @@ asmlinkage int vprintk_emit(int facility, int level, * semaphore. The release will print out buffers and wake up * /dev/kmsg and syslog() users. */ - if (console_trylock()) + if (console_trylock()) { console_unlock(); + } else { + struct task_struct *owner = NULL; + bool waiter; + bool spin = false; + + printk_safe_enter_irqsave(flags); + + raw_spin_lock(&console_owner_lock); + owner = READ_ONCE(console_owner); + waiter = READ_ONCE(console_waiter); + if (!waiter && owner && owner != current) { + WRITE_ONCE(console_waiter, true); + spin = true; + } + raw_spin_unlock(&console_owner_lock); + + /* + * If there is an active printk() writing to the + * consoles, instead of having it write our data too, + * see if we can offload that load from the active + * printer, and do some printing ourselves. + * Go into a spin only if there isn't already a waiter + * spinning, and there is an active printer, and + * that active printer isn't us (recursive printk?). + */ + if (spin) { + /* We spin waiting for the owner to release us */ + spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_); + /* Owner will clear console_waiter on hand off */ + while (READ_ONCE(console_waiter)) + cpu_relax(); + + spin_release(&console_owner_dep_map, 1, _THIS_IP_); + printk_safe_exit_irqrestore(flags); + + /* + * The owner passed the console lock to us. + * Since we did not spin on console lock, annotate + * this as a trylock. Otherwise lockdep will + * complain. + */ + mutex_acquire(&console_lock_dep_map, 0, 1, _THIS_IP_); + console_unlock(); + printk_safe_enter_irqsave(flags); + } + printk_safe_exit_irqrestore(flags); + + } } return printed_len; @@ -2141,6 +2196,7 @@ void console_unlock(void) static u64 seen_seq; unsigned long flags; bool wake_klogd = false; + bool waiter = false; bool do_cond_resched, retry; if (console_suspended) { @@ -2229,14 +2285,64 @@ skip: console_seq++; raw_spin_unlock(&logbuf_lock); + /* + * While actively printing out messages, if another printk() + * were to occur on another CPU, it may wait for this one to + * finish. This task can not be preempted if there is a + * waiter waiting to take over. + */ + raw_spin_lock(&console_owner_lock); + console_owner = current; + raw_spin_unlock(&console_owner_lock); + + /* The waiter may spin on us after setting console_owner */ + spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_); + stop_critical_timings(); /* don't trace print latency */ call_console_drivers(ext_text, ext_len, text, len); start_critical_timings(); + + raw_spin_lock(&console_owner_lock); + waiter = READ_ONCE(console_waiter); + console_owner = NULL; + raw_spin_unlock(&console_owner_lock); + + /* + * If there is a waiter waiting for us, then pass the + * rest of the work load over to that waiter. + */ + if (waiter) + break; + + /* There was no waiter, and nothing will spin on us here */ + spin_release(&console_owner_dep_map, 1, _THIS_IP_); + printk_safe_exit_irqrestore(flags); if (do_cond_resched) cond_resched(); } + + /* + * If there is an active waiter waiting on the console_lock. + * Pass off the printing to the waiter, and the waiter + * will continue printing on its CPU, and when all writing + * has finished, the last printer will wake up klogd. + */ + if (waiter) { + WRITE_ONCE(console_waiter, false); + /* The waiter is now free to continue */ + spin_release(&console_owner_dep_map, 1, _THIS_IP_); + /* + * Hand off console_lock to waiter. The waiter will perform + * the up(). After this, the waiter is the console_lock owner. + */ + mutex_release(&console_lock_dep_map, 1, _THIS_IP_); + printk_safe_exit_irqrestore(flags); + /* Note, if waiter is set, logbuf_lock is not held */ + return; + } + console_locked = 0; /* Release the exclusive_console once it is used */ -- cgit v1.2.2 From c162d5b4338d72deed61aa65ed0f2f4ba2bbc8ab Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Fri, 12 Jan 2018 17:08:37 +0100 Subject: printk: Hide console waiter logic into helpers The commit ("printk: Add console owner and waiter logic to load balance console writes") made vprintk_emit() and console_unlock() even more complicated. This patch extracts the new code into 3 helper functions. They should help to keep it rather self-contained. It will be easier to use and maintain. This patch just shuffles the existing code. It does not change the functionality. Link: http://lkml.kernel.org/r/20180112160837.GD24497@linux.suse Cc: akpm@linux-foundation.org Cc: linux-mm@kvack.org Cc: Cong Wang Cc: Dave Hansen Cc: Johannes Weiner Cc: Mel Gorman Cc: Michal Hocko Cc: Vlastimil Babka Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jan Kara Cc: Mathieu Desnoyers Cc: Tetsuo Handa Cc: rostedt@home.goodmis.org Cc: Byungchul Park Cc: Tejun Heo Cc: Pavel Machek Cc: linux-kernel@vger.kernel.org Reviewed-by: Steven Rostedt (VMware) Acked-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 245 +++++++++++++++++++++++++++++-------------------- 1 file changed, 148 insertions(+), 97 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 040fb948924e..3a475f58b749 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -86,15 +86,8 @@ EXPORT_SYMBOL_GPL(console_drivers); static struct lockdep_map console_lock_dep_map = { .name = "console_lock" }; -static struct lockdep_map console_owner_dep_map = { - .name = "console_owner" -}; #endif -static DEFINE_RAW_SPINLOCK(console_owner_lock); -static struct task_struct *console_owner; -static bool console_waiter; - enum devkmsg_log_bits { __DEVKMSG_LOG_BIT_ON = 0, __DEVKMSG_LOG_BIT_OFF, @@ -1550,6 +1543,146 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) return do_syslog(type, buf, len, SYSLOG_FROM_READER); } +/* + * Special console_lock variants that help to reduce the risk of soft-lockups. + * They allow to pass console_lock to another printk() call using a busy wait. + */ + +#ifdef CONFIG_LOCKDEP +static struct lockdep_map console_owner_dep_map = { + .name = "console_owner" +}; +#endif + +static DEFINE_RAW_SPINLOCK(console_owner_lock); +static struct task_struct *console_owner; +static bool console_waiter; + +/** + * console_lock_spinning_enable - mark beginning of code where another + * thread might safely busy wait + * + * This basically converts console_lock into a spinlock. This marks + * the section where the console_lock owner can not sleep, because + * there may be a waiter spinning (like a spinlock). Also it must be + * ready to hand over the lock at the end of the section. + */ +static void console_lock_spinning_enable(void) +{ + raw_spin_lock(&console_owner_lock); + console_owner = current; + raw_spin_unlock(&console_owner_lock); + + /* The waiter may spin on us after setting console_owner */ + spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_); +} + +/** + * console_lock_spinning_disable_and_check - mark end of code where another + * thread was able to busy wait and check if there is a waiter + * + * This is called at the end of the section where spinning is allowed. + * It has two functions. First, it is a signal that it is no longer + * safe to start busy waiting for the lock. Second, it checks if + * there is a busy waiter and passes the lock rights to her. + * + * Important: Callers lose the lock if there was a busy waiter. + * They must not touch items synchronized by console_lock + * in this case. + * + * Return: 1 if the lock rights were passed, 0 otherwise. + */ +static int console_lock_spinning_disable_and_check(void) +{ + int waiter; + + raw_spin_lock(&console_owner_lock); + waiter = READ_ONCE(console_waiter); + console_owner = NULL; + raw_spin_unlock(&console_owner_lock); + + if (!waiter) { + spin_release(&console_owner_dep_map, 1, _THIS_IP_); + return 0; + } + + /* The waiter is now free to continue */ + WRITE_ONCE(console_waiter, false); + + spin_release(&console_owner_dep_map, 1, _THIS_IP_); + + /* + * Hand off console_lock to waiter. The waiter will perform + * the up(). After this, the waiter is the console_lock owner. + */ + mutex_release(&console_lock_dep_map, 1, _THIS_IP_); + return 1; +} + +/** + * console_trylock_spinning - try to get console_lock by busy waiting + * + * This allows to busy wait for the console_lock when the current + * owner is running in specially marked sections. It means that + * the current owner is running and cannot reschedule until it + * is ready to lose the lock. + * + * Return: 1 if we got the lock, 0 othrewise + */ +static int console_trylock_spinning(void) +{ + struct task_struct *owner = NULL; + bool waiter; + bool spin = false; + unsigned long flags; + + if (console_trylock()) + return 1; + + printk_safe_enter_irqsave(flags); + + raw_spin_lock(&console_owner_lock); + owner = READ_ONCE(console_owner); + waiter = READ_ONCE(console_waiter); + if (!waiter && owner && owner != current) { + WRITE_ONCE(console_waiter, true); + spin = true; + } + raw_spin_unlock(&console_owner_lock); + + /* + * If there is an active printk() writing to the + * consoles, instead of having it write our data too, + * see if we can offload that load from the active + * printer, and do some printing ourselves. + * Go into a spin only if there isn't already a waiter + * spinning, and there is an active printer, and + * that active printer isn't us (recursive printk?). + */ + if (!spin) { + printk_safe_exit_irqrestore(flags); + return 0; + } + + /* We spin waiting for the owner to release us */ + spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_); + /* Owner will clear console_waiter on hand off */ + while (READ_ONCE(console_waiter)) + cpu_relax(); + spin_release(&console_owner_dep_map, 1, _THIS_IP_); + + printk_safe_exit_irqrestore(flags); + /* + * The owner passed the console lock to us. + * Since we did not spin on console lock, annotate + * this as a trylock. Otherwise lockdep will + * complain. + */ + mutex_acquire(&console_lock_dep_map, 0, 1, _THIS_IP_); + + return 1; +} + /* * Call the console drivers, asking them to write out * log_buf[start] to log_buf[end - 1]. @@ -1760,56 +1893,8 @@ asmlinkage int vprintk_emit(int facility, int level, * semaphore. The release will print out buffers and wake up * /dev/kmsg and syslog() users. */ - if (console_trylock()) { + if (console_trylock_spinning()) console_unlock(); - } else { - struct task_struct *owner = NULL; - bool waiter; - bool spin = false; - - printk_safe_enter_irqsave(flags); - - raw_spin_lock(&console_owner_lock); - owner = READ_ONCE(console_owner); - waiter = READ_ONCE(console_waiter); - if (!waiter && owner && owner != current) { - WRITE_ONCE(console_waiter, true); - spin = true; - } - raw_spin_unlock(&console_owner_lock); - - /* - * If there is an active printk() writing to the - * consoles, instead of having it write our data too, - * see if we can offload that load from the active - * printer, and do some printing ourselves. - * Go into a spin only if there isn't already a waiter - * spinning, and there is an active printer, and - * that active printer isn't us (recursive printk?). - */ - if (spin) { - /* We spin waiting for the owner to release us */ - spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_); - /* Owner will clear console_waiter on hand off */ - while (READ_ONCE(console_waiter)) - cpu_relax(); - - spin_release(&console_owner_dep_map, 1, _THIS_IP_); - printk_safe_exit_irqrestore(flags); - - /* - * The owner passed the console lock to us. - * Since we did not spin on console lock, annotate - * this as a trylock. Otherwise lockdep will - * complain. - */ - mutex_acquire(&console_lock_dep_map, 0, 1, _THIS_IP_); - console_unlock(); - printk_safe_enter_irqsave(flags); - } - printk_safe_exit_irqrestore(flags); - - } } return printed_len; @@ -1910,6 +1995,8 @@ static ssize_t msg_print_ext_header(char *buf, size_t size, static ssize_t msg_print_ext_body(char *buf, size_t size, char *dict, size_t dict_len, char *text, size_t text_len) { return 0; } +static void console_lock_spinning_enable(void) { } +static int console_lock_spinning_disable_and_check(void) { return 0; } static void call_console_drivers(const char *ext_text, size_t ext_len, const char *text, size_t len) {} static size_t msg_print_text(const struct printk_log *msg, @@ -2196,7 +2283,6 @@ void console_unlock(void) static u64 seen_seq; unsigned long flags; bool wake_klogd = false; - bool waiter = false; bool do_cond_resched, retry; if (console_suspended) { @@ -2291,31 +2377,16 @@ skip: * finish. This task can not be preempted if there is a * waiter waiting to take over. */ - raw_spin_lock(&console_owner_lock); - console_owner = current; - raw_spin_unlock(&console_owner_lock); - - /* The waiter may spin on us after setting console_owner */ - spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_); + console_lock_spinning_enable(); stop_critical_timings(); /* don't trace print latency */ call_console_drivers(ext_text, ext_len, text, len); start_critical_timings(); - raw_spin_lock(&console_owner_lock); - waiter = READ_ONCE(console_waiter); - console_owner = NULL; - raw_spin_unlock(&console_owner_lock); - - /* - * If there is a waiter waiting for us, then pass the - * rest of the work load over to that waiter. - */ - if (waiter) - break; - - /* There was no waiter, and nothing will spin on us here */ - spin_release(&console_owner_dep_map, 1, _THIS_IP_); + if (console_lock_spinning_disable_and_check()) { + printk_safe_exit_irqrestore(flags); + return; + } printk_safe_exit_irqrestore(flags); @@ -2323,26 +2394,6 @@ skip: cond_resched(); } - /* - * If there is an active waiter waiting on the console_lock. - * Pass off the printing to the waiter, and the waiter - * will continue printing on its CPU, and when all writing - * has finished, the last printer will wake up klogd. - */ - if (waiter) { - WRITE_ONCE(console_waiter, false); - /* The waiter is now free to continue */ - spin_release(&console_owner_dep_map, 1, _THIS_IP_); - /* - * Hand off console_lock to waiter. The waiter will perform - * the up(). After this, the waiter is the console_lock owner. - */ - mutex_release(&console_lock_dep_map, 1, _THIS_IP_); - printk_safe_exit_irqrestore(flags); - /* Note, if waiter is set, logbuf_lock is not held */ - return; - } - console_locked = 0; /* Release the exclusive_console once it is used */ -- cgit v1.2.2 From fd5f7cde1b85d4c8e09ca46ce948e008a2377f64 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 16 Jan 2018 13:47:16 +0900 Subject: printk: Never set console_may_schedule in console_trylock() This patch, basically, reverts commit 6b97a20d3a79 ("printk: set may_schedule for some of console_trylock() callers"). That commit was a mistake, it introduced a big dependency on the scheduler, by enabling preemption under console_sem in printk()->console_unlock() path, which is rather too critical. The patch did not significantly reduce the possibilities of printk() lockups, but made it possible to stall printk(), as has been reported by Tetsuo Handa [1]. Another issues is that preemption under console_sem also messes up with Steven Rostedt's hand off scheme, by making it possible to sleep with console_sem both in console_unlock() and in vprintk_emit(), after acquiring the console_sem ownership (anywhere between printk_safe_exit_irqrestore() in console_trylock_spinning() and printk_safe_enter_irqsave() in console_unlock()). This makes hand off less likely and, at the same time, may result in a significant amount of pending logbuf messages. Preempted console_sem owner makes it impossible for other CPUs to emit logbuf messages, but does not make it impossible for other CPUs to append new messages to the logbuf. Reinstate the old behavior and make printk() non-preemptible. Should any printk() lockup reports arrive they must be handled in a different way. [1] http://lkml.kernel.org/r/201603022101.CAH73907.OVOOMFHFFtQJSL%20()%20I-love%20!%20SAKURA%20!%20ne%20!%20jp Fixes: 6b97a20d3a79 ("printk: set may_schedule for some of console_trylock() callers") Link: http://lkml.kernel.org/r/20180116044716.GE6607@jagdpanzerIV To: Tetsuo Handa Cc: Sergey Senozhatsky Cc: Tejun Heo Cc: akpm@linux-foundation.org Cc: linux-mm@kvack.org Cc: Cong Wang Cc: Dave Hansen Cc: Johannes Weiner Cc: Mel Gorman Cc: Michal Hocko Cc: Vlastimil Babka Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jan Kara Cc: Mathieu Desnoyers Cc: Byungchul Park Cc: Pavel Machek Cc: linux-kernel@vger.kernel.org Signed-off-by: Sergey Senozhatsky Reported-by: Tetsuo Handa Reviewed-by: Steven Rostedt (VMware) Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 3a475f58b749..6b9d8d56e0e2 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1888,6 +1888,12 @@ asmlinkage int vprintk_emit(int facility, int level, /* If called from the scheduler, we can not call up(). */ if (!in_sched) { + /* + * Disable preemption to avoid being preempted while holding + * console_sem which would prevent anyone from printing to + * console + */ + preempt_disable(); /* * Try to acquire and then immediately release the console * semaphore. The release will print out buffers and wake up @@ -1895,6 +1901,7 @@ asmlinkage int vprintk_emit(int facility, int level, */ if (console_trylock_spinning()) console_unlock(); + preempt_enable(); } return printed_len; @@ -2211,20 +2218,7 @@ int console_trylock(void) return 0; } console_locked = 1; - /* - * When PREEMPT_COUNT disabled we can't reliably detect if it's - * safe to schedule (e.g. calling printk while holding a spin_lock), - * because preempt_disable()/preempt_enable() are just barriers there - * and preempt_count() is always 0. - * - * RCU read sections have a separate preemption counter when - * PREEMPT_RCU enabled thus we must take extra care and check - * rcu_preempt_depth(), otherwise RCU read sections modify - * preempt_count(). - */ - console_may_schedule = !oops_in_progress && - preemptible() && - !rcu_preempt_depth(); + console_may_schedule = 0; return 1; } EXPORT_SYMBOL(console_trylock); -- cgit v1.2.2 From 0752d7bf626ba90c68caff8be4e5638f6679cacf Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 24 Jul 2017 15:08:16 -0500 Subject: ptrace: Use copy_siginfo in setsiginfo and getsiginfo Now that copy_siginfo copies all of the fields this is safe, safer (as all of the bits are guaranteed to be copied), clearer, and less error prone than using a structure copy. Signed-off-by: "Eric W. Biederman" --- kernel/ptrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index ec4365da9be8..f3c82e26b995 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -659,7 +659,7 @@ static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info) if (lock_task_sighand(child, &flags)) { error = -EINVAL; if (likely(child->last_siginfo != NULL)) { - *info = *child->last_siginfo; + copy_siginfo(info, child->last_siginfo); error = 0; } unlock_task_sighand(child, &flags); @@ -675,7 +675,7 @@ static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info) if (lock_task_sighand(child, &flags)) { error = -EINVAL; if (likely(child->last_siginfo != NULL)) { - *child->last_siginfo = *info; + copy_siginfo(child->last_siginfo, info); error = 0; } unlock_task_sighand(child, &flags); -- cgit v1.2.2 From 0fe875c5c7bc67072b629f13cf1ededcb4da4fb2 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 16 Jan 2018 11:27:05 +0000 Subject: bpf: cpumap: make some functions static Fixes the following sparse warnings: kernel/bpf/cpumap.c:146:6: warning: symbol '__cpu_map_queue_destructor' was not declared. Should it be static? kernel/bpf/cpumap.c:225:16: warning: symbol 'cpu_map_build_skb' was not declared. Should it be static? kernel/bpf/cpumap.c:340:26: warning: symbol '__cpu_map_entry_alloc' was not declared. Should it be static? kernel/bpf/cpumap.c:398:6: warning: symbol '__cpu_map_entry_free' was not declared. Should it be static? kernel/bpf/cpumap.c:441:6: warning: symbol '__cpu_map_entry_replace' was not declared. Should it be static? kernel/bpf/cpumap.c:454:5: warning: symbol 'cpu_map_delete_elem' was not declared. Should it be static? kernel/bpf/cpumap.c:467:5: warning: symbol 'cpu_map_update_elem' was not declared. Should it be static? kernel/bpf/cpumap.c:505:6: warning: symbol 'cpu_map_free' was not declared. Should it be static? Signed-off-by: Wei Yongjun Signed-off-by: Daniel Borkmann --- kernel/bpf/cpumap.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 192151ec9d12..fbfdada6caee 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -137,7 +137,7 @@ free_cmap: return ERR_PTR(err); } -void __cpu_map_queue_destructor(void *ptr) +static void __cpu_map_queue_destructor(void *ptr) { /* The tear-down procedure should have made sure that queue is * empty. See __cpu_map_entry_replace() and work-queue @@ -216,8 +216,8 @@ static struct xdp_pkt *convert_to_xdp_pkt(struct xdp_buff *xdp) return xdp_pkt; } -struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, - struct xdp_pkt *xdp_pkt) +static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, + struct xdp_pkt *xdp_pkt) { unsigned int frame_size; void *pkt_data_start; @@ -331,7 +331,8 @@ static int cpu_map_kthread_run(void *data) return 0; } -struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, int map_id) +static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, + int map_id) { gfp_t gfp = GFP_ATOMIC|__GFP_NOWARN; struct bpf_cpu_map_entry *rcpu; @@ -389,7 +390,7 @@ free_rcu: return NULL; } -void __cpu_map_entry_free(struct rcu_head *rcu) +static void __cpu_map_entry_free(struct rcu_head *rcu) { struct bpf_cpu_map_entry *rcpu; int cpu; @@ -432,8 +433,8 @@ void __cpu_map_entry_free(struct rcu_head *rcu) * cpu_map_kthread_stop, which waits for an RCU graze period before * stopping kthread, emptying the queue. */ -void __cpu_map_entry_replace(struct bpf_cpu_map *cmap, - u32 key_cpu, struct bpf_cpu_map_entry *rcpu) +static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap, + u32 key_cpu, struct bpf_cpu_map_entry *rcpu) { struct bpf_cpu_map_entry *old_rcpu; @@ -445,7 +446,7 @@ void __cpu_map_entry_replace(struct bpf_cpu_map *cmap, } } -int cpu_map_delete_elem(struct bpf_map *map, void *key) +static int cpu_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); u32 key_cpu = *(u32 *)key; @@ -458,8 +459,8 @@ int cpu_map_delete_elem(struct bpf_map *map, void *key) return 0; } -int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, - u64 map_flags) +static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); struct bpf_cpu_map_entry *rcpu; @@ -496,7 +497,7 @@ int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, return 0; } -void cpu_map_free(struct bpf_map *map) +static void cpu_map_free(struct bpf_map *map) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); int cpu; -- cgit v1.2.2 From 0a2d28ff516c48a21c1e3be1ff7fba3e94248baa Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Jan 2018 15:51:45 -0800 Subject: bpf: offload: make bpf_offload_dev_match() reject host+host case Daniel suggests it would be more logical for bpf_offload_dev_match() to return false is either the program or the map are not offloaded, rather than treating the both not offloaded case as a "matching CPU/host device". This makes no functional difference today, since verifier only calls bpf_offload_dev_match() when one of the objects is offloaded. Signed-off-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- kernel/bpf/offload.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 453785fa1881..a88cebf368bf 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -395,10 +395,8 @@ bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map) struct bpf_prog_offload *offload; bool ret; - if (!!bpf_prog_is_dev_bound(prog->aux) != !!bpf_map_is_dev_bound(map)) + if (!bpf_prog_is_dev_bound(prog->aux) || !bpf_map_is_dev_bound(map)) return false; - if (!bpf_prog_is_dev_bound(prog->aux)) - return true; down_read(&bpf_devs_lock); offload = prog->aux->offload; -- cgit v1.2.2 From 48b325632641da27a241912d66b38a1221ab425f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Jan 2018 15:51:46 -0800 Subject: bpf: annotate bpf_insn_print_t with __printf Functions of type bpf_insn_print_t take printf-like format string, mark the type accordingly. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- kernel/bpf/disasm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h index e0857d016f89..266fe8ee542b 100644 --- a/kernel/bpf/disasm.h +++ b/kernel/bpf/disasm.h @@ -29,8 +29,8 @@ extern const char *const bpf_class_string[8]; const char *func_id_name(int id); -typedef void (*bpf_insn_print_t)(struct bpf_verifier_env *env, - const char *, ...); +typedef __printf(2, 3) void (*bpf_insn_print_t)(struct bpf_verifier_env *env, + const char *, ...); typedef const char *(*bpf_insn_revmap_call_t)(void *private_data, const struct bpf_insn *insn); typedef const char *(*bpf_insn_print_imm_t)(void *private_data, -- cgit v1.2.2 From fcfb126defda3cee3f1d9460dbe9a2ccac4fbd21 Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Tue, 16 Jan 2018 16:05:19 -0800 Subject: bpf: add new jited info fields in bpf_dev_offload and bpf_prog_info For host JIT, there are "jited_len"/"bpf_func" fields in struct bpf_prog used by all host JIT targets to get jited image and it's length. While for offload, targets are likely to have different offload mechanisms that these info are kept in device private data fields. Therefore, BPF_OBJ_GET_INFO_BY_FD syscall needs an unified way to get JIT length and contents info for offload targets. One way is to introduce new callback to parse device private data then fill those fields in bpf_prog_info. This might be a little heavy, the other way is to add generic fields which will be initialized by all offload targets. This patch follow the second approach to introduce two new fields in struct bpf_dev_offload and teach bpf_prog_get_info_by_fd about them to fill correct jited_prog_len and jited_prog_insns in bpf_prog_info. Reviewed-by: Jakub Kicinski Signed-off-by: Jiong Wang Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/offload.c | 23 +++++++++++++++++++++++ kernel/bpf/syscall.c | 31 ++++++++++++++++++------------- 2 files changed, 41 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index a88cebf368bf..6c0baa1cf8f8 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -230,9 +230,12 @@ int bpf_prog_offload_info_fill(struct bpf_prog_info *info, .prog = prog, .info = info, }; + struct bpf_prog_aux *aux = prog->aux; struct inode *ns_inode; struct path ns_path; + char __user *uinsns; void *res; + u32 ulen; res = ns_get_path_cb(&ns_path, bpf_prog_offload_info_fill_ns, &args); if (IS_ERR(res)) { @@ -241,6 +244,26 @@ int bpf_prog_offload_info_fill(struct bpf_prog_info *info, return PTR_ERR(res); } + down_read(&bpf_devs_lock); + + if (!aux->offload) { + up_read(&bpf_devs_lock); + return -ENODEV; + } + + ulen = info->jited_prog_len; + info->jited_prog_len = aux->offload->jited_len; + if (info->jited_prog_len & ulen) { + uinsns = u64_to_user_ptr(info->jited_prog_insns); + ulen = min_t(u32, info->jited_prog_len, ulen); + if (copy_to_user(uinsns, aux->offload->jited_image, ulen)) { + up_read(&bpf_devs_lock); + return -EFAULT; + } + } + + up_read(&bpf_devs_lock); + ns_inode = ns_path.dentry->d_inode; info->netns_dev = new_encode_dev(ns_inode->i_sb->s_dev); info->netns_ino = ns_inode->i_ino; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c691b9e972e3..c28524483bf4 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1724,19 +1724,6 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, goto done; } - ulen = info.jited_prog_len; - info.jited_prog_len = prog->jited_len; - if (info.jited_prog_len && ulen) { - if (bpf_dump_raw_ok()) { - uinsns = u64_to_user_ptr(info.jited_prog_insns); - ulen = min_t(u32, info.jited_prog_len, ulen); - if (copy_to_user(uinsns, prog->bpf_func, ulen)) - return -EFAULT; - } else { - info.jited_prog_insns = 0; - } - } - ulen = info.xlated_prog_len; info.xlated_prog_len = bpf_prog_insn_size(prog); if (info.xlated_prog_len && ulen) { @@ -1762,6 +1749,24 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, err = bpf_prog_offload_info_fill(&info, prog); if (err) return err; + goto done; + } + + /* NOTE: the following code is supposed to be skipped for offload. + * bpf_prog_offload_info_fill() is the place to fill similar fields + * for offload. + */ + ulen = info.jited_prog_len; + info.jited_prog_len = prog->jited_len; + if (info.jited_prog_len && ulen) { + if (bpf_dump_raw_ok()) { + uinsns = u64_to_user_ptr(info.jited_prog_insns); + ulen = min_t(u32, info.jited_prog_len, ulen); + if (copy_to_user(uinsns, prog->bpf_func, ulen)) + return -EFAULT; + } else { + info.jited_prog_insns = 0; + } } done: -- cgit v1.2.2 From eefa864a81501161c05b35f14197677c937e5b9a Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 17 Jan 2018 09:19:32 -0800 Subject: bpf: change fake_ip for bpf_trace_printk helper Currently, for bpf_trace_printk helper, fake ip address 0x1 is used with comments saying that fake ip will not be printed. This is indeed true for 4.12 and earlier version, but for 4.13 and later version, the ip address will be printed if it cannot be resolved with kallsym. Running samples/bpf/tracex5 program and you will have the following in the debugfs trace_pipe output: ... <...>-1819 [003] .... 443.497877: 0x00000001: mmap <...>-1819 [003] .... 443.498289: 0x00000001: syscall=102 (one of get/set uid/pid/gid) ... The kernel commit changed this behavior is: commit feaf1283d11794b9d518fcfd54b6bf8bee1f0b4b Author: Steven Rostedt (VMware) Date: Thu Jun 22 17:04:55 2017 -0400 tracing: Show address when function names are not found ... This patch changed the comment and also altered the fake ip address to 0x0 as users may think 0x1 has some special meaning while it doesn't. The new output: ... <...>-1799 [002] .... 25.953576: 0: mmap <...>-1799 [002] .... 25.953865: 0: read(fd=0, buf=00000000053936b5, size=512) ... Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann --- kernel/trace/bpf_trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index f274468cbc45..fc2838ac8b78 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -245,7 +245,7 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, */ #define __BPF_TP_EMIT() __BPF_ARG3_TP() #define __BPF_TP(...) \ - __trace_printk(1 /* Fake ip will not be printed. */, \ + __trace_printk(0 /* Fake ip */, \ fmt, ##__VA_ARGS__) #define __BPF_ARG1_TP(...) \ -- cgit v1.2.2 From 61f3c964dfd287b05d7ac6660a4f4ddfef84786c Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 17 Jan 2018 16:52:02 -0800 Subject: bpf: allow socket_filter programs to use bpf_prog_test_run in order to improve test coverage allow socket_filter program type to be run via bpf_prog_test_run command. Since such programs can be loaded by non-root tighten permissions for bpf_prog_test_run to be root only to avoid surprises. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/syscall.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c28524483bf4..97a825ffc763 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1504,6 +1504,8 @@ static int bpf_prog_test_run(const union bpf_attr *attr, struct bpf_prog *prog; int ret = -ENOTSUPP; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; if (CHECK_ATTR(BPF_PROG_TEST_RUN)) return -EINVAL; -- cgit v1.2.2 From ad46061fca87c0ab6670af3a44e03237f99d7a1f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 17 Jan 2018 19:13:25 -0800 Subject: bpf: arraymap: move checks out of alloc function Use the new callback to perform allocation checks for array maps. The fd maps don't need a special allocation callback, they only need a special check callback. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- kernel/bpf/arraymap.c | 42 ++++++++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index ab94d304a634..68336092bfb4 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -49,27 +49,35 @@ static int bpf_array_alloc_percpu(struct bpf_array *array) } /* Called from syscall */ -static struct bpf_map *array_map_alloc(union bpf_attr *attr) +static int array_map_alloc_check(union bpf_attr *attr) { bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; int numa_node = bpf_map_attr_numa_node(attr); - u32 elem_size, index_mask, max_entries; - bool unpriv = !capable(CAP_SYS_ADMIN); - struct bpf_array *array; - u64 array_size, mask64; /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || attr->value_size == 0 || attr->map_flags & ~ARRAY_CREATE_FLAG_MASK || (percpu && numa_node != NUMA_NO_NODE)) - return ERR_PTR(-EINVAL); + return -EINVAL; if (attr->value_size > KMALLOC_MAX_SIZE) /* if value_size is bigger, the user space won't be able to * access the elements. */ - return ERR_PTR(-E2BIG); + return -E2BIG; + + return 0; +} + +static struct bpf_map *array_map_alloc(union bpf_attr *attr) +{ + bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; + int numa_node = bpf_map_attr_numa_node(attr); + u32 elem_size, index_mask, max_entries; + bool unpriv = !capable(CAP_SYS_ADMIN); + struct bpf_array *array; + u64 array_size, mask64; elem_size = round_up(attr->value_size, 8); @@ -327,6 +335,7 @@ static void array_map_free(struct bpf_map *map) } const struct bpf_map_ops array_map_ops = { + .map_alloc_check = array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = array_map_free, .map_get_next_key = array_map_get_next_key, @@ -337,6 +346,7 @@ const struct bpf_map_ops array_map_ops = { }; const struct bpf_map_ops percpu_array_map_ops = { + .map_alloc_check = array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = array_map_free, .map_get_next_key = array_map_get_next_key, @@ -345,12 +355,12 @@ const struct bpf_map_ops percpu_array_map_ops = { .map_delete_elem = array_map_delete_elem, }; -static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr) +static int fd_array_map_alloc_check(union bpf_attr *attr) { /* only file descriptors can be stored in this type of map */ if (attr->value_size != sizeof(u32)) - return ERR_PTR(-EINVAL); - return array_map_alloc(attr); + return -EINVAL; + return array_map_alloc_check(attr); } static void fd_array_map_free(struct bpf_map *map) @@ -474,7 +484,8 @@ void bpf_fd_array_map_clear(struct bpf_map *map) } const struct bpf_map_ops prog_array_map_ops = { - .map_alloc = fd_array_map_alloc, + .map_alloc_check = fd_array_map_alloc_check, + .map_alloc = array_map_alloc, .map_free = fd_array_map_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, @@ -561,7 +572,8 @@ static void perf_event_fd_array_release(struct bpf_map *map, } const struct bpf_map_ops perf_event_array_map_ops = { - .map_alloc = fd_array_map_alloc, + .map_alloc_check = fd_array_map_alloc_check, + .map_alloc = array_map_alloc, .map_free = fd_array_map_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, @@ -592,7 +604,8 @@ static void cgroup_fd_array_free(struct bpf_map *map) } const struct bpf_map_ops cgroup_array_map_ops = { - .map_alloc = fd_array_map_alloc, + .map_alloc_check = fd_array_map_alloc_check, + .map_alloc = array_map_alloc, .map_free = cgroup_fd_array_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, @@ -610,7 +623,7 @@ static struct bpf_map *array_of_map_alloc(union bpf_attr *attr) if (IS_ERR(inner_map_meta)) return inner_map_meta; - map = fd_array_map_alloc(attr); + map = array_map_alloc(attr); if (IS_ERR(map)) { bpf_map_meta_free(inner_map_meta); return map; @@ -673,6 +686,7 @@ static u32 array_of_map_gen_lookup(struct bpf_map *map, } const struct bpf_map_ops array_of_maps_map_ops = { + .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_of_map_alloc, .map_free = array_of_map_free, .map_get_next_key = array_map_get_next_key, -- cgit v1.2.2 From 32852649ba3f74aab10025f2e59ca2b49d5cccfa Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 17 Jan 2018 19:13:26 -0800 Subject: bpf: arraymap: use bpf_map_init_from_attr() Arraymap was not converted to use bpf_map_init_from_attr() to avoid merge conflicts with emergency fixes. Do it now. Signed-off-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- kernel/bpf/arraymap.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 68336092bfb4..b1f66480135b 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -120,12 +120,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) array->map.unpriv_array = unpriv; /* copy mandatory map attributes */ - array->map.map_type = attr->map_type; - array->map.key_size = attr->key_size; - array->map.value_size = attr->value_size; - array->map.max_entries = attr->max_entries; - array->map.map_flags = attr->map_flags; - array->map.numa_node = numa_node; + bpf_map_init_from_attr(&array->map, attr); array->elem_size = elem_size; if (!percpu) -- cgit v1.2.2 From 7a0ef6939548b9eb74bf464daf55ad68a23602a2 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 17 Jan 2018 19:13:27 -0800 Subject: bpf: offload: allow array map offload The special handling of different map types is left to the driver. Allow offload of array maps by simply adding it to accepted types. For nfp we have to make sure array elements are not deleted. Signed-off-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- kernel/bpf/offload.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 6c0baa1cf8f8..2657976aec2a 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -299,7 +299,8 @@ struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) if (!capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); - if (attr->map_type != BPF_MAP_TYPE_HASH) + if (attr->map_type != BPF_MAP_TYPE_ARRAY && + attr->map_type != BPF_MAP_TYPE_HASH) return ERR_PTR(-EINVAL); offmap = kzalloc(sizeof(*offmap), GFP_USER); -- cgit v1.2.2 From 52775b33bb5072fbc07b02c0cf4fe8da1f7ee7cd Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 17 Jan 2018 19:13:28 -0800 Subject: bpf: offload: report device information about offloaded maps Tell user space about device on which the map was created. Unfortunate reality of user ABI makes sharing this code with program offload difficult but the information is the same. Signed-off-by: Jakub Kicinski Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/offload.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 6 ++++++ 2 files changed, 61 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 2657976aec2a..c9401075b58c 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -413,6 +413,61 @@ int bpf_map_offload_get_next_key(struct bpf_map *map, void *key, void *next_key) return ret; } +struct ns_get_path_bpf_map_args { + struct bpf_offloaded_map *offmap; + struct bpf_map_info *info; +}; + +static struct ns_common *bpf_map_offload_info_fill_ns(void *private_data) +{ + struct ns_get_path_bpf_map_args *args = private_data; + struct ns_common *ns; + struct net *net; + + rtnl_lock(); + down_read(&bpf_devs_lock); + + if (args->offmap->netdev) { + args->info->ifindex = args->offmap->netdev->ifindex; + net = dev_net(args->offmap->netdev); + get_net(net); + ns = &net->ns; + } else { + args->info->ifindex = 0; + ns = NULL; + } + + up_read(&bpf_devs_lock); + rtnl_unlock(); + + return ns; +} + +int bpf_map_offload_info_fill(struct bpf_map_info *info, struct bpf_map *map) +{ + struct ns_get_path_bpf_map_args args = { + .offmap = map_to_offmap(map), + .info = info, + }; + struct inode *ns_inode; + struct path ns_path; + void *res; + + res = ns_get_path_cb(&ns_path, bpf_map_offload_info_fill_ns, &args); + if (IS_ERR(res)) { + if (!info->ifindex) + return -ENODEV; + return PTR_ERR(res); + } + + ns_inode = ns_path.dentry->d_inode; + info->netns_dev = new_encode_dev(ns_inode->i_sb->s_dev); + info->netns_ino = ns_inode->i_ino; + path_put(&ns_path); + + return 0; +} + bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map) { struct bpf_offloaded_map *offmap; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 97a825ffc763..5bdb0cc84ad2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1801,6 +1801,12 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map, info.map_flags = map->map_flags; memcpy(info.name, map->name, sizeof(map->name)); + if (bpf_map_is_dev_bound(map)) { + err = bpf_map_offload_info_fill(&info, map); + if (err) + return err; + } + if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) return -EFAULT; -- cgit v1.2.2 From 08a77676f9c5fc69a681ccd2cd8140e65dcb26c7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 9 Jan 2018 07:21:15 -0800 Subject: string: drop __must_check from strscpy() and restore strscpy() usages in cgroup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit e7fd37ba1217 ("cgroup: avoid copying strings longer than the buffers") converted possibly unsafe strncpy() usages in cgroup to strscpy(). However, although the callsites are completely fine with truncated copied, because strscpy() is marked __must_check, it led to the following warnings. kernel/cgroup/cgroup.c: In function ‘cgroup_file_name’: kernel/cgroup/cgroup.c:1400:10: warning: ignoring return value of ‘strscpy’, declared with attribute warn_unused_result [-Wunused-result] strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX); ^ To avoid the warnings, 50034ed49645 ("cgroup: use strlcpy() instead of strscpy() to avoid spurious warning") switched them to strlcpy(). strlcpy() is worse than strlcpy() because it unconditionally runs strlen() on the source string, and the only reason we switched to strlcpy() here was because it was lacking __must_check, which doesn't reflect any material differences between the two function. It's just that someone added __must_check to strscpy() and not to strlcpy(). These basic string copy operations are used in variety of ways, and one of not-so-uncommon use cases is safely handling truncated copies, where the caller naturally doesn't care about the return value. The __must_check doesn't match the actual use cases and forces users to opt for inferior variants which lack __must_check by happenstance or spread ugly (void) casts. Remove __must_check from strscpy() and restore strscpy() usages in cgroup. Signed-off-by: Tejun Heo Suggested-by: Linus Torvalds Cc: Ma Shimiao Cc: Arnd Bergmann Cc: Chris Metcalf --- kernel/cgroup/cgroup.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 7e4c44538119..8cda3bc3ae22 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1397,7 +1397,7 @@ static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name, cft->name); else - strlcpy(buf, cft->name, CGROUP_FILE_NAME_MAX); + strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX); return buf; } @@ -1864,9 +1864,9 @@ void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts) root->flags = opts->flags; if (opts->release_agent) - strlcpy(root->release_agent_path, opts->release_agent, PATH_MAX); + strscpy(root->release_agent_path, opts->release_agent, PATH_MAX); if (opts->name) - strlcpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN); + strscpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN); if (opts->cpuset_clone_children) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); } -- cgit v1.2.2 From b471f2f1de8b816f1e799b80aa92588f3566e4bd Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 18 Jan 2018 15:08:50 -0800 Subject: bpf: implement MAP_GET_NEXT_KEY command for LPM_TRIE map Current LPM_TRIE map type does not implement MAP_GET_NEXT_KEY command. This command is handy when users want to enumerate keys. Otherwise, a different map which supports key enumeration may be required to store the keys. If the map data is sparse and all map data are to be deleted without closing file descriptor, using MAP_GET_NEXT_KEY to find all keys is much faster than enumerating all key space. This patch implements MAP_GET_NEXT_KEY command for LPM_TRIE map. If user provided key pointer is NULL or the key does not have an exact match in the trie, the first key will be returned. Otherwise, the next key will be returned. In this implemenation, key enumeration follows a postorder traversal of internal trie. More specific keys will be returned first than less specific ones, given a sequence of MAP_GET_NEXT_KEY syscalls. Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann --- kernel/bpf/lpm_trie.c | 95 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 93 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 584e02227671..d7ea96218516 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -591,9 +591,100 @@ unlock: raw_spin_unlock(&trie->lock); } -static int trie_get_next_key(struct bpf_map *map, void *key, void *next_key) +static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) { - return -ENOTSUPP; + struct lpm_trie *trie = container_of(map, struct lpm_trie, map); + struct bpf_lpm_trie_key *key = _key, *next_key = _next_key; + struct lpm_trie_node *node, *next_node = NULL, *parent; + struct lpm_trie_node **node_stack = NULL; + struct lpm_trie_node __rcu **root; + int err = 0, stack_ptr = -1; + unsigned int next_bit; + size_t matchlen; + + /* The get_next_key follows postorder. For the 4 node example in + * the top of this file, the trie_get_next_key() returns the following + * one after another: + * 192.168.0.0/24 + * 192.168.1.0/24 + * 192.168.128.0/24 + * 192.168.0.0/16 + * + * The idea is to return more specific keys before less specific ones. + */ + + /* Empty trie */ + if (!rcu_dereference(trie->root)) + return -ENOENT; + + /* For invalid key, find the leftmost node in the trie */ + if (!key || key->prefixlen > trie->max_prefixlen) { + root = &trie->root; + goto find_leftmost; + } + + node_stack = kmalloc(trie->max_prefixlen * sizeof(struct lpm_trie_node *), + GFP_USER | __GFP_NOWARN); + if (!node_stack) + return -ENOMEM; + + /* Try to find the exact node for the given key */ + for (node = rcu_dereference(trie->root); node;) { + node_stack[++stack_ptr] = node; + matchlen = longest_prefix_match(trie, node, key); + if (node->prefixlen != matchlen || + node->prefixlen == key->prefixlen) + break; + + next_bit = extract_bit(key->data, node->prefixlen); + node = rcu_dereference(node->child[next_bit]); + } + if (!node || node->prefixlen != key->prefixlen || + (node->flags & LPM_TREE_NODE_FLAG_IM)) { + root = &trie->root; + goto find_leftmost; + } + + /* The node with the exactly-matching key has been found, + * find the first node in postorder after the matched node. + */ + node = node_stack[stack_ptr]; + while (stack_ptr > 0) { + parent = node_stack[stack_ptr - 1]; + if (rcu_dereference(parent->child[0]) == node && + rcu_dereference(parent->child[1])) { + root = &parent->child[1]; + goto find_leftmost; + } + if (!(parent->flags & LPM_TREE_NODE_FLAG_IM)) { + next_node = parent; + goto do_copy; + } + + node = parent; + stack_ptr--; + } + + /* did not find anything */ + err = -ENOENT; + goto free_stack; + +find_leftmost: + /* Find the leftmost non-intermediate node, all intermediate nodes + * have exact two children, so this function will never return NULL. + */ + for (node = rcu_dereference(*root); node;) { + if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) + next_node = node; + node = rcu_dereference(node->child[0]); + } +do_copy: + next_key->prefixlen = next_node->prefixlen; + memcpy((void *)next_key + offsetof(struct bpf_lpm_trie_key, data), + next_node->data, trie->data_size); +free_stack: + kfree(node_stack); + return err; } const struct bpf_map_ops trie_map_ops = { -- cgit v1.2.2 From 10a0cd6e4932b5078215b1ec2c896597eec0eff9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=2E=20Sch=C3=B6nherr?= Date: Fri, 19 Jan 2018 16:27:54 -0800 Subject: mm: Fix memory size alignment in devm_memremap_pages_release() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The functions devm_memremap_pages() and devm_memremap_pages_release() use different ways to calculate the section-aligned amount of memory. The latter function may use an incorrect size if the memory region is small but straddles a section border. Use the same code for both. Cc: Fixes: 5f29a77cd957 ("mm: fix mixed zone detection in devm_memremap_pages") Signed-off-by: Jan H. Schönherr Signed-off-by: Dan Williams --- kernel/memremap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index ada31b0d76d4..4ef97525a4ff 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -293,7 +293,8 @@ static void devm_memremap_pages_release(void *data) /* pages are dead and unused, undo the arch mapping */ align_start = res->start & ~(SECTION_SIZE - 1); - align_size = ALIGN(resource_size(res), SECTION_SIZE); + align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) + - align_start; mem_hotplug_begin(); arch_remove_memory(align_start, align_size, pgmap->altmap_valid ? -- cgit v1.2.2 From 77dd66a3c67c93ab401ccc15efff25578be281fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=2E=20Sch=C3=B6nherr?= Date: Fri, 19 Jan 2018 16:26:33 -0800 Subject: mm: Fix devm_memremap_pages() collision handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If devm_memremap_pages() detects a collision while adding entries to the radix-tree, we call pgmap_radix_release(). Unfortunately, the function removes *all* entries for the range -- including the entries that caused the collision in the first place. Modify pgmap_radix_release() to take an additional argument to indicate where to stop, so that only newly added entries are removed from the tree. Cc: Fixes: 9476df7d80df ("mm: introduce find_dev_pagemap()") Signed-off-by: Jan H. Schönherr Signed-off-by: Dan Williams --- kernel/memremap.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 4ef97525a4ff..4849be5f9b3c 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -241,13 +241,16 @@ int device_private_entry_fault(struct vm_area_struct *vma, EXPORT_SYMBOL(device_private_entry_fault); #endif /* CONFIG_DEVICE_PRIVATE */ -static void pgmap_radix_release(struct resource *res) +static void pgmap_radix_release(struct resource *res, unsigned long end_pgoff) { unsigned long pgoff, order; mutex_lock(&pgmap_lock); - foreach_order_pgoff(res, order, pgoff) + foreach_order_pgoff(res, order, pgoff) { + if (pgoff >= end_pgoff) + break; radix_tree_delete(&pgmap_radix, PHYS_PFN(res->start) + pgoff); + } mutex_unlock(&pgmap_lock); synchronize_rcu(); @@ -302,7 +305,7 @@ static void devm_memremap_pages_release(void *data) mem_hotplug_done(); untrack_pfn(NULL, PHYS_PFN(align_start), align_size); - pgmap_radix_release(res); + pgmap_radix_release(res, -1); dev_WARN_ONCE(dev, pgmap->altmap.alloc, "%s: failed to free all reserved pages\n", __func__); } @@ -418,7 +421,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) untrack_pfn(NULL, PHYS_PFN(align_start), align_size); err_pfn_remap: err_radix: - pgmap_radix_release(res); + pgmap_radix_release(res, pgoff); devres_free(pgmap); return ERR_PTR(error); } -- cgit v1.2.2 From 901334159419afc8c1b8556243fc53e9617472d2 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 20 Jan 2018 01:24:29 +0100 Subject: bpf, verifier: detect misconfigured mem, size argument pair I've seen two patch proposals now for helper additions that used ARG_PTR_TO_MEM or similar in reg_X but no corresponding ARG_CONST_SIZE in reg_X+1. Verifier won't complain in such case, but it will omit verifying the memory passed to the helper thus ending up badly. Detect such buggy helper function signature and bail out during verification rather than finding them through review. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 79 +++++++++++++++++++++++++++++++++++---------------- 1 file changed, 55 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2e7a43edf264..5eeb200e82c4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1837,6 +1837,19 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, } } +static bool arg_type_is_mem_ptr(enum bpf_arg_type type) +{ + return type == ARG_PTR_TO_MEM || + type == ARG_PTR_TO_MEM_OR_NULL || + type == ARG_PTR_TO_UNINIT_MEM; +} + +static bool arg_type_is_mem_size(enum bpf_arg_type type) +{ + return type == ARG_CONST_SIZE || + type == ARG_CONST_SIZE_OR_ZERO; +} + static int check_func_arg(struct bpf_verifier_env *env, u32 regno, enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta) @@ -1886,9 +1899,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, expected_type = PTR_TO_CTX; if (type != expected_type) goto err_type; - } else if (arg_type == ARG_PTR_TO_MEM || - arg_type == ARG_PTR_TO_MEM_OR_NULL || - arg_type == ARG_PTR_TO_UNINIT_MEM) { + } else if (arg_type_is_mem_ptr(arg_type)) { expected_type = PTR_TO_STACK; /* One exception here. In case function allows for NULL to be * passed in as argument, it's a SCALAR_VALUE type. Final test @@ -1949,25 +1960,12 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, err = check_stack_boundary(env, regno, meta->map_ptr->value_size, false, NULL); - } else if (arg_type == ARG_CONST_SIZE || - arg_type == ARG_CONST_SIZE_OR_ZERO) { + } else if (arg_type_is_mem_size(arg_type)) { bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO); - /* bpf_xxx(..., buf, len) call will access 'len' bytes - * from stack pointer 'buf'. Check it - * note: regno == len, regno - 1 == buf - */ - if (regno == 0) { - /* kernel subsystem misconfigured verifier */ - verbose(env, - "ARG_CONST_SIZE cannot be first argument\n"); - return -EACCES; - } - /* The register is SCALAR_VALUE; the access check * happens using its boundaries. */ - if (!tnum_is_const(reg->var_off)) /* For unprivileged variable accesses, disable raw * mode so that the program is required to @@ -2111,7 +2109,7 @@ error: return -EINVAL; } -static int check_raw_mode(const struct bpf_func_proto *fn) +static bool check_raw_mode_ok(const struct bpf_func_proto *fn) { int count = 0; @@ -2126,7 +2124,44 @@ static int check_raw_mode(const struct bpf_func_proto *fn) if (fn->arg5_type == ARG_PTR_TO_UNINIT_MEM) count++; - return count > 1 ? -EINVAL : 0; + /* We only support one arg being in raw mode at the moment, + * which is sufficient for the helper functions we have + * right now. + */ + return count <= 1; +} + +static bool check_args_pair_invalid(enum bpf_arg_type arg_curr, + enum bpf_arg_type arg_next) +{ + return (arg_type_is_mem_ptr(arg_curr) && + !arg_type_is_mem_size(arg_next)) || + (!arg_type_is_mem_ptr(arg_curr) && + arg_type_is_mem_size(arg_next)); +} + +static bool check_arg_pair_ok(const struct bpf_func_proto *fn) +{ + /* bpf_xxx(..., buf, len) call will access 'len' + * bytes from memory 'buf'. Both arg types need + * to be paired, so make sure there's no buggy + * helper function specification. + */ + if (arg_type_is_mem_size(fn->arg1_type) || + arg_type_is_mem_ptr(fn->arg5_type) || + check_args_pair_invalid(fn->arg1_type, fn->arg2_type) || + check_args_pair_invalid(fn->arg2_type, fn->arg3_type) || + check_args_pair_invalid(fn->arg3_type, fn->arg4_type) || + check_args_pair_invalid(fn->arg4_type, fn->arg5_type)) + return false; + + return true; +} + +static int check_func_proto(const struct bpf_func_proto *fn) +{ + return check_raw_mode_ok(fn) && + check_arg_pair_ok(fn) ? 0 : -EINVAL; } /* Packet data might have moved, any old PTR_TO_PACKET[_META,_END] @@ -2282,7 +2317,6 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn if (env->ops->get_func_proto) fn = env->ops->get_func_proto(func_id); - if (!fn) { verbose(env, "unknown func %s#%d\n", func_id_name(func_id), func_id); @@ -2306,10 +2340,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn memset(&meta, 0, sizeof(meta)); meta.pkt_access = fn->pkt_access; - /* We only support one arg being in raw mode at the moment, which - * is sufficient for the helper functions we have right now. - */ - err = check_raw_mode(fn); + err = check_func_proto(fn); if (err) { verbose(env, "kernel subsystem misconfigured func %s#%d\n", func_id_name(func_id), func_id); -- cgit v1.2.2 From fa9dd599b4dae841924b022768354cfde9affecb Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 20 Jan 2018 01:24:33 +0100 Subject: bpf: get rid of pure_initcall dependency to enable jits Having a pure_initcall() callback just to permanently enable BPF JITs under CONFIG_BPF_JIT_ALWAYS_ON is unnecessary and could leave a small race window in future where JIT is still disabled on boot. Since we know about the setting at compilation time anyway, just initialize it properly there. Also consolidate all the individual bpf_jit_enable variables into a single one and move them under one location. Moreover, don't allow for setting unspecified garbage values on them. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 25e723b0dfd4..bc9c5b11d6a9 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -300,6 +300,11 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, } #ifdef CONFIG_BPF_JIT +/* All BPF JIT sysctl knobs here. */ +int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON); +int bpf_jit_harden __read_mostly; +int bpf_jit_kallsyms __read_mostly; + static __always_inline void bpf_get_prog_addr_region(const struct bpf_prog *prog, unsigned long *symbol_start, @@ -381,8 +386,6 @@ static DEFINE_SPINLOCK(bpf_lock); static LIST_HEAD(bpf_kallsyms); static struct latch_tree_root bpf_tree __cacheline_aligned; -int bpf_jit_kallsyms __read_mostly; - static void bpf_prog_ksym_node_add(struct bpf_prog_aux *aux) { WARN_ON_ONCE(!list_empty(&aux->ksym_lnode)); @@ -563,8 +566,6 @@ void __weak bpf_jit_free(struct bpf_prog *fp) bpf_prog_unlock_free(fp); } -int bpf_jit_harden __read_mostly; - static int bpf_jit_blind_insn(const struct bpf_insn *from, const struct bpf_insn *aux, struct bpf_insn *to_buff) @@ -1379,9 +1380,13 @@ void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth) } #else -static unsigned int __bpf_prog_ret0(const void *ctx, - const struct bpf_insn *insn) +static unsigned int __bpf_prog_ret0_warn(const void *ctx, + const struct bpf_insn *insn) { + /* If this handler ever gets executed, then BPF_JIT_ALWAYS_ON + * is not working properly, so warn about it! + */ + WARN_ON_ONCE(1); return 0; } #endif @@ -1441,7 +1446,7 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1]; #else - fp->bpf_func = __bpf_prog_ret0; + fp->bpf_func = __bpf_prog_ret0_warn; #endif /* eBPF JITs can rewrite the program in case constant -- cgit v1.2.2 From 4bd95f4b99e921f51783bfddcd9738e9d3eef2b5 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 20 Jan 2018 01:24:36 +0100 Subject: bpf: add upper complexity limit to verifier log Given the limit could potentially get further adjustments in the future, add it to the log so it becomes obvious what the current limit is w/o having to check the source first. This may also be helpful for debugging complexity related issues on kernels that backport from upstream. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5eeb200e82c4..caae4955fbdb 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4810,7 +4810,8 @@ process_bpf_exit: insn_idx++; } - verbose(env, "processed %d insns, stack depth ", insn_processed); + verbose(env, "processed %d insns (limit %d), stack depth ", + insn_processed, BPF_COMPLEXITY_LIMIT_INSNS); for (i = 0; i < env->subprog_cnt + 1; i++) { u32 depth = env->subprog_stack_depth[i]; -- cgit v1.2.2 From 6fd78a1a99c9580da49ee8f951fdce9846256375 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Fri, 19 Jan 2018 13:39:01 +0900 Subject: printk: drop redundant devkmsg_log_str memsets We copy in null terminated strings "on" and "off", no need to zero out devkmsg_log_str in control_devkmsg(). Link: http://lkml.kernel.org/r/20180119043901.1728-1-sergey.senozhatsky@gmail.com Cc: linux-kernel@vger.kernel.org Signed-off-by: Sergey Senozhatsky Reviewed-by: Steven Rostedt (VMware) Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 568729e0dc2c..bf2e6741ec12 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -131,13 +131,10 @@ static int __init control_devkmsg(char *str) /* * Set sysctl string accordingly: */ - if (devkmsg_log == DEVKMSG_LOG_MASK_ON) { - memset(devkmsg_log_str, 0, DEVKMSG_STR_MAX_SIZE); - strncpy(devkmsg_log_str, "on", 2); - } else if (devkmsg_log == DEVKMSG_LOG_MASK_OFF) { - memset(devkmsg_log_str, 0, DEVKMSG_STR_MAX_SIZE); - strncpy(devkmsg_log_str, "off", 3); - } + if (devkmsg_log == DEVKMSG_LOG_MASK_ON) + strcpy(devkmsg_log_str, "on"); + else if (devkmsg_log == DEVKMSG_LOG_MASK_OFF) + strcpy(devkmsg_log_str, "off"); /* else "ratelimit" which is set by default. */ /* -- cgit v1.2.2 From 5f74972ce69fdc6473f74253283408af75a3be15 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 22 Jan 2018 14:58:57 -0600 Subject: signal: Don't use structure initializers for struct siginfo The siginfo structure has all manners of holes with the result that a structure initializer is not guaranteed to initialize all of the bits. As we have to copy the structure to userspace don't even try to use a structure initializer. Instead use clear_siginfo followed by initializing selected fields. This gives a guarantee that uninitialized kernel memory is not copied to userspace. Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 4976f05aa09b..f14492ff976f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3163,8 +3163,9 @@ do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info) static int do_tkill(pid_t tgid, pid_t pid, int sig) { - struct siginfo info = {}; + struct siginfo info; + clear_siginfo(&info); info.si_signo = sig; info.si_errno = 0; info.si_code = SI_TKILL; -- cgit v1.2.2 From 3b10db2b06e2f6191aabb14babe28dcaa657a947 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 18 Aug 2017 19:56:27 -0500 Subject: signal: Replace memset(info,...) with clear_siginfo for clarity The function clear_siginfo is just a nice wrapper around memset so this results in no functional change. This change makes mistakes a little more difficult and it makes it clearer what is going on. Signed-off-by: "Eric W. Biederman" --- kernel/seccomp.c | 2 +- kernel/time/posix-timers.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 5f0dfb2abb8d..3153c9ea51bf 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -515,7 +515,7 @@ void put_seccomp_filter(struct task_struct *tsk) static void seccomp_init_siginfo(siginfo_t *info, int syscall, int reason) { - memset(info, 0, sizeof(*info)); + clear_siginfo(info); info->si_signo = SIGSYS; info->si_code = SYS_SECCOMP; info->si_call_addr = (void __user *)KSTK_EIP(current); diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index ec999f32c840..75043046914e 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -462,7 +462,7 @@ static struct k_itimer * alloc_posix_timer(void) kmem_cache_free(posix_timers_cache, tmr); return NULL; } - memset(&tmr->sigq->info, 0, sizeof(siginfo_t)); + clear_siginfo(&tmr->sigq->info); return tmr; } -- cgit v1.2.2 From f8ec66014ffd95a783b1f9f3b62d7cadb96b78d5 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 18 Jan 2018 14:54:54 -0600 Subject: signal: Add send_sig_fault and force_sig_fault The vast majority of signals sent from architecture specific code are simple faults. Encapsulate this reality with two helper functions so that the nit-picky implementation of preparing a siginfo does not need to be repeated many times on each architecture. As only some architectures support the trapno field, make the trapno arguement only present on those architectures. Similary as ia64 has three fields: imm, flags, and isr that are specific to it. Have those arguments always present on ia64 and no where else. This ensures the architecture specific code always remembers which fields it needs to pass into the siginfo structure. Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index f14492ff976f..15ec7b3cbe69 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1491,6 +1491,53 @@ force_sigsegv(int sig, struct task_struct *p) return 0; } +int force_sig_fault(int sig, int code, void __user *addr + ___ARCH_SI_TRAPNO(int trapno) + ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr) + , struct task_struct *t) +{ + struct siginfo info; + + clear_siginfo(&info); + info.si_signo = sig; + info.si_errno = 0; + info.si_code = code; + info.si_addr = addr; +#ifdef __ARCH_SI_TRAPNO + info.si_trapno = trapno; +#endif +#ifdef __ia64__ + info.si_imm = imm; + info.si_flags = flags; + info.si_isr = isr; +#endif + return force_sig_info(info.si_signo, &info, t); +} + +int send_sig_fault(int sig, int code, void __user *addr + ___ARCH_SI_TRAPNO(int trapno) + ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr) + , struct task_struct *t) +{ + struct siginfo info; + + clear_siginfo(&info); + info.si_signo = sig; + info.si_errno = 0; + info.si_code = code; + info.si_addr = addr; +#ifdef __ARCH_SI_TRAPNO + info.si_trapno = trapno; +#endif +#ifdef __ia64__ + info.si_imm = imm; + info.si_flags = flags; + info.si_isr = isr; +#endif + return send_sig_info(info.si_signo, &info, t); +} + + int kill_pgrp(struct pid *pid, int sig, int priv) { int ret; -- cgit v1.2.2 From 382467358ac9675b1b6814400a9a9e36dc7da14f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 18 Jan 2018 18:54:31 -0600 Subject: signal: Helpers for faults with specialized siginfo layouts The helpers added are: send_sig_mceerr force_sig_mceerr force_sig_bnderr force_sig_pkuerr Filling out siginfo properly can ge tricky. Especially for these specialized cases where the temptation is to share code with other cases which use a different subset of siginfo fields. Unfortunately that code sharing frequently results in bugs with the wrong siginfo fields filled in, and makes it harder to verify that the siginfo structure was properly initialized. Provide these helpers instead that get all of the details right, and guarantee that siginfo is properly initialized. send_sig_mceerr and force_sig_mceer are a little special as two si codes BUS_MCEERR_AO and BUS_MCEER_AR both use the same extended signinfo layout. Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 15ec7b3cbe69..4f6300ef8062 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1537,6 +1537,67 @@ int send_sig_fault(int sig, int code, void __user *addr return send_sig_info(info.si_signo, &info, t); } +#if defined(BUS_MCEERR_AO) && defined(BUS_MCEERR_AR) +int force_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *t) +{ + struct siginfo info; + + WARN_ON((code != BUS_MCEERR_AO) && (code != BUS_MCEERR_AR)); + clear_siginfo(&info); + info.si_signo = SIGBUS; + info.si_errno = 0; + info.si_code = code; + info.si_addr = addr; + info.si_addr_lsb = lsb; + return force_sig_info(info.si_signo, &info, t); +} + +int send_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *t) +{ + struct siginfo info; + + WARN_ON((code != BUS_MCEERR_AO) && (code != BUS_MCEERR_AR)); + clear_siginfo(&info); + info.si_signo = SIGBUS; + info.si_errno = 0; + info.si_code = code; + info.si_addr = addr; + info.si_addr_lsb = lsb; + return send_sig_info(info.si_signo, &info, t); +} +EXPORT_SYMBOL(send_sig_mceerr); +#endif + +#ifdef SEGV_BNDERR +int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper) +{ + struct siginfo info; + + clear_siginfo(&info); + info.si_signo = SIGSEGV; + info.si_errno = 0; + info.si_code = SEGV_BNDERR; + info.si_addr = addr; + info.si_lower = lower; + info.si_upper = upper; + return force_sig_info(info.si_signo, &info, current); +} +#endif + +#ifdef SEGV_PKUERR +int force_sig_pkuerr(void __user *addr, u32 pkey) +{ + struct siginfo info; + + clear_siginfo(&info); + info.si_signo = SIGSEGV; + info.si_errno = 0; + info.si_code = SEGV_PKUERR; + info.si_addr = addr; + info.si_pkey = pkey; + return force_sig_info(info.si_signo, &info, current); +} +#endif int kill_pgrp(struct pid *pid, int sig, int priv) { -- cgit v1.2.2 From f71dd7dc2dc989dc712b246a74d243e4b2c5f8a7 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 22 Jan 2018 14:37:25 -0600 Subject: signal/ptrace: Add force_sig_ptrace_errno_trap and use it where needed There are so many places that build struct siginfo by hand that at least one of them is bound to get it wrong. A handful of cases in the kernel arguably did just that when using the errno field of siginfo to pass no errno values to userspace. The usage is limited to a single si_code so at least does not mess up anything else. Encapsulate this questionable pattern in a helper function so that the userspace ABI is preserved. Update all of the places that use this pattern to use the new helper function. Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 4f6300ef8062..e549174c0831 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1599,6 +1599,21 @@ int force_sig_pkuerr(void __user *addr, u32 pkey) } #endif +/* For the crazy architectures that include trap information in + * the errno field, instead of an actual errno value. + */ +int force_sig_ptrace_errno_trap(int errno, void __user *addr) +{ + struct siginfo info; + + clear_siginfo(&info); + info.si_signo = SIGTRAP; + info.si_errno = errno; + info.si_code = TRAP_HWBKPT; + info.si_addr = addr; + return force_sig_info(info.si_signo, &info, current); +} + int kill_pgrp(struct pid *pid, int sig, int priv) { int ret; -- cgit v1.2.2 From 2310035fa03f651dd5b03f19a26a97512aa8842c Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 22 Jan 2018 22:53:51 -0800 Subject: bpf: fix incorrect kmalloc usage in lpm_trie MAP_GET_NEXT_KEY rcu region In commit b471f2f1de8b ("bpf: implement MAP_GET_NEXT_KEY command for LPM_TRIE map"), the implemented MAP_GET_NEXT_KEY callback function is guarded with rcu read lock. In the function body, "kmalloc(size, GFP_USER | __GFP_NOWARN)" is used which may sleep and violate rcu read lock region requirements. This patch fixed the issue by using GFP_ATOMIC instead to avoid blocking kmalloc. Tested with CONFIG_DEBUG_ATOMIC_SLEEP=y as suggested by Eric Dumazet. Fixes: b471f2f1de8b ("bpf: implement MAP_GET_NEXT_KEY command for LPM_TRIE map") Signed-off-by: Yonghong Song Reported-by: syzbot Reviewed-by: Eric Dumazet Signed-off-by: Daniel Borkmann --- kernel/bpf/lpm_trie.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index d7ea96218516..8f083ea9d4b9 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -624,7 +624,7 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) } node_stack = kmalloc(trie->max_prefixlen * sizeof(struct lpm_trie_node *), - GFP_USER | __GFP_NOWARN); + GFP_ATOMIC | __GFP_NOWARN); if (!node_stack) return -ENOMEM; -- cgit v1.2.2 From 921a7acd85ebbab1b3cd99828e6842fd3e78df24 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Tue, 16 Jan 2018 17:02:28 +0800 Subject: tracing: Detect the string nul character when parsing user input string User space can pass in a C nul character '\0' along with its input. The function trace_get_user() will try to process it as a normal character, and that will fail to parse. open("/sys/kernel/debug/tracing//set_ftrace_pid", O_WRONLY|O_TRUNC) = 3 write(3, " \0", 2) = -1 EINVAL (Invalid argument) while parse can handle spaces, so below works. $ echo "" > set_ftrace_pid $ echo " " > set_ftrace_pid $ echo -n " " > set_ftrace_pid Have the parser stop on '\0' and cease any further parsing. Only process the characters up to the nul '\0' character and do not process it. Link: http://lkml.kernel.org/r/1516093350-12045-2-git-send-email-changbin.du@intel.com Acked-by: Namhyung Kim Signed-off-by: Changbin Du Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8e3f20a18a06..c00a31d18f8a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1237,7 +1237,7 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, } /* only spaces were written */ - if (isspace(ch)) { + if (isspace(ch) || !ch) { *ppos += read; ret = read; goto out; @@ -1247,7 +1247,7 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, } /* read the non-space input */ - while (cnt && !isspace(ch)) { + while (cnt && !isspace(ch) && ch) { if (parser->idx < parser->size - 1) parser->buffer[parser->idx++] = ch; else { @@ -1262,7 +1262,7 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, } /* We either got finished input or we have to wait for another call. */ - if (isspace(ch)) { + if (isspace(ch) || !ch) { parser->buffer[parser->idx] = 0; parser->cont = false; } else if (parser->idx < parser->size - 1) { -- cgit v1.2.2 From 76638d96502744b0d593f2386b75ae5a017c13bb Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Tue, 16 Jan 2018 17:02:29 +0800 Subject: tracing: Clear parser->idx if only spaces are read If only spaces were read while parsing the next string, then parser->idx should be cleared in order to make trace_parser_loaded() return false. Link: http://lkml.kernel.org/r/1516093350-12045-3-git-send-email-changbin.du@intel.com Acked-by: Namhyung Kim Signed-off-by: Changbin Du Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c00a31d18f8a..cb90435e63da 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1236,14 +1236,14 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, cnt--; } + parser->idx = 0; + /* only spaces were written */ if (isspace(ch) || !ch) { *ppos += read; ret = read; goto out; } - - parser->idx = 0; } /* read the non-space input */ -- cgit v1.2.2 From f4d0706cde27f29ff89e6bf94ded4113f8fe6e80 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Tue, 16 Jan 2018 17:02:30 +0800 Subject: tracing: Make sure the parsed string always terminates with '\0' Always mark the parsed string with a terminated nul '\0' character. This removes the need for the users to have to append the '\0' before using the parsed string. Link: http://lkml.kernel.org/r/1516093350-12045-4-git-send-email-changbin.du@intel.com Acked-by: Namhyung Kim Signed-off-by: Changbin Du Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 2 -- kernel/trace/trace.c | 4 ++-- kernel/trace/trace_events.c | 2 -- 3 files changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 554b517c61a0..dabd9d167d42 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5015,7 +5015,6 @@ int ftrace_regex_release(struct inode *inode, struct file *file) parser = &iter->parser; if (trace_parser_loaded(parser)) { - parser->buffer[parser->idx] = 0; ftrace_match_records(iter->hash, parser->buffer, parser->idx); } @@ -5329,7 +5328,6 @@ ftrace_graph_release(struct inode *inode, struct file *file) parser = &fgd->parser; if (trace_parser_loaded((parser))) { - parser->buffer[parser->idx] = 0; ret = ftrace_graph_set_hash(fgd->new_hash, parser->buffer); } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index cb90435e63da..58de825df19c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -530,8 +530,6 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, ubuf += ret; cnt -= ret; - parser.buffer[parser.idx] = 0; - ret = -EINVAL; if (kstrtoul(parser.buffer, 0, &val)) break; @@ -1268,6 +1266,8 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, } else if (parser->idx < parser->size - 1) { parser->cont = true; parser->buffer[parser->idx++] = ch; + /* Make sure the parsed string always terminates with '\0'. */ + parser->buffer[parser->idx] = 0; } else { ret = -EINVAL; goto out; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 1b87157edbff..05c7172c6667 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -885,8 +885,6 @@ ftrace_event_write(struct file *file, const char __user *ubuf, if (*parser.buffer == '!') set = 0; - parser.buffer[parser.idx] = 0; - ret = ftrace_set_clr_event(tr, parser.buffer + !set, set); if (ret) goto out_put; -- cgit v1.2.2 From 0e4d819d0893dc043ea7b7cb6baf4be1e310bd96 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Sat, 6 Jan 2018 11:12:46 +0530 Subject: trace_uprobe: Display correct offset in uprobe_events Recently, how the pointers being printed with %p has been changed by commit ad67b74d2469 ("printk: hash addresses printed with %p"). This is causing a regression while showing offset in the uprobe_events file. Instead of %p, use %px to display offset. Before patch: # perf probe -vv -x /tmp/a.out main Opening /sys/kernel/debug/tracing//uprobe_events write=1 Writing event: p:probe_a/main /tmp/a.out:0x58c # cat /sys/kernel/debug/tracing/uprobe_events p:probe_a/main /tmp/a.out:0x0000000049a0f352 After patch: # cat /sys/kernel/debug/tracing/uprobe_events p:probe_a/main /tmp/a.out:0x000000000000058c Link: http://lkml.kernel.org/r/20180106054246.15375-1-ravi.bangoria@linux.vnet.ibm.com Cc: stable@vger.kernel.org Fixes: ad67b74d2469 ("printk: hash addresses printed with %p") Acked-by: Srikar Dronamraju Signed-off-by: Ravi Bangoria Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_uprobe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 40592e7b3568..268029ae1be6 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -608,7 +608,7 @@ static int probes_seq_show(struct seq_file *m, void *v) /* Don't print "0x (null)" when offset is 0 */ if (tu->offset) { - seq_printf(m, "0x%p", (void *)tu->offset); + seq_printf(m, "0x%px", (void *)tu->offset); } else { switch (sizeof(void *)) { case 4: -- cgit v1.2.2 From dd3dad0d716dbb9fccadcc8709900d9cac6ca252 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 21 Dec 2017 15:37:32 -0800 Subject: ftrace: Mark function tracer test functions noinline/noclone The ftrace function tracer self tests calls some functions to verify the get traced. This relies on them not being inlined. Previously this was ensured by putting them into another file, but with LTO the compiler can inline across files, which makes the tests fail. Mark these functions as noinline and noclone. Link: http://lkml.kernel.org/r/20171221233732.31896-1-andi@firstfloor.org Signed-off-by: Andi Kleen Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_selftest_dynamic.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_selftest_dynamic.c b/kernel/trace/trace_selftest_dynamic.c index 8cda06a10d66..c364cf777e1a 100644 --- a/kernel/trace/trace_selftest_dynamic.c +++ b/kernel/trace/trace_selftest_dynamic.c @@ -1,13 +1,14 @@ // SPDX-License-Identifier: GPL-2.0 +#include #include "trace.h" -int DYN_FTRACE_TEST_NAME(void) +noinline __noclone int DYN_FTRACE_TEST_NAME(void) { /* used to call mcount */ return 0; } -int DYN_FTRACE_TEST_NAME2(void) +noinline __noclone int DYN_FTRACE_TEST_NAME2(void) { /* used to call mcount */ return 0; -- cgit v1.2.2 From 9c147b56fc7165856da9c510463fafc2f0d58d5f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= Date: Fri, 26 Jan 2018 00:54:02 +0100 Subject: bpf: Use the IS_FD_ARRAY() macro in map_update_elem() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make the code more readable. Signed-off-by: Mickaël Salaün Cc: Alexei Starovoitov Cc: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5bdb0cc84ad2..e24aa3241387 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -709,10 +709,7 @@ static int map_update_elem(union bpf_attr *attr) err = bpf_percpu_hash_update(map, key, value, attr->flags); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_update(map, key, value, attr->flags); - } else if (map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || - map->map_type == BPF_MAP_TYPE_PROG_ARRAY || - map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || - map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) { + } else if (IS_FD_ARRAY(map)) { rcu_read_lock(); err = bpf_fd_array_map_update_elem(map, f.file, key, value, attr->flags); -- cgit v1.2.2 From 2a5418a13fcfbb1f13a847eedb9a8e30a9ead765 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 26 Jan 2018 23:33:37 +0100 Subject: bpf: improve dead code sanitizing Given we recently had c131187db2d3 ("bpf: fix branch pruning logic") and 95a762e2c8c9 ("bpf: fix incorrect sign extension in check_alu_op()") in particular where before verifier skipped verification of the wrongly assumed dead branch, we should not just replace the dead code parts with nops (mov r0,r0). If there is a bug such as fixed in 95a762e2c8c9 in future again, where runtime could execute those insns, then one of the potential issues with the current setting would be that given the nops would be at the end of the program, we could execute out of bounds at some point. The best in such case would be to just exit the BPF program altogether and return an exception code. However, given this would require two instructions, and such a dead code gap could just be a single insn long, we would need to place 'r0 = X; ret' snippet at the very end after the user program or at the start before the program (where we'd skip that region on prog entry), and then place unconditional ja's into the dead code gap. While more complex but possible, there's still another block in the road that currently prevents from this, namely BPF to BPF calls. The issue here is that such exception could be returned from a callee, but the caller would not know that it's an exception that needs to be propagated further down. Alternative that has little complexity is to just use a ja-1 code for now which will trap the execution here instead of silently doing bad things if we ever get there due to bugs. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index dfb138b46488..8365259a7c5a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5064,14 +5064,21 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of return new_prog; } -/* The verifier does more data flow analysis than llvm and will not explore - * branches that are dead at run time. Malicious programs can have dead code - * too. Therefore replace all dead at-run-time code with nops. +/* The verifier does more data flow analysis than llvm and will not + * explore branches that are dead at run time. Malicious programs can + * have dead code too. Therefore replace all dead at-run-time code + * with 'ja -1'. + * + * Just nops are not optimal, e.g. if they would sit at the end of the + * program and through another bug we would manage to jump there, then + * we'd execute beyond program memory otherwise. Returning exception + * code also wouldn't work since we can have subprogs where the dead + * code could be located. */ static void sanitize_dead_code(struct bpf_verifier_env *env) { struct bpf_insn_aux_data *aux_data = env->insn_aux_data; - struct bpf_insn nop = BPF_MOV64_REG(BPF_REG_0, BPF_REG_0); + struct bpf_insn trap = BPF_JMP_IMM(BPF_JA, 0, 0, -1); struct bpf_insn *insn = env->prog->insnsi; const int insn_cnt = env->prog->len; int i; @@ -5079,7 +5086,7 @@ static void sanitize_dead_code(struct bpf_verifier_env *env) for (i = 0; i < insn_cnt; i++) { if (aux_data[i].seen) continue; - memcpy(insn + i, &nop, sizeof(nop)); + memcpy(insn + i, &trap, sizeof(trap)); } } -- cgit v1.2.2 From 5e581dad4fec0e6d062740dc35b8dc248b39d224 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 26 Jan 2018 23:33:38 +0100 Subject: bpf: make unknown opcode handling more robust Recent findings by syzcaller fixed in 7891a87efc71 ("bpf: arsh is not supported in 32 bit alu thus reject it") triggered a warning in the interpreter due to unknown opcode not being rejected by the verifier. The 'return 0' for an unknown opcode is really not optimal, since with BPF to BPF calls, this would go untracked by the verifier. Do two things here to improve the situation: i) perform basic insn sanity check early on in the verification phase and reject every non-uapi insn right there. The bpf_opcode_in_insntable() table reuses the same mapping as the jumptable in ___bpf_prog_run() sans the non-public mappings. And ii) in ___bpf_prog_run() we do need to BUG in the case where the verifier would ever create an unknown opcode due to some rewrites. Note that JITs do not have such issues since they would punt to interpreter in these situations. Moreover, the BPF_JIT_ALWAYS_ON would also help to avoid such unknown opcodes in the first place. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 250 +++++++++++++++++++++++++++++--------------------- kernel/bpf/verifier.c | 7 ++ 2 files changed, 152 insertions(+), 105 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 3aa0658add76..8301de2d1f96 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -782,6 +782,137 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) } EXPORT_SYMBOL_GPL(__bpf_call_base); +/* All UAPI available opcodes. */ +#define BPF_INSN_MAP(INSN_2, INSN_3) \ + /* 32 bit ALU operations. */ \ + /* Register based. */ \ + INSN_3(ALU, ADD, X), \ + INSN_3(ALU, SUB, X), \ + INSN_3(ALU, AND, X), \ + INSN_3(ALU, OR, X), \ + INSN_3(ALU, LSH, X), \ + INSN_3(ALU, RSH, X), \ + INSN_3(ALU, XOR, X), \ + INSN_3(ALU, MUL, X), \ + INSN_3(ALU, MOV, X), \ + INSN_3(ALU, DIV, X), \ + INSN_3(ALU, MOD, X), \ + INSN_2(ALU, NEG), \ + INSN_3(ALU, END, TO_BE), \ + INSN_3(ALU, END, TO_LE), \ + /* Immediate based. */ \ + INSN_3(ALU, ADD, K), \ + INSN_3(ALU, SUB, K), \ + INSN_3(ALU, AND, K), \ + INSN_3(ALU, OR, K), \ + INSN_3(ALU, LSH, K), \ + INSN_3(ALU, RSH, K), \ + INSN_3(ALU, XOR, K), \ + INSN_3(ALU, MUL, K), \ + INSN_3(ALU, MOV, K), \ + INSN_3(ALU, DIV, K), \ + INSN_3(ALU, MOD, K), \ + /* 64 bit ALU operations. */ \ + /* Register based. */ \ + INSN_3(ALU64, ADD, X), \ + INSN_3(ALU64, SUB, X), \ + INSN_3(ALU64, AND, X), \ + INSN_3(ALU64, OR, X), \ + INSN_3(ALU64, LSH, X), \ + INSN_3(ALU64, RSH, X), \ + INSN_3(ALU64, XOR, X), \ + INSN_3(ALU64, MUL, X), \ + INSN_3(ALU64, MOV, X), \ + INSN_3(ALU64, ARSH, X), \ + INSN_3(ALU64, DIV, X), \ + INSN_3(ALU64, MOD, X), \ + INSN_2(ALU64, NEG), \ + /* Immediate based. */ \ + INSN_3(ALU64, ADD, K), \ + INSN_3(ALU64, SUB, K), \ + INSN_3(ALU64, AND, K), \ + INSN_3(ALU64, OR, K), \ + INSN_3(ALU64, LSH, K), \ + INSN_3(ALU64, RSH, K), \ + INSN_3(ALU64, XOR, K), \ + INSN_3(ALU64, MUL, K), \ + INSN_3(ALU64, MOV, K), \ + INSN_3(ALU64, ARSH, K), \ + INSN_3(ALU64, DIV, K), \ + INSN_3(ALU64, MOD, K), \ + /* Call instruction. */ \ + INSN_2(JMP, CALL), \ + /* Exit instruction. */ \ + INSN_2(JMP, EXIT), \ + /* Jump instructions. */ \ + /* Register based. */ \ + INSN_3(JMP, JEQ, X), \ + INSN_3(JMP, JNE, X), \ + INSN_3(JMP, JGT, X), \ + INSN_3(JMP, JLT, X), \ + INSN_3(JMP, JGE, X), \ + INSN_3(JMP, JLE, X), \ + INSN_3(JMP, JSGT, X), \ + INSN_3(JMP, JSLT, X), \ + INSN_3(JMP, JSGE, X), \ + INSN_3(JMP, JSLE, X), \ + INSN_3(JMP, JSET, X), \ + /* Immediate based. */ \ + INSN_3(JMP, JEQ, K), \ + INSN_3(JMP, JNE, K), \ + INSN_3(JMP, JGT, K), \ + INSN_3(JMP, JLT, K), \ + INSN_3(JMP, JGE, K), \ + INSN_3(JMP, JLE, K), \ + INSN_3(JMP, JSGT, K), \ + INSN_3(JMP, JSLT, K), \ + INSN_3(JMP, JSGE, K), \ + INSN_3(JMP, JSLE, K), \ + INSN_3(JMP, JSET, K), \ + INSN_2(JMP, JA), \ + /* Store instructions. */ \ + /* Register based. */ \ + INSN_3(STX, MEM, B), \ + INSN_3(STX, MEM, H), \ + INSN_3(STX, MEM, W), \ + INSN_3(STX, MEM, DW), \ + INSN_3(STX, XADD, W), \ + INSN_3(STX, XADD, DW), \ + /* Immediate based. */ \ + INSN_3(ST, MEM, B), \ + INSN_3(ST, MEM, H), \ + INSN_3(ST, MEM, W), \ + INSN_3(ST, MEM, DW), \ + /* Load instructions. */ \ + /* Register based. */ \ + INSN_3(LDX, MEM, B), \ + INSN_3(LDX, MEM, H), \ + INSN_3(LDX, MEM, W), \ + INSN_3(LDX, MEM, DW), \ + /* Immediate based. */ \ + INSN_3(LD, IMM, DW), \ + /* Misc (old cBPF carry-over). */ \ + INSN_3(LD, ABS, B), \ + INSN_3(LD, ABS, H), \ + INSN_3(LD, ABS, W), \ + INSN_3(LD, IND, B), \ + INSN_3(LD, IND, H), \ + INSN_3(LD, IND, W) + +bool bpf_opcode_in_insntable(u8 code) +{ +#define BPF_INSN_2_TBL(x, y) [BPF_##x | BPF_##y] = true +#define BPF_INSN_3_TBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = true + static const bool public_insntable[256] = { + [0 ... 255] = false, + /* Now overwrite non-defaults ... */ + BPF_INSN_MAP(BPF_INSN_2_TBL, BPF_INSN_3_TBL), + }; +#undef BPF_INSN_3_TBL +#undef BPF_INSN_2_TBL + return public_insntable[code]; +} + #ifndef CONFIG_BPF_JIT_ALWAYS_ON /** * __bpf_prog_run - run eBPF program on a given context @@ -793,115 +924,18 @@ EXPORT_SYMBOL_GPL(__bpf_call_base); static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) { u64 tmp; +#define BPF_INSN_2_LBL(x, y) [BPF_##x | BPF_##y] = &&x##_##y +#define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z static const void *jumptable[256] = { [0 ... 255] = &&default_label, /* Now overwrite non-defaults ... */ - /* 32 bit ALU operations */ - [BPF_ALU | BPF_ADD | BPF_X] = &&ALU_ADD_X, - [BPF_ALU | BPF_ADD | BPF_K] = &&ALU_ADD_K, - [BPF_ALU | BPF_SUB | BPF_X] = &&ALU_SUB_X, - [BPF_ALU | BPF_SUB | BPF_K] = &&ALU_SUB_K, - [BPF_ALU | BPF_AND | BPF_X] = &&ALU_AND_X, - [BPF_ALU | BPF_AND | BPF_K] = &&ALU_AND_K, - [BPF_ALU | BPF_OR | BPF_X] = &&ALU_OR_X, - [BPF_ALU | BPF_OR | BPF_K] = &&ALU_OR_K, - [BPF_ALU | BPF_LSH | BPF_X] = &&ALU_LSH_X, - [BPF_ALU | BPF_LSH | BPF_K] = &&ALU_LSH_K, - [BPF_ALU | BPF_RSH | BPF_X] = &&ALU_RSH_X, - [BPF_ALU | BPF_RSH | BPF_K] = &&ALU_RSH_K, - [BPF_ALU | BPF_XOR | BPF_X] = &&ALU_XOR_X, - [BPF_ALU | BPF_XOR | BPF_K] = &&ALU_XOR_K, - [BPF_ALU | BPF_MUL | BPF_X] = &&ALU_MUL_X, - [BPF_ALU | BPF_MUL | BPF_K] = &&ALU_MUL_K, - [BPF_ALU | BPF_MOV | BPF_X] = &&ALU_MOV_X, - [BPF_ALU | BPF_MOV | BPF_K] = &&ALU_MOV_K, - [BPF_ALU | BPF_DIV | BPF_X] = &&ALU_DIV_X, - [BPF_ALU | BPF_DIV | BPF_K] = &&ALU_DIV_K, - [BPF_ALU | BPF_MOD | BPF_X] = &&ALU_MOD_X, - [BPF_ALU | BPF_MOD | BPF_K] = &&ALU_MOD_K, - [BPF_ALU | BPF_NEG] = &&ALU_NEG, - [BPF_ALU | BPF_END | BPF_TO_BE] = &&ALU_END_TO_BE, - [BPF_ALU | BPF_END | BPF_TO_LE] = &&ALU_END_TO_LE, - /* 64 bit ALU operations */ - [BPF_ALU64 | BPF_ADD | BPF_X] = &&ALU64_ADD_X, - [BPF_ALU64 | BPF_ADD | BPF_K] = &&ALU64_ADD_K, - [BPF_ALU64 | BPF_SUB | BPF_X] = &&ALU64_SUB_X, - [BPF_ALU64 | BPF_SUB | BPF_K] = &&ALU64_SUB_K, - [BPF_ALU64 | BPF_AND | BPF_X] = &&ALU64_AND_X, - [BPF_ALU64 | BPF_AND | BPF_K] = &&ALU64_AND_K, - [BPF_ALU64 | BPF_OR | BPF_X] = &&ALU64_OR_X, - [BPF_ALU64 | BPF_OR | BPF_K] = &&ALU64_OR_K, - [BPF_ALU64 | BPF_LSH | BPF_X] = &&ALU64_LSH_X, - [BPF_ALU64 | BPF_LSH | BPF_K] = &&ALU64_LSH_K, - [BPF_ALU64 | BPF_RSH | BPF_X] = &&ALU64_RSH_X, - [BPF_ALU64 | BPF_RSH | BPF_K] = &&ALU64_RSH_K, - [BPF_ALU64 | BPF_XOR | BPF_X] = &&ALU64_XOR_X, - [BPF_ALU64 | BPF_XOR | BPF_K] = &&ALU64_XOR_K, - [BPF_ALU64 | BPF_MUL | BPF_X] = &&ALU64_MUL_X, - [BPF_ALU64 | BPF_MUL | BPF_K] = &&ALU64_MUL_K, - [BPF_ALU64 | BPF_MOV | BPF_X] = &&ALU64_MOV_X, - [BPF_ALU64 | BPF_MOV | BPF_K] = &&ALU64_MOV_K, - [BPF_ALU64 | BPF_ARSH | BPF_X] = &&ALU64_ARSH_X, - [BPF_ALU64 | BPF_ARSH | BPF_K] = &&ALU64_ARSH_K, - [BPF_ALU64 | BPF_DIV | BPF_X] = &&ALU64_DIV_X, - [BPF_ALU64 | BPF_DIV | BPF_K] = &&ALU64_DIV_K, - [BPF_ALU64 | BPF_MOD | BPF_X] = &&ALU64_MOD_X, - [BPF_ALU64 | BPF_MOD | BPF_K] = &&ALU64_MOD_K, - [BPF_ALU64 | BPF_NEG] = &&ALU64_NEG, - /* Call instruction */ - [BPF_JMP | BPF_CALL] = &&JMP_CALL, + BPF_INSN_MAP(BPF_INSN_2_LBL, BPF_INSN_3_LBL), + /* Non-UAPI available opcodes. */ [BPF_JMP | BPF_CALL_ARGS] = &&JMP_CALL_ARGS, [BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL, - /* Jumps */ - [BPF_JMP | BPF_JA] = &&JMP_JA, - [BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X, - [BPF_JMP | BPF_JEQ | BPF_K] = &&JMP_JEQ_K, - [BPF_JMP | BPF_JNE | BPF_X] = &&JMP_JNE_X, - [BPF_JMP | BPF_JNE | BPF_K] = &&JMP_JNE_K, - [BPF_JMP | BPF_JGT | BPF_X] = &&JMP_JGT_X, - [BPF_JMP | BPF_JGT | BPF_K] = &&JMP_JGT_K, - [BPF_JMP | BPF_JLT | BPF_X] = &&JMP_JLT_X, - [BPF_JMP | BPF_JLT | BPF_K] = &&JMP_JLT_K, - [BPF_JMP | BPF_JGE | BPF_X] = &&JMP_JGE_X, - [BPF_JMP | BPF_JGE | BPF_K] = &&JMP_JGE_K, - [BPF_JMP | BPF_JLE | BPF_X] = &&JMP_JLE_X, - [BPF_JMP | BPF_JLE | BPF_K] = &&JMP_JLE_K, - [BPF_JMP | BPF_JSGT | BPF_X] = &&JMP_JSGT_X, - [BPF_JMP | BPF_JSGT | BPF_K] = &&JMP_JSGT_K, - [BPF_JMP | BPF_JSLT | BPF_X] = &&JMP_JSLT_X, - [BPF_JMP | BPF_JSLT | BPF_K] = &&JMP_JSLT_K, - [BPF_JMP | BPF_JSGE | BPF_X] = &&JMP_JSGE_X, - [BPF_JMP | BPF_JSGE | BPF_K] = &&JMP_JSGE_K, - [BPF_JMP | BPF_JSLE | BPF_X] = &&JMP_JSLE_X, - [BPF_JMP | BPF_JSLE | BPF_K] = &&JMP_JSLE_K, - [BPF_JMP | BPF_JSET | BPF_X] = &&JMP_JSET_X, - [BPF_JMP | BPF_JSET | BPF_K] = &&JMP_JSET_K, - /* Program return */ - [BPF_JMP | BPF_EXIT] = &&JMP_EXIT, - /* Store instructions */ - [BPF_STX | BPF_MEM | BPF_B] = &&STX_MEM_B, - [BPF_STX | BPF_MEM | BPF_H] = &&STX_MEM_H, - [BPF_STX | BPF_MEM | BPF_W] = &&STX_MEM_W, - [BPF_STX | BPF_MEM | BPF_DW] = &&STX_MEM_DW, - [BPF_STX | BPF_XADD | BPF_W] = &&STX_XADD_W, - [BPF_STX | BPF_XADD | BPF_DW] = &&STX_XADD_DW, - [BPF_ST | BPF_MEM | BPF_B] = &&ST_MEM_B, - [BPF_ST | BPF_MEM | BPF_H] = &&ST_MEM_H, - [BPF_ST | BPF_MEM | BPF_W] = &&ST_MEM_W, - [BPF_ST | BPF_MEM | BPF_DW] = &&ST_MEM_DW, - /* Load instructions */ - [BPF_LDX | BPF_MEM | BPF_B] = &&LDX_MEM_B, - [BPF_LDX | BPF_MEM | BPF_H] = &&LDX_MEM_H, - [BPF_LDX | BPF_MEM | BPF_W] = &&LDX_MEM_W, - [BPF_LDX | BPF_MEM | BPF_DW] = &&LDX_MEM_DW, - [BPF_LD | BPF_ABS | BPF_W] = &&LD_ABS_W, - [BPF_LD | BPF_ABS | BPF_H] = &&LD_ABS_H, - [BPF_LD | BPF_ABS | BPF_B] = &&LD_ABS_B, - [BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W, - [BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H, - [BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B, - [BPF_LD | BPF_IMM | BPF_DW] = &&LD_IMM_DW, }; +#undef BPF_INSN_3_LBL +#undef BPF_INSN_2_LBL u32 tail_call_cnt = 0; void *ptr; int off; @@ -1302,8 +1336,14 @@ load_byte: goto load_byte; default_label: - /* If we ever reach this, we have a bug somewhere. */ - WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code); + /* If we ever reach this, we have a bug somewhere. Die hard here + * instead of just returning 0; we could be somewhere in a subprog, + * so execution could continue otherwise which we do /not/ want. + * + * Note, verifier whitelists all opcodes in bpf_opcode_in_insntable(). + */ + pr_warn("BPF interpreter: unknown opcode %02x\n", insn->code); + BUG_ON(1); return 0; } STACK_FRAME_NON_STANDARD(___bpf_prog_run); /* jump table */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8365259a7c5a..0c5269415090 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4981,6 +4981,13 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) next_insn: insn++; i++; + continue; + } + + /* Basic sanity check before we invest more work here. */ + if (!bpf_opcode_in_insntable(insn->code)) { + verbose(env, "unknown opcode %02x\n", insn->code); + return -EINVAL; } } -- cgit v1.2.2 From f6b1b3bf0d5f681631a293cfe1ca934b81716f1e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 26 Jan 2018 23:33:39 +0100 Subject: bpf: fix subprog verifier bypass by div/mod by 0 exception One of the ugly leftovers from the early eBPF days is that div/mod operations based on registers have a hard-coded src_reg == 0 test in the interpreter as well as in JIT code generators that would return from the BPF program with exit code 0. This was basically adopted from cBPF interpreter for historical reasons. There are multiple reasons why this is very suboptimal and prone to bugs. To name one: the return code mapping for such abnormal program exit of 0 does not always match with a suitable program type's exit code mapping. For example, '0' in tc means action 'ok' where the packet gets passed further up the stack, which is just undesirable for such cases (e.g. when implementing policy) and also does not match with other program types. While trying to work out an exception handling scheme, I also noticed that programs crafted like the following will currently pass the verifier: 0: (bf) r6 = r1 1: (85) call pc+8 caller: R6=ctx(id=0,off=0,imm=0) R10=fp0,call_-1 callee: frame1: R1=ctx(id=0,off=0,imm=0) R10=fp0,call_1 10: (b4) (u32) r2 = (u32) 0 11: (b4) (u32) r3 = (u32) 1 12: (3c) (u32) r3 /= (u32) r2 13: (61) r0 = *(u32 *)(r1 +76) 14: (95) exit returning from callee: frame1: R0_w=pkt(id=0,off=0,r=0,imm=0) R1=ctx(id=0,off=0,imm=0) R2_w=inv0 R3_w=inv(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R10=fp0,call_1 to caller at 2: R0_w=pkt(id=0,off=0,r=0,imm=0) R6=ctx(id=0,off=0,imm=0) R10=fp0,call_-1 from 14 to 2: R0=pkt(id=0,off=0,r=0,imm=0) R6=ctx(id=0,off=0,imm=0) R10=fp0,call_-1 2: (bf) r1 = r6 3: (61) r1 = *(u32 *)(r1 +80) 4: (bf) r2 = r0 5: (07) r2 += 8 6: (2d) if r2 > r1 goto pc+1 R0=pkt(id=0,off=0,r=8,imm=0) R1=pkt_end(id=0,off=0,imm=0) R2=pkt(id=0,off=8,r=8,imm=0) R6=ctx(id=0,off=0,imm=0) R10=fp0,call_-1 7: (71) r0 = *(u8 *)(r0 +0) 8: (b7) r0 = 1 9: (95) exit from 6 to 8: safe processed 16 insns (limit 131072), stack depth 0+0 Basically what happens is that in the subprog we make use of a div/mod by 0 exception and in the 'normal' subprog's exit path we just return skb->data back to the main prog. This has the implication that the verifier thinks we always get a pkt pointer in R0 while we still have the implicit 'return 0' from the div as an alternative unconditional return path earlier. Thus, R0 then contains 0, meaning back in the parent prog we get the address range of [0x0, skb->data_end] as read and writeable. Similar can be crafted with other pointer register types. Since i) BPF_ABS/IND is not allowed in programs that contain BPF to BPF calls (and generally it's also disadvised to use in native eBPF context), ii) unknown opcodes don't return zero anymore, iii) we don't return an exception code in dead branches, the only last missing case affected and to fix is the div/mod handling. What we would really need is some infrastructure to propagate exceptions all the way to the original prog unwinding the current stack and returning that code to the caller of the BPF program. In user space such exception handling for similar runtimes is typically implemented with setjmp(3) and longjmp(3) as one possibility which is not available in the kernel, though (kgdb used to implement it in kernel long time ago). I implemented a PoC exception handling mechanism into the BPF interpreter with porting setjmp()/longjmp() into x86_64 and adding a new internal BPF_ABRT opcode that can use a program specific exception code for all exception cases we have (e.g. div/mod by 0, unknown opcodes, etc). While this seems to work in the constrained BPF environment (meaning, here, we don't need to deal with state e.g. from memory allocations that we would need to undo before going into exception state), it still has various drawbacks: i) we would need to implement the setjmp()/longjmp() for every arch supported in the kernel and for x86_64, arm64, sparc64 JITs currently supporting calls, ii) it has unconditional additional cost on main program entry to store CPU register state in initial setjmp() call, and we would need some way to pass the jmp_buf down into ___bpf_prog_run() for main prog and all subprogs, but also storing on stack is not really nice (other option would be per-cpu storage for this, but it also has the drawback that we need to disable preemption for every BPF program types). All in all this approach would add a lot of complexity. Another poor-man's solution would be to have some sort of additional shared register or scratch buffer to hold state for exceptions, and test that after every call return to chain returns and pass R0 all the way down to BPF prog caller. This is also problematic in various ways: i) an additional register doesn't map well into JITs, and some other scratch space could only be on per-cpu storage, which, again has the side-effect that this only works when we disable preemption, or somewhere in the input context which is not available everywhere either, and ii) this adds significant runtime overhead by putting conditionals after each and every call, as well as implementation complexity. Yet another option is to teach verifier that div/mod can return an integer, which however is also complex to implement as verifier would need to walk such fake 'mov r0,; exit;' sequeuence and there would still be no guarantee for having propagation of this further down to the BPF caller as proper exception code. For parent prog, it is also is not distinguishable from a normal return of a constant scalar value. The approach taken here is a completely different one with little complexity and no additional overhead involved in that we make use of the fact that a div/mod by 0 is undefined behavior. Instead of bailing out, we adapt the same behavior as on some major archs like ARMv8 [0] into eBPF as well: X div 0 results in 0, and X mod 0 results in X. aarch64 and aarch32 ISA do not generate any traps or otherwise aborts of program execution for unsigned divides. I verified this also with a test program compiled by gcc and clang, and the behavior matches with the spec. Going forward we adapt the eBPF verifier to emit such rewrites once div/mod by register was seen. cBPF is not touched and will keep existing 'return 0' semantics. Given the options, it seems the most suitable from all of them, also since major archs have similar schemes in place. Given this is all in the realm of undefined behavior, we still have the option to adapt if deemed necessary and this way we would also have the option of more flexibility from LLVM code generation side (which is then fully visible to verifier). Thus, this patch i) fixes the panic seen in above program and ii) doesn't bypass the verifier observations. [0] ARM Architecture Reference Manual, ARMv8 [ARM DDI 0487B.b] http://infocenter.arm.com/help/topic/com.arm.doc.ddi0487b.b/DDI0487B_b_armv8_arm.pdf 1) aarch64 instruction set: section C3.4.7 and C6.2.279 (UDIV) "A division by zero results in a zero being written to the destination register, without any indication that the division by zero occurred." 2) aarch32 instruction set: section F1.4.8 and F5.1.263 (UDIV) "For the SDIV and UDIV instructions, division by zero always returns a zero result." Fixes: f4d7e40a5b71 ("bpf: introduce function calls (verification)") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 8 -------- kernel/bpf/verifier.c | 38 ++++++++++++++++++++++++++++++-------- 2 files changed, 30 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 8301de2d1f96..5f35f93dcab2 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -999,14 +999,10 @@ select_insn: (*(s64 *) &DST) >>= IMM; CONT; ALU64_MOD_X: - if (unlikely(SRC == 0)) - return 0; div64_u64_rem(DST, SRC, &tmp); DST = tmp; CONT; ALU_MOD_X: - if (unlikely((u32)SRC == 0)) - return 0; tmp = (u32) DST; DST = do_div(tmp, (u32) SRC); CONT; @@ -1019,13 +1015,9 @@ select_insn: DST = do_div(tmp, (u32) IMM); CONT; ALU64_DIV_X: - if (unlikely(SRC == 0)) - return 0; DST = div64_u64(DST, SRC); CONT; ALU_DIV_X: - if (unlikely((u32)SRC == 0)) - return 0; tmp = (u32) DST; do_div(tmp, (u32) SRC); DST = (u32) tmp; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0c5269415090..5fb69a85d967 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5400,15 +5400,37 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) int i, cnt, delta = 0; for (i = 0; i < insn_cnt; i++, insn++) { - if (insn->code == (BPF_ALU | BPF_MOD | BPF_X) || + if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) || + insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) || + insn->code == (BPF_ALU | BPF_MOD | BPF_X) || insn->code == (BPF_ALU | BPF_DIV | BPF_X)) { - /* due to JIT bugs clear upper 32-bits of src register - * before div/mod operation - */ - insn_buf[0] = BPF_MOV32_REG(insn->src_reg, insn->src_reg); - insn_buf[1] = *insn; - cnt = 2; - new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + bool is64 = BPF_CLASS(insn->code) == BPF_ALU64; + struct bpf_insn mask_and_div[] = { + BPF_MOV32_REG(insn->src_reg, insn->src_reg), + /* Rx div 0 -> 0 */ + BPF_JMP_IMM(BPF_JNE, insn->src_reg, 0, 2), + BPF_ALU32_REG(BPF_XOR, insn->dst_reg, insn->dst_reg), + BPF_JMP_IMM(BPF_JA, 0, 0, 1), + *insn, + }; + struct bpf_insn mask_and_mod[] = { + BPF_MOV32_REG(insn->src_reg, insn->src_reg), + /* Rx mod 0 -> Rx */ + BPF_JMP_IMM(BPF_JEQ, insn->src_reg, 0, 1), + *insn, + }; + struct bpf_insn *patchlet; + + if (insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) || + insn->code == (BPF_ALU | BPF_DIV | BPF_X)) { + patchlet = mask_and_div + (is64 ? 1 : 0); + cnt = ARRAY_SIZE(mask_and_div) - (is64 ? 1 : 0); + } else { + patchlet = mask_and_mod + (is64 ? 1 : 0); + cnt = ARRAY_SIZE(mask_and_mod) - (is64 ? 1 : 0); + } + + new_prog = bpf_patch_insn_data(env, i + delta, patchlet, cnt); if (!new_prog) return -ENOMEM; -- cgit v1.2.2 From 6dd1ec6c7a2c304e9e2e2edd9e7ccb8e8791d36a Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 26 Jan 2018 15:06:07 -0800 Subject: bpf: fix kernel page fault in lpm map trie_get_next_key Commit b471f2f1de8b ("bpf: implement MAP_GET_NEXT_KEY command for LPM_TRIE map") introduces a bug likes below: if (!rcu_dereference(trie->root)) return -ENOENT; if (!key || key->prefixlen > trie->max_prefixlen) { root = &trie->root; goto find_leftmost; } ...... find_leftmost: for (node = rcu_dereference(*root); node;) { In the code after label find_leftmost, it is assumed that *root should not be NULL, but it is not true as it is possbile trie->root is changed to NULL by an asynchronous delete operation. The issue is reported by syzbot and Eric Dumazet with the below error log: ...... kasan: CONFIG_KASAN_INLINE enabled kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] SMP KASAN Dumping ftrace buffer: (ftrace buffer empty) Modules linked in: CPU: 1 PID: 8033 Comm: syz-executor3 Not tainted 4.15.0-rc8+ #4 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:trie_get_next_key+0x3c2/0xf10 kernel/bpf/lpm_trie.c:682 ...... This patch fixed the issue by use local rcu_dereferenced pointer instead of *(&trie->root) later on. Fixes: b471f2f1de8b ("bpf: implement MAP_GET_NEXT_KEY command or LPM_TRIE map") Reported-by: syzbot Reported-by: Eric Dumazet Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- kernel/bpf/lpm_trie.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 8f083ea9d4b9..7b469d10d0e9 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -593,11 +593,10 @@ unlock: static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) { + struct lpm_trie_node *node, *next_node = NULL, *parent, *search_root; struct lpm_trie *trie = container_of(map, struct lpm_trie, map); struct bpf_lpm_trie_key *key = _key, *next_key = _next_key; - struct lpm_trie_node *node, *next_node = NULL, *parent; struct lpm_trie_node **node_stack = NULL; - struct lpm_trie_node __rcu **root; int err = 0, stack_ptr = -1; unsigned int next_bit; size_t matchlen; @@ -614,14 +613,13 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) */ /* Empty trie */ - if (!rcu_dereference(trie->root)) + search_root = rcu_dereference(trie->root); + if (!search_root) return -ENOENT; /* For invalid key, find the leftmost node in the trie */ - if (!key || key->prefixlen > trie->max_prefixlen) { - root = &trie->root; + if (!key || key->prefixlen > trie->max_prefixlen) goto find_leftmost; - } node_stack = kmalloc(trie->max_prefixlen * sizeof(struct lpm_trie_node *), GFP_ATOMIC | __GFP_NOWARN); @@ -629,7 +627,7 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) return -ENOMEM; /* Try to find the exact node for the given key */ - for (node = rcu_dereference(trie->root); node;) { + for (node = search_root; node;) { node_stack[++stack_ptr] = node; matchlen = longest_prefix_match(trie, node, key); if (node->prefixlen != matchlen || @@ -640,10 +638,8 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) node = rcu_dereference(node->child[next_bit]); } if (!node || node->prefixlen != key->prefixlen || - (node->flags & LPM_TREE_NODE_FLAG_IM)) { - root = &trie->root; + (node->flags & LPM_TREE_NODE_FLAG_IM)) goto find_leftmost; - } /* The node with the exactly-matching key has been found, * find the first node in postorder after the matched node. @@ -651,10 +647,10 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) node = node_stack[stack_ptr]; while (stack_ptr > 0) { parent = node_stack[stack_ptr - 1]; - if (rcu_dereference(parent->child[0]) == node && - rcu_dereference(parent->child[1])) { - root = &parent->child[1]; - goto find_leftmost; + if (rcu_dereference(parent->child[0]) == node) { + search_root = rcu_dereference(parent->child[1]); + if (search_root) + goto find_leftmost; } if (!(parent->flags & LPM_TREE_NODE_FLAG_IM)) { next_node = parent; @@ -673,7 +669,7 @@ find_leftmost: /* Find the leftmost non-intermediate node, all intermediate nodes * have exact two children, so this function will never return NULL. */ - for (node = rcu_dereference(*root); node;) { + for (node = search_root; node;) { if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) next_node = node; node = rcu_dereference(node->child[0]); -- cgit v1.2.2 From d70f2a14b72a4bc094cf3a92e4794644a7adc590 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 31 Jan 2018 16:15:51 -0800 Subject: include/linux/sched/mm.h: uninline mmdrop_async(), etc mmdrop_async() is only used in fork.c. Move that and its support functions into fork.c, uninline it all. Quite a lot of code gets moved around to avoid forward declarations. Cc: Ingo Molnar Cc: Michal Hocko Cc: Peter Zijlstra Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 448 +++++++++++++++++++++++++++++++--------------------------- 1 file changed, 236 insertions(+), 212 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 2295fc69717f..5e6cf0dd031c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -77,6 +77,7 @@ #include #include #include +#include #include #include #include @@ -390,6 +391,241 @@ void free_task(struct task_struct *tsk) } EXPORT_SYMBOL(free_task); +#ifdef CONFIG_MMU +static __latent_entropy int dup_mmap(struct mm_struct *mm, + struct mm_struct *oldmm) +{ + struct vm_area_struct *mpnt, *tmp, *prev, **pprev; + struct rb_node **rb_link, *rb_parent; + int retval; + unsigned long charge; + LIST_HEAD(uf); + + uprobe_start_dup_mmap(); + if (down_write_killable(&oldmm->mmap_sem)) { + retval = -EINTR; + goto fail_uprobe_end; + } + flush_cache_dup_mm(oldmm); + uprobe_dup_mmap(oldmm, mm); + /* + * Not linked in yet - no deadlock potential: + */ + down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); + + /* No ordering required: file already has been exposed. */ + RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); + + mm->total_vm = oldmm->total_vm; + mm->data_vm = oldmm->data_vm; + mm->exec_vm = oldmm->exec_vm; + mm->stack_vm = oldmm->stack_vm; + + rb_link = &mm->mm_rb.rb_node; + rb_parent = NULL; + pprev = &mm->mmap; + retval = ksm_fork(mm, oldmm); + if (retval) + goto out; + retval = khugepaged_fork(mm, oldmm); + if (retval) + goto out; + + prev = NULL; + for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { + struct file *file; + + if (mpnt->vm_flags & VM_DONTCOPY) { + vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt)); + continue; + } + charge = 0; + if (mpnt->vm_flags & VM_ACCOUNT) { + unsigned long len = vma_pages(mpnt); + + if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ + goto fail_nomem; + charge = len; + } + tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + if (!tmp) + goto fail_nomem; + *tmp = *mpnt; + INIT_LIST_HEAD(&tmp->anon_vma_chain); + retval = vma_dup_policy(mpnt, tmp); + if (retval) + goto fail_nomem_policy; + tmp->vm_mm = mm; + retval = dup_userfaultfd(tmp, &uf); + if (retval) + goto fail_nomem_anon_vma_fork; + if (tmp->vm_flags & VM_WIPEONFORK) { + /* VM_WIPEONFORK gets a clean slate in the child. */ + tmp->anon_vma = NULL; + if (anon_vma_prepare(tmp)) + goto fail_nomem_anon_vma_fork; + } else if (anon_vma_fork(tmp, mpnt)) + goto fail_nomem_anon_vma_fork; + tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT); + tmp->vm_next = tmp->vm_prev = NULL; + file = tmp->vm_file; + if (file) { + struct inode *inode = file_inode(file); + struct address_space *mapping = file->f_mapping; + + get_file(file); + if (tmp->vm_flags & VM_DENYWRITE) + atomic_dec(&inode->i_writecount); + i_mmap_lock_write(mapping); + if (tmp->vm_flags & VM_SHARED) + atomic_inc(&mapping->i_mmap_writable); + flush_dcache_mmap_lock(mapping); + /* insert tmp into the share list, just after mpnt */ + vma_interval_tree_insert_after(tmp, mpnt, + &mapping->i_mmap); + flush_dcache_mmap_unlock(mapping); + i_mmap_unlock_write(mapping); + } + + /* + * Clear hugetlb-related page reserves for children. This only + * affects MAP_PRIVATE mappings. Faults generated by the child + * are not guaranteed to succeed, even if read-only + */ + if (is_vm_hugetlb_page(tmp)) + reset_vma_resv_huge_pages(tmp); + + /* + * Link in the new vma and copy the page table entries. + */ + *pprev = tmp; + pprev = &tmp->vm_next; + tmp->vm_prev = prev; + prev = tmp; + + __vma_link_rb(mm, tmp, rb_link, rb_parent); + rb_link = &tmp->vm_rb.rb_right; + rb_parent = &tmp->vm_rb; + + mm->map_count++; + if (!(tmp->vm_flags & VM_WIPEONFORK)) + retval = copy_page_range(mm, oldmm, mpnt); + + if (tmp->vm_ops && tmp->vm_ops->open) + tmp->vm_ops->open(tmp); + + if (retval) + goto out; + } + /* a new mm has just been created */ + arch_dup_mmap(oldmm, mm); + retval = 0; +out: + up_write(&mm->mmap_sem); + flush_tlb_mm(oldmm); + up_write(&oldmm->mmap_sem); + dup_userfaultfd_complete(&uf); +fail_uprobe_end: + uprobe_end_dup_mmap(); + return retval; +fail_nomem_anon_vma_fork: + mpol_put(vma_policy(tmp)); +fail_nomem_policy: + kmem_cache_free(vm_area_cachep, tmp); +fail_nomem: + retval = -ENOMEM; + vm_unacct_memory(charge); + goto out; +} + +static inline int mm_alloc_pgd(struct mm_struct *mm) +{ + mm->pgd = pgd_alloc(mm); + if (unlikely(!mm->pgd)) + return -ENOMEM; + return 0; +} + +static inline void mm_free_pgd(struct mm_struct *mm) +{ + pgd_free(mm, mm->pgd); +} +#else +static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) +{ + down_write(&oldmm->mmap_sem); + RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); + up_write(&oldmm->mmap_sem); + return 0; +} +#define mm_alloc_pgd(mm) (0) +#define mm_free_pgd(mm) +#endif /* CONFIG_MMU */ + +static void check_mm(struct mm_struct *mm) +{ + int i; + + for (i = 0; i < NR_MM_COUNTERS; i++) { + long x = atomic_long_read(&mm->rss_stat.count[i]); + + if (unlikely(x)) + printk(KERN_ALERT "BUG: Bad rss-counter state " + "mm:%p idx:%d val:%ld\n", mm, i, x); + } + + if (mm_pgtables_bytes(mm)) + pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n", + mm_pgtables_bytes(mm)); + +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS + VM_BUG_ON_MM(mm->pmd_huge_pte, mm); +#endif +} + +#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL)) +#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) + +/* + * Called when the last reference to the mm + * is dropped: either by a lazy thread or by + * mmput. Free the page directory and the mm. + */ +static void __mmdrop(struct mm_struct *mm) +{ + BUG_ON(mm == &init_mm); + mm_free_pgd(mm); + destroy_context(mm); + hmm_mm_destroy(mm); + mmu_notifier_mm_destroy(mm); + check_mm(mm); + put_user_ns(mm->user_ns); + free_mm(mm); +} + +void mmdrop(struct mm_struct *mm) +{ + if (unlikely(atomic_dec_and_test(&mm->mm_count))) + __mmdrop(mm); +} +EXPORT_SYMBOL_GPL(mmdrop); + +static void mmdrop_async_fn(struct work_struct *work) +{ + struct mm_struct *mm; + + mm = container_of(work, struct mm_struct, async_put_work); + __mmdrop(mm); +} + +static void mmdrop_async(struct mm_struct *mm) +{ + if (unlikely(atomic_dec_and_test(&mm->mm_count))) { + INIT_WORK(&mm->async_put_work, mmdrop_async_fn); + schedule_work(&mm->async_put_work); + } +} + static inline void free_signal_struct(struct signal_struct *sig) { taskstats_tgid_free(sig); @@ -594,181 +830,8 @@ free_tsk: return NULL; } -#ifdef CONFIG_MMU -static __latent_entropy int dup_mmap(struct mm_struct *mm, - struct mm_struct *oldmm) -{ - struct vm_area_struct *mpnt, *tmp, *prev, **pprev; - struct rb_node **rb_link, *rb_parent; - int retval; - unsigned long charge; - LIST_HEAD(uf); - - uprobe_start_dup_mmap(); - if (down_write_killable(&oldmm->mmap_sem)) { - retval = -EINTR; - goto fail_uprobe_end; - } - flush_cache_dup_mm(oldmm); - uprobe_dup_mmap(oldmm, mm); - /* - * Not linked in yet - no deadlock potential: - */ - down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); - - /* No ordering required: file already has been exposed. */ - RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); - - mm->total_vm = oldmm->total_vm; - mm->data_vm = oldmm->data_vm; - mm->exec_vm = oldmm->exec_vm; - mm->stack_vm = oldmm->stack_vm; - - rb_link = &mm->mm_rb.rb_node; - rb_parent = NULL; - pprev = &mm->mmap; - retval = ksm_fork(mm, oldmm); - if (retval) - goto out; - retval = khugepaged_fork(mm, oldmm); - if (retval) - goto out; - - prev = NULL; - for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { - struct file *file; - - if (mpnt->vm_flags & VM_DONTCOPY) { - vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt)); - continue; - } - charge = 0; - if (mpnt->vm_flags & VM_ACCOUNT) { - unsigned long len = vma_pages(mpnt); - - if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ - goto fail_nomem; - charge = len; - } - tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); - if (!tmp) - goto fail_nomem; - *tmp = *mpnt; - INIT_LIST_HEAD(&tmp->anon_vma_chain); - retval = vma_dup_policy(mpnt, tmp); - if (retval) - goto fail_nomem_policy; - tmp->vm_mm = mm; - retval = dup_userfaultfd(tmp, &uf); - if (retval) - goto fail_nomem_anon_vma_fork; - if (tmp->vm_flags & VM_WIPEONFORK) { - /* VM_WIPEONFORK gets a clean slate in the child. */ - tmp->anon_vma = NULL; - if (anon_vma_prepare(tmp)) - goto fail_nomem_anon_vma_fork; - } else if (anon_vma_fork(tmp, mpnt)) - goto fail_nomem_anon_vma_fork; - tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT); - tmp->vm_next = tmp->vm_prev = NULL; - file = tmp->vm_file; - if (file) { - struct inode *inode = file_inode(file); - struct address_space *mapping = file->f_mapping; - - get_file(file); - if (tmp->vm_flags & VM_DENYWRITE) - atomic_dec(&inode->i_writecount); - i_mmap_lock_write(mapping); - if (tmp->vm_flags & VM_SHARED) - atomic_inc(&mapping->i_mmap_writable); - flush_dcache_mmap_lock(mapping); - /* insert tmp into the share list, just after mpnt */ - vma_interval_tree_insert_after(tmp, mpnt, - &mapping->i_mmap); - flush_dcache_mmap_unlock(mapping); - i_mmap_unlock_write(mapping); - } - - /* - * Clear hugetlb-related page reserves for children. This only - * affects MAP_PRIVATE mappings. Faults generated by the child - * are not guaranteed to succeed, even if read-only - */ - if (is_vm_hugetlb_page(tmp)) - reset_vma_resv_huge_pages(tmp); - - /* - * Link in the new vma and copy the page table entries. - */ - *pprev = tmp; - pprev = &tmp->vm_next; - tmp->vm_prev = prev; - prev = tmp; - - __vma_link_rb(mm, tmp, rb_link, rb_parent); - rb_link = &tmp->vm_rb.rb_right; - rb_parent = &tmp->vm_rb; - - mm->map_count++; - if (!(tmp->vm_flags & VM_WIPEONFORK)) - retval = copy_page_range(mm, oldmm, mpnt); - - if (tmp->vm_ops && tmp->vm_ops->open) - tmp->vm_ops->open(tmp); - - if (retval) - goto out; - } - /* a new mm has just been created */ - retval = arch_dup_mmap(oldmm, mm); -out: - up_write(&mm->mmap_sem); - flush_tlb_mm(oldmm); - up_write(&oldmm->mmap_sem); - dup_userfaultfd_complete(&uf); -fail_uprobe_end: - uprobe_end_dup_mmap(); - return retval; -fail_nomem_anon_vma_fork: - mpol_put(vma_policy(tmp)); -fail_nomem_policy: - kmem_cache_free(vm_area_cachep, tmp); -fail_nomem: - retval = -ENOMEM; - vm_unacct_memory(charge); - goto out; -} - -static inline int mm_alloc_pgd(struct mm_struct *mm) -{ - mm->pgd = pgd_alloc(mm); - if (unlikely(!mm->pgd)) - return -ENOMEM; - return 0; -} - -static inline void mm_free_pgd(struct mm_struct *mm) -{ - pgd_free(mm, mm->pgd); -} -#else -static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) -{ - down_write(&oldmm->mmap_sem); - RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); - up_write(&oldmm->mmap_sem); - return 0; -} -#define mm_alloc_pgd(mm) (0) -#define mm_free_pgd(mm) -#endif /* CONFIG_MMU */ - __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock); -#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL)) -#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) - static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT; static int __init coredump_filter_setup(char *s) @@ -858,27 +921,6 @@ fail_nopgd: return NULL; } -static void check_mm(struct mm_struct *mm) -{ - int i; - - for (i = 0; i < NR_MM_COUNTERS; i++) { - long x = atomic_long_read(&mm->rss_stat.count[i]); - - if (unlikely(x)) - printk(KERN_ALERT "BUG: Bad rss-counter state " - "mm:%p idx:%d val:%ld\n", mm, i, x); - } - - if (mm_pgtables_bytes(mm)) - pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n", - mm_pgtables_bytes(mm)); - -#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS - VM_BUG_ON_MM(mm->pmd_huge_pte, mm); -#endif -} - /* * Allocate and initialize an mm_struct. */ @@ -894,24 +936,6 @@ struct mm_struct *mm_alloc(void) return mm_init(mm, current, current_user_ns()); } -/* - * Called when the last reference to the mm - * is dropped: either by a lazy thread or by - * mmput. Free the page directory and the mm. - */ -void __mmdrop(struct mm_struct *mm) -{ - BUG_ON(mm == &init_mm); - mm_free_pgd(mm); - destroy_context(mm); - hmm_mm_destroy(mm); - mmu_notifier_mm_destroy(mm); - check_mm(mm); - put_user_ns(mm->user_ns); - free_mm(mm); -} -EXPORT_SYMBOL_GPL(__mmdrop); - static inline void __mmput(struct mm_struct *mm) { VM_BUG_ON(atomic_read(&mm->mm_users)); -- cgit v1.2.2 From d6cb41cc44c63492702281b1d329955ca767d399 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 31 Jan 2018 16:17:10 -0800 Subject: mm, hugetlb: remove hugepages_treat_as_movable sysctl hugepages_treat_as_movable has been introduced by 396faf0303d2 ("Allow huge page allocations to use GFP_HIGH_MOVABLE") to allow hugetlb allocations from ZONE_MOVABLE even when hugetlb pages were not migrateable. The purpose of the movable zone was different at the time. It aimed at reducing memory fragmentation and hugetlb pages being long lived and large werre not contributing to the fragmentation so it was acceptable to use the zone back then. Things have changed though and the primary purpose of the zone became migratability guarantee. If we allow non migrateable hugetlb pages to be in ZONE_MOVABLE memory hotplug might fail to offline the memory. Remove the knob and only rely on hugepage_migration_supported to allow movable zones. Mel said: : Primarily it was aimed at allowing the hugetlb pool to safely shrink with : the ability to grow it again. The use case was for batched jobs, some of : which needed huge pages and others that did not but didn't want the memory : useless pinned in the huge pages pool. : : I suspect that more users rely on THP than hugetlbfs for flexible use of : huge pages with fallback options so I think that removing the option : should be ok. Link: http://lkml.kernel.org/r/20171003072619.8654-1-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Alexandru Moise <00moses.alexander00@gmail.com> Acked-by: Mel Gorman Cc: Alexandru Moise <00moses.alexander00@gmail.com> Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 557d46728577..2fb4e27c636a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1374,13 +1374,6 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "hugepages_treat_as_movable", - .data = &hugepages_treat_as_movable, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, { .procname = "nr_overcommit_hugepages", .data = NULL, -- cgit v1.2.2 From 1beaeacdc88b537703d04d5536235d0bbb36db93 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 Jan 2018 19:36:32 +0100 Subject: genirq: Make legacy autoprobing work again Meelis reported the following warning on a quad P3 HP NetServer museum piece: WARNING: CPU: 3 PID: 258 at kernel/irq/chip.c:244 __irq_startup+0x80/0x100 EIP: __irq_startup+0x80/0x100 irq_startup+0x7e/0x170 probe_irq_on+0x128/0x2b0 parport_irq_probe.constprop.18+0x8d/0x1af [parport_pc] parport_pc_probe_port+0xf11/0x1260 [parport_pc] parport_pc_init+0x78a/0xf10 [parport_pc] parport_parse_param.constprop.16+0xf0/0xf0 [parport_pc] do_one_initcall+0x45/0x1e0 This is caused by the rewrite of the irq activation/startup sequence which missed to convert a callsite in the irq legacy auto probing code. To fix this irq_activate_and_startup() needs to gain a return value so the pending logic can work proper. Fixes: c942cee46bba ("genirq: Separate activation and startup") Reported-by: Meelis Roos Signed-off-by: Thomas Gleixner Tested-by: Meelis Roos Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801301935410.1797@nanos --- kernel/irq/autoprobe.c | 2 +- kernel/irq/chip.c | 6 +++--- kernel/irq/internals.h | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 4e8089b319ae..8c82ea26e837 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -71,7 +71,7 @@ unsigned long probe_irq_on(void) raw_spin_lock_irq(&desc->lock); if (!desc->action && irq_settings_can_probe(desc)) { desc->istate |= IRQS_AUTODETECT | IRQS_WAITING; - if (irq_startup(desc, IRQ_NORESEND, IRQ_START_FORCE)) + if (irq_activate_and_startup(desc, IRQ_NORESEND)) desc->istate |= IRQS_PENDING; } raw_spin_unlock_irq(&desc->lock); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 043bfc35b353..c69357a43849 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -294,11 +294,11 @@ int irq_activate(struct irq_desc *desc) return 0; } -void irq_activate_and_startup(struct irq_desc *desc, bool resend) +int irq_activate_and_startup(struct irq_desc *desc, bool resend) { if (WARN_ON(irq_activate(desc))) - return; - irq_startup(desc, resend, IRQ_START_FORCE); + return 0; + return irq_startup(desc, resend, IRQ_START_FORCE); } static void __irq_disable(struct irq_desc *desc, bool mask); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index ab19371eab9b..ca6afa267070 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -76,7 +76,7 @@ extern void __enable_irq(struct irq_desc *desc); #define IRQ_START_COND false extern int irq_activate(struct irq_desc *desc); -extern void irq_activate_and_startup(struct irq_desc *desc, bool resend); +extern int irq_activate_and_startup(struct irq_desc *desc, bool resend); extern int irq_startup(struct irq_desc *desc, bool resend, bool force); extern void irq_shutdown(struct irq_desc *desc); -- cgit v1.2.2 From 328008a72d38b5bde6491e463405c34a81a65d3e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 2 Feb 2018 15:56:18 +0100 Subject: x86/power: Fix swsusp_arch_resume prototype The declaration for swsusp_arch_resume marks it as 'asmlinkage', but the definition in x86-32 does not, and it fails to include the header with the declaration. This leads to a warning when building with link-time-optimizations: kernel/power/power.h:108:23: error: type of 'swsusp_arch_resume' does not match original declaration [-Werror=lto-type-mismatch] extern asmlinkage int swsusp_arch_resume(void); ^ arch/x86/power/hibernate_32.c:148:0: note: 'swsusp_arch_resume' was previously declared here int swsusp_arch_resume(void) This moves the declaration into a globally visible header file and fixes up both x86 definitions to match it. Signed-off-by: Arnd Bergmann Signed-off-by: Thomas Gleixner Cc: Len Brown Cc: Andi Kleen Cc: Nicolas Pitre Cc: linux-pm@vger.kernel.org Cc: "Rafael J. Wysocki" Cc: Pavel Machek Cc: Bart Van Assche Link: https://lkml.kernel.org/r/20180202145634.200291-2-arnd@arndb.de --- kernel/power/power.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/power.h b/kernel/power/power.h index f29cd178df90..9e58bdc8a562 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -104,9 +104,6 @@ extern int in_suspend; extern dev_t swsusp_resume_device; extern sector_t swsusp_resume_block; -extern asmlinkage int swsusp_arch_suspend(void); -extern asmlinkage int swsusp_arch_resume(void); - extern int create_basic_memory_bitmaps(void); extern void free_basic_memory_bitmaps(void); extern int hibernate_preallocate_memory(void); -- cgit v1.2.2