aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.freezer1
-rw-r--r--kernel/Kconfig.hz1
-rw-r--r--kernel/Kconfig.locks1
-rw-r--r--kernel/Kconfig.preempt1
-rw-r--r--kernel/Makefile4
-rw-r--r--kernel/async.c6
-rw-r--r--kernel/audit.c42
-rw-r--r--kernel/audit.h23
-rw-r--r--kernel/audit_fsnotify.c11
-rw-r--r--kernel/audit_watch.c15
-rw-r--r--kernel/auditfilter.c77
-rw-r--r--kernel/auditsc.c42
-rw-r--r--kernel/backtracetest.c6
-rw-r--r--kernel/bpf/Makefile1
-rw-r--r--kernel/bpf/arraymap.c28
-rw-r--r--kernel/bpf/bpf_lru_list.c5
-rw-r--r--kernel/bpf/bpf_lru_list.h5
-rw-r--r--kernel/bpf/btf.c12
-rw-r--r--kernel/bpf/cgroup.c453
-rw-r--r--kernel/bpf/core.c67
-rw-r--r--kernel/bpf/cpumap.c119
-rw-r--r--kernel/bpf/devmap.c142
-rw-r--r--kernel/bpf/disasm.c10
-rw-r--r--kernel/bpf/disasm.h10
-rw-r--r--kernel/bpf/hashtab.c47
-rw-r--r--kernel/bpf/helpers.c10
-rw-r--r--kernel/bpf/inode.c7
-rw-r--r--kernel/bpf/local_storage.c13
-rw-r--r--kernel/bpf/lpm_trie.c22
-rw-r--r--kernel/bpf/map_in_map.c5
-rw-r--r--kernel/bpf/map_in_map.h5
-rw-r--r--kernel/bpf/percpu_freelist.c5
-rw-r--r--kernel/bpf/percpu_freelist.h5
-rw-r--r--kernel/bpf/queue_stack_maps.c13
-rw-r--r--kernel/bpf/reuseport_array.c17
-rw-r--r--kernel/bpf/stackmap.c33
-rw-r--r--kernel/bpf/syscall.c147
-rw-r--r--kernel/bpf/tnum.c1
-rw-r--r--kernel/bpf/verifier.c1305
-rw-r--r--kernel/bpf/xskmap.c22
-rw-r--r--kernel/cgroup/cgroup-v1.c1
-rw-r--r--kernel/cgroup/cgroup.c269
-rw-r--r--kernel/cgroup/cpuset.c19
-rw-r--r--kernel/cgroup/pids.c5
-rw-r--r--kernel/cgroup/rdma.c5
-rw-r--r--kernel/cgroup/rstat.c1
-rw-r--r--kernel/compat.c8
-rw-r--r--kernel/context_tracking.c1
-rw-r--r--kernel/cpu.c19
-rw-r--r--kernel/cpu_pm.c11
-rw-r--r--kernel/crash_core.c4
-rw-r--r--kernel/crash_dump.c1
-rw-r--r--kernel/cred.c28
-rw-r--r--kernel/debug/Makefile1
-rw-r--r--kernel/debug/gdbstub.c9
-rw-r--r--kernel/debug/kdb/Makefile1
-rw-r--r--kernel/debug/kdb/kdb_io.c2
-rw-r--r--kernel/debug/kdb/kdb_main.c3
-rw-r--r--kernel/debug/kdb/kdb_support.c2
-rw-r--r--kernel/delayacct.c11
-rw-r--r--kernel/dma/Kconfig1
-rw-r--r--kernel/dma/contiguous.c56
-rw-r--r--kernel/dma/debug.c14
-rw-r--r--kernel/dma/direct.c55
-rw-r--r--kernel/dma/mapping.c12
-rw-r--r--kernel/dma/remap.c16
-rw-r--r--kernel/dma/swiotlb.c26
-rw-r--r--kernel/events/core.c79
-rw-r--r--kernel/events/internal.h4
-rw-r--r--kernel/events/ring_buffer.c64
-rw-r--r--kernel/events/uprobes.c8
-rw-r--r--kernel/exit.c9
-rw-r--r--kernel/extable.c14
-rw-r--r--kernel/fail_function.c23
-rw-r--r--kernel/fork.c315
-rw-r--r--kernel/freezer.c1
-rw-r--r--kernel/futex.c84
-rw-r--r--kernel/gcov/Kconfig4
-rw-r--r--kernel/gcov/Makefile5
-rw-r--r--kernel/gcov/base.c86
-rw-r--r--kernel/gcov/clang.c581
-rw-r--r--kernel/gcov/fs.c24
-rw-r--r--kernel/gcov/gcc_3_4.c12
-rw-r--r--kernel/gcov/gcc_4_7.c12
-rw-r--r--kernel/gcov/gcc_base.c86
-rw-r--r--kernel/gcov/gcov.h5
-rwxr-xr-xkernel/gen_kheaders.sh (renamed from kernel/gen_ikh_data.sh)56
-rw-r--r--kernel/hung_task.c1
-rw-r--r--kernel/iomem.c2
-rw-r--r--kernel/irq/Kconfig4
-rw-r--r--kernel/irq/Makefile3
-rw-r--r--kernel/irq/affinity.c12
-rw-r--r--kernel/irq/autoprobe.c6
-rw-r--r--kernel/irq/chip.c37
-rw-r--r--kernel/irq/cpuhotplug.c2
-rw-r--r--kernel/irq/internals.h26
-rw-r--r--kernel/irq/irqdesc.c16
-rw-r--r--kernel/irq/irqdomain.c6
-rw-r--r--kernel/irq/manage.c90
-rw-r--r--kernel/irq/timings.c453
-rw-r--r--kernel/irq_work.c1
-rw-r--r--kernel/jump_label.c65
-rw-r--r--kernel/kallsyms.c1
-rw-r--r--kernel/kexec.c4
-rw-r--r--kernel/kexec_core.c4
-rw-r--r--kernel/kexec_file.c13
-rw-r--r--kernel/kheaders.c40
-rw-r--r--kernel/kprobes.c40
-rw-r--r--kernel/ksysfs.c4
-rw-r--r--kernel/kthread.c2
-rw-r--r--kernel/latencytop.c14
-rw-r--r--kernel/livepatch/Kconfig1
-rw-r--r--kernel/livepatch/Makefile1
-rw-r--r--kernel/livepatch/core.c28
-rw-r--r--kernel/livepatch/patch.c14
-rw-r--r--kernel/livepatch/shadow.c14
-rw-r--r--kernel/livepatch/transition.c25
-rw-r--r--kernel/locking/Makefile2
-rw-r--r--kernel/locking/lock_events.h7
-rw-r--r--kernel/locking/lock_events_list.h12
-rw-r--r--kernel/locking/lockdep.c743
-rw-r--r--kernel/locking/lockdep_internals.h36
-rw-r--r--kernel/locking/locktorture.c2
-rw-r--r--kernel/locking/mutex.c1
-rw-r--r--kernel/locking/percpu-rwsem.c3
-rw-r--r--kernel/locking/qrwlock.c11
-rw-r--r--kernel/locking/qspinlock.c11
-rw-r--r--kernel/locking/qspinlock_stat.h10
-rw-r--r--kernel/locking/rtmutex.c1
-rw-r--r--kernel/locking/rwsem-xadd.c729
-rw-r--r--kernel/locking/rwsem.c1453
-rw-r--r--kernel/locking/rwsem.h306
-rw-r--r--kernel/locking/semaphore.c3
-rw-r--r--kernel/locking/test-ww_mutex.c15
-rw-r--r--kernel/memremap.c23
-rw-r--r--kernel/module-internal.h8
-rw-r--r--kernel/module.c40
-rw-r--r--kernel/module_signing.c6
-rw-r--r--kernel/notifier.c2
-rw-r--r--kernel/nsproxy.c6
-rw-r--r--kernel/panic.c12
-rw-r--r--kernel/params.c14
-rw-r--r--kernel/pid.c73
-rw-r--r--kernel/pid_namespace.c3
-rw-r--r--kernel/power/Kconfig1
-rw-r--r--kernel/power/energy_model.c2
-rw-r--r--kernel/power/hibernate.c16
-rw-r--r--kernel/power/main.c4
-rw-r--r--kernel/power/power.h2
-rw-r--r--kernel/power/poweroff.c3
-rw-r--r--kernel/power/qos.c1
-rw-r--r--kernel/power/snapshot.c4
-rw-r--r--kernel/power/suspend.c16
-rw-r--r--kernel/power/suspend_test.c3
-rw-r--r--kernel/power/swap.c7
-rw-r--r--kernel/power/user.c4
-rw-r--r--kernel/printk/Makefile1
-rw-r--r--kernel/printk/internal.h14
-rw-r--r--kernel/printk/printk.c23
-rw-r--r--kernel/printk/printk_safe.c14
-rw-r--r--kernel/profile.c1
-rw-r--r--kernel/ptrace.c28
-rw-r--r--kernel/rcu/Kconfig1
-rw-r--r--kernel/rcu/Kconfig.debug1
-rw-r--r--kernel/rcu/rcu.h14
-rw-r--r--kernel/rcu/rcutorture.c96
-rw-r--r--kernel/rcu/srcutree.c69
-rw-r--r--kernel/rcu/sync.c214
-rw-r--r--kernel/rcu/tree.c172
-rw-r--r--kernel/rcu/tree.h6
-rw-r--r--kernel/rcu/tree_exp.h53
-rw-r--r--kernel/rcu/tree_plugin.h195
-rw-r--r--kernel/rcu/tree_stall.h4
-rw-r--r--kernel/rcu/update.c13
-rw-r--r--kernel/reboot.c21
-rw-r--r--kernel/resource.c1
-rw-r--r--kernel/rseq.c4
-rw-r--r--kernel/sched/autogroup.c2
-rw-r--r--kernel/sched/clock.c1
-rw-r--r--kernel/sched/core.c534
-rw-r--r--kernel/sched/cpudeadline.c10
-rw-r--r--kernel/sched/cpufreq_schedutil.c24
-rw-r--r--kernel/sched/cpupri.c10
-rw-r--r--kernel/sched/cputime.c1
-rw-r--r--kernel/sched/deadline.c10
-rw-r--r--kernel/sched/debug.c48
-rw-r--r--kernel/sched/fair.c628
-rw-r--r--kernel/sched/features.h1
-rw-r--r--kernel/sched/idle.c1
-rw-r--r--kernel/sched/isolation.c1
-rw-r--r--kernel/sched/membarrier.c11
-rw-r--r--kernel/sched/pelt.c13
-rw-r--r--kernel/sched/pelt.h2
-rw-r--r--kernel/sched/psi.c617
-rw-r--r--kernel/sched/rt.c8
-rw-r--r--kernel/sched/sched-pelt.h2
-rw-r--r--kernel/sched/sched.h134
-rw-r--r--kernel/sched/topology.c18
-rw-r--r--kernel/sched/wait.c9
-rw-r--r--kernel/sched/wait_bit.c1
-rw-r--r--kernel/seccomp.c2
-rw-r--r--kernel/signal.c283
-rw-r--r--kernel/smp.c13
-rw-r--r--kernel/smpboot.c1
-rw-r--r--kernel/softirq.c5
-rw-r--r--kernel/stacktrace.c13
-rw-r--r--kernel/stop_machine.c22
-rw-r--r--kernel/sys.c62
-rw-r--r--kernel/sys_ni.c2
-rw-r--r--kernel/sysctl.c102
-rw-r--r--kernel/taskstats.c12
-rw-r--r--kernel/test_kprobes.c11
-rw-r--r--kernel/time/Kconfig1
-rw-r--r--kernel/time/Makefile1
-rw-r--r--kernel/time/alarmtimer.c1
-rw-r--r--kernel/time/clocksource.c4
-rw-r--r--kernel/time/hrtimer.c8
-rw-r--r--kernel/time/ntp.c4
-rw-r--r--kernel/time/posix-timers.c13
-rw-r--r--kernel/time/tick-sched.c2
-rw-r--r--kernel/time/time.c4
-rw-r--r--kernel/time/timekeeping.c5
-rw-r--r--kernel/time/timer_list.c36
-rw-r--r--kernel/time/vsyscall.c129
-rw-r--r--kernel/torture.c23
-rw-r--r--kernel/trace/Kconfig1
-rw-r--r--kernel/trace/blktrace.c6
-rw-r--r--kernel/trace/bpf_trace.c202
-rw-r--r--kernel/trace/ftrace.c21
-rw-r--r--kernel/trace/ring_buffer.c2
-rw-r--r--kernel/trace/ring_buffer_benchmark.c2
-rw-r--r--kernel/trace/trace.c451
-rw-r--r--kernel/trace/trace.h32
-rw-r--r--kernel/trace/trace_events.c4
-rw-r--r--kernel/trace/trace_events_filter.c92
-rw-r--r--kernel/trace/trace_events_hist.c279
-rw-r--r--kernel/trace/trace_events_trigger.c3
-rw-r--r--kernel/trace/trace_hwlat.c2
-rw-r--r--kernel/trace/trace_kdb.c67
-rw-r--r--kernel/trace/trace_kprobe.c77
-rw-r--r--kernel/trace/trace_output.c2
-rw-r--r--kernel/trace/trace_probe.c291
-rw-r--r--kernel/trace/trace_probe.h78
-rw-r--r--kernel/trace/trace_probe_tmpl.h2
-rw-r--r--kernel/trace/trace_selftest.c5
-rw-r--r--kernel/trace/trace_uprobe.c74
-rw-r--r--kernel/tracepoint.c15
-rw-r--r--kernel/tsacct.c13
-rw-r--r--kernel/ucount.c7
-rw-r--r--kernel/umh.c1
-rw-r--r--kernel/up.c4
-rw-r--r--kernel/user-return-notifier.c1
-rw-r--r--kernel/user.c16
-rw-r--r--kernel/user_namespace.c16
-rw-r--r--kernel/utsname.c6
-rw-r--r--kernel/utsname_sysctl.c6
-rw-r--r--kernel/workqueue.c29
257 files changed, 10205 insertions, 4708 deletions
diff --git a/kernel/Kconfig.freezer b/kernel/Kconfig.freezer
index a3bb4cb52539..68646feefb3d 100644
--- a/kernel/Kconfig.freezer
+++ b/kernel/Kconfig.freezer
@@ -1,2 +1,3 @@
1# SPDX-License-Identifier: GPL-2.0-only
1config FREEZER 2config FREEZER
2 def_bool PM_SLEEP || CGROUP_FREEZER 3 def_bool PM_SLEEP || CGROUP_FREEZER
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
index 2a202a846757..38ef6d06888e 100644
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -1,3 +1,4 @@
1# SPDX-License-Identifier: GPL-2.0-only
1# 2#
2# Timer Interrupt Frequency Configuration 3# Timer Interrupt Frequency Configuration
3# 4#
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index bf770d7556f7..e0852dc333ac 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -1,3 +1,4 @@
1# SPDX-License-Identifier: GPL-2.0-only
1# 2#
2# The ARCH_INLINE foo is necessary because select ignores "depends on" 3# The ARCH_INLINE foo is necessary because select ignores "depends on"
3# 4#
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 0fee5fe6c899..dc0b682ec2d9 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -1,3 +1,4 @@
1# SPDX-License-Identifier: GPL-2.0-only
1 2
2choice 3choice
3 prompt "Preemption Model" 4 prompt "Preemption Model"
diff --git a/kernel/Makefile b/kernel/Makefile
index 298437bb2c6a..a8d923b5481b 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -71,7 +71,7 @@ obj-$(CONFIG_UTS_NS) += utsname.o
71obj-$(CONFIG_USER_NS) += user_namespace.o 71obj-$(CONFIG_USER_NS) += user_namespace.o
72obj-$(CONFIG_PID_NS) += pid_namespace.o 72obj-$(CONFIG_PID_NS) += pid_namespace.o
73obj-$(CONFIG_IKCONFIG) += configs.o 73obj-$(CONFIG_IKCONFIG) += configs.o
74obj-$(CONFIG_IKHEADERS_PROC) += kheaders.o 74obj-$(CONFIG_IKHEADERS) += kheaders.o
75obj-$(CONFIG_SMP) += stop_machine.o 75obj-$(CONFIG_SMP) += stop_machine.o
76obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o 76obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
77obj-$(CONFIG_AUDIT) += audit.o auditfilter.o 77obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
@@ -127,7 +127,7 @@ $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
127$(obj)/kheaders.o: $(obj)/kheaders_data.tar.xz 127$(obj)/kheaders.o: $(obj)/kheaders_data.tar.xz
128 128
129quiet_cmd_genikh = CHK $(obj)/kheaders_data.tar.xz 129quiet_cmd_genikh = CHK $(obj)/kheaders_data.tar.xz
130cmd_genikh = $(srctree)/kernel/gen_ikh_data.sh $@ 130cmd_genikh = $(CONFIG_SHELL) $(srctree)/kernel/gen_kheaders.sh $@
131$(obj)/kheaders_data.tar.xz: FORCE 131$(obj)/kheaders_data.tar.xz: FORCE
132 $(call cmd,genikh) 132 $(call cmd,genikh)
133 133
diff --git a/kernel/async.c b/kernel/async.c
index 12c332e4e13e..4f9c1d614016 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -1,13 +1,9 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * async.c: Asynchronous function calls for boot performance 3 * async.c: Asynchronous function calls for boot performance
3 * 4 *
4 * (C) Copyright 2009 Intel Corporation 5 * (C) Copyright 2009 Intel Corporation
5 * Author: Arjan van de Ven <arjan@linux.intel.com> 6 * Author: Arjan van de Ven <arjan@linux.intel.com>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; version 2
10 * of the License.
11 */ 7 */
12 8
13 9
diff --git a/kernel/audit.c b/kernel/audit.c
index b96bf69183f4..da8dc0db5bd3 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
1/* audit.c -- Auditing support 2/* audit.c -- Auditing support
2 * Gateway between the kernel (e.g., selinux) and the user-space audit daemon. 3 * Gateway between the kernel (e.g., selinux) and the user-space audit daemon.
3 * System-call specific features have moved to auditsc.c 4 * System-call specific features have moved to auditsc.c
@@ -5,20 +6,6 @@
5 * Copyright 2003-2007 Red Hat Inc., Durham, North Carolina. 6 * Copyright 2003-2007 Red Hat Inc., Durham, North Carolina.
6 * All Rights Reserved. 7 * All Rights Reserved.
7 * 8 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 *
22 * Written by Rickard E. (Rik) Faith <faith@redhat.com> 9 * Written by Rickard E. (Rik) Faith <faith@redhat.com>
23 * 10 *
24 * Goals: 1) Integrate fully with Security Modules. 11 * Goals: 1) Integrate fully with Security Modules.
@@ -2274,6 +2261,33 @@ out:
2274} 2261}
2275 2262
2276/** 2263/**
2264 * audit_signal_info - record signal info for shutting down audit subsystem
2265 * @sig: signal value
2266 * @t: task being signaled
2267 *
2268 * If the audit subsystem is being terminated, record the task (pid)
2269 * and uid that is doing that.
2270 */
2271int audit_signal_info(int sig, struct task_struct *t)
2272{
2273 kuid_t uid = current_uid(), auid;
2274
2275 if (auditd_test_task(t) &&
2276 (sig == SIGTERM || sig == SIGHUP ||
2277 sig == SIGUSR1 || sig == SIGUSR2)) {
2278 audit_sig_pid = task_tgid_nr(current);
2279 auid = audit_get_loginuid(current);
2280 if (uid_valid(auid))
2281 audit_sig_uid = auid;
2282 else
2283 audit_sig_uid = uid;
2284 security_task_getsecid(current, &audit_sig_sid);
2285 }
2286
2287 return audit_signal_info_syscall(t);
2288}
2289
2290/**
2277 * audit_log_end - end one audit record 2291 * audit_log_end - end one audit record
2278 * @ab: the audit_buffer 2292 * @ab: the audit_buffer
2279 * 2293 *
diff --git a/kernel/audit.h b/kernel/audit.h
index 2071725a999f..6fb7160412d4 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -1,22 +1,9 @@
1/* SPDX-License-Identifier: GPL-2.0-or-later */
1/* audit -- definition of audit_context structure and supporting types 2/* audit -- definition of audit_context structure and supporting types
2 * 3 *
3 * Copyright 2003-2004 Red Hat, Inc. 4 * Copyright 2003-2004 Red Hat, Inc.
4 * Copyright 2005 Hewlett-Packard Development Company, L.P. 5 * Copyright 2005 Hewlett-Packard Development Company, L.P.
5 * Copyright 2005 IBM Corporation 6 * Copyright 2005 IBM Corporation
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */ 7 */
21 8
22#include <linux/fs.h> 9#include <linux/fs.h>
@@ -299,7 +286,7 @@ extern const char *audit_tree_path(struct audit_tree *tree);
299extern void audit_put_tree(struct audit_tree *tree); 286extern void audit_put_tree(struct audit_tree *tree);
300extern void audit_kill_trees(struct audit_context *context); 287extern void audit_kill_trees(struct audit_context *context);
301 288
302extern int audit_signal_info(int sig, struct task_struct *t); 289extern int audit_signal_info_syscall(struct task_struct *t);
303extern void audit_filter_inodes(struct task_struct *tsk, 290extern void audit_filter_inodes(struct task_struct *tsk,
304 struct audit_context *ctx); 291 struct audit_context *ctx);
305extern struct list_head *audit_killed_trees(void); 292extern struct list_head *audit_killed_trees(void);
@@ -330,7 +317,11 @@ extern struct list_head *audit_killed_trees(void);
330#define audit_tree_path(rule) "" /* never called */ 317#define audit_tree_path(rule) "" /* never called */
331#define audit_kill_trees(context) BUG() 318#define audit_kill_trees(context) BUG()
332 319
333#define audit_signal_info(s, t) AUDIT_DISABLED 320static inline int audit_signal_info_syscall(struct task_struct *t)
321{
322 return 0;
323}
324
334#define audit_filter_inodes(t, c) AUDIT_DISABLED 325#define audit_filter_inodes(t, c) AUDIT_DISABLED
335#endif /* CONFIG_AUDITSYSCALL */ 326#endif /* CONFIG_AUDITSYSCALL */
336 327
diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c
index b5737b826951..f0d243318452 100644
--- a/kernel/audit_fsnotify.c
+++ b/kernel/audit_fsnotify.c
@@ -1,18 +1,9 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
1/* audit_fsnotify.c -- tracking inodes 2/* audit_fsnotify.c -- tracking inodes
2 * 3 *
3 * Copyright 2003-2009,2014-2015 Red Hat, Inc. 4 * Copyright 2003-2009,2014-2015 Red Hat, Inc.
4 * Copyright 2005 Hewlett-Packard Development Company, L.P. 5 * Copyright 2005 Hewlett-Packard Development Company, L.P.
5 * Copyright 2005 IBM Corporation 6 * Copyright 2005 IBM Corporation
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 */ 7 */
17 8
18#include <linux/kernel.h> 9#include <linux/kernel.h>
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index b50c574223fa..1f31c2f1e6fc 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -1,22 +1,9 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
1/* audit_watch.c -- watching inodes 2/* audit_watch.c -- watching inodes
2 * 3 *
3 * Copyright 2003-2009 Red Hat, Inc. 4 * Copyright 2003-2009 Red Hat, Inc.
4 * Copyright 2005 Hewlett-Packard Development Company, L.P. 5 * Copyright 2005 Hewlett-Packard Development Company, L.P.
5 * Copyright 2005 IBM Corporation 6 * Copyright 2005 IBM Corporation
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */ 7 */
21 8
22#include <linux/file.h> 9#include <linux/file.h>
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 303fb04770ce..b0126e9c0743 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1,22 +1,9 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
1/* auditfilter.c -- filtering of audit events 2/* auditfilter.c -- filtering of audit events
2 * 3 *
3 * Copyright 2003-2004 Red Hat, Inc. 4 * Copyright 2003-2004 Red Hat, Inc.
4 * Copyright 2005 Hewlett-Packard Development Company, L.P. 5 * Copyright 2005 Hewlett-Packard Development Company, L.P.
5 * Copyright 2005 IBM Corporation 6 * Copyright 2005 IBM Corporation
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */ 7 */
21 8
22#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 9#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -335,7 +322,7 @@ static u32 audit_to_op(u32 op)
335/* check if an audit field is valid */ 322/* check if an audit field is valid */
336static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) 323static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
337{ 324{
338 switch(f->type) { 325 switch (f->type) {
339 case AUDIT_MSGTYPE: 326 case AUDIT_MSGTYPE:
340 if (entry->rule.listnr != AUDIT_FILTER_EXCLUDE && 327 if (entry->rule.listnr != AUDIT_FILTER_EXCLUDE &&
341 entry->rule.listnr != AUDIT_FILTER_USER) 328 entry->rule.listnr != AUDIT_FILTER_USER)
@@ -347,7 +334,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
347 break; 334 break;
348 } 335 }
349 336
350 switch(entry->rule.listnr) { 337 switch (entry->rule.listnr) {
351 case AUDIT_FILTER_FS: 338 case AUDIT_FILTER_FS:
352 switch(f->type) { 339 switch(f->type) {
353 case AUDIT_FSTYPE: 340 case AUDIT_FSTYPE:
@@ -358,9 +345,16 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
358 } 345 }
359 } 346 }
360 347
361 switch(f->type) { 348 /* Check for valid field type and op */
362 default: 349 switch (f->type) {
363 return -EINVAL; 350 case AUDIT_ARG0:
351 case AUDIT_ARG1:
352 case AUDIT_ARG2:
353 case AUDIT_ARG3:
354 case AUDIT_PERS: /* <uapi/linux/personality.h> */
355 case AUDIT_DEVMINOR:
356 /* all ops are valid */
357 break;
364 case AUDIT_UID: 358 case AUDIT_UID:
365 case AUDIT_EUID: 359 case AUDIT_EUID:
366 case AUDIT_SUID: 360 case AUDIT_SUID:
@@ -373,46 +367,53 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
373 case AUDIT_FSGID: 367 case AUDIT_FSGID:
374 case AUDIT_OBJ_GID: 368 case AUDIT_OBJ_GID:
375 case AUDIT_PID: 369 case AUDIT_PID:
376 case AUDIT_PERS:
377 case AUDIT_MSGTYPE: 370 case AUDIT_MSGTYPE:
378 case AUDIT_PPID: 371 case AUDIT_PPID:
379 case AUDIT_DEVMAJOR: 372 case AUDIT_DEVMAJOR:
380 case AUDIT_DEVMINOR:
381 case AUDIT_EXIT: 373 case AUDIT_EXIT:
382 case AUDIT_SUCCESS: 374 case AUDIT_SUCCESS:
383 case AUDIT_INODE: 375 case AUDIT_INODE:
384 case AUDIT_SESSIONID: 376 case AUDIT_SESSIONID:
377 case AUDIT_SUBJ_SEN:
378 case AUDIT_SUBJ_CLR:
379 case AUDIT_OBJ_LEV_LOW:
380 case AUDIT_OBJ_LEV_HIGH:
381 case AUDIT_SADDR_FAM:
385 /* bit ops are only useful on syscall args */ 382 /* bit ops are only useful on syscall args */
386 if (f->op == Audit_bitmask || f->op == Audit_bittest) 383 if (f->op == Audit_bitmask || f->op == Audit_bittest)
387 return -EINVAL; 384 return -EINVAL;
388 break; 385 break;
389 case AUDIT_ARG0:
390 case AUDIT_ARG1:
391 case AUDIT_ARG2:
392 case AUDIT_ARG3:
393 case AUDIT_SUBJ_USER: 386 case AUDIT_SUBJ_USER:
394 case AUDIT_SUBJ_ROLE: 387 case AUDIT_SUBJ_ROLE:
395 case AUDIT_SUBJ_TYPE: 388 case AUDIT_SUBJ_TYPE:
396 case AUDIT_SUBJ_SEN:
397 case AUDIT_SUBJ_CLR:
398 case AUDIT_OBJ_USER: 389 case AUDIT_OBJ_USER:
399 case AUDIT_OBJ_ROLE: 390 case AUDIT_OBJ_ROLE:
400 case AUDIT_OBJ_TYPE: 391 case AUDIT_OBJ_TYPE:
401 case AUDIT_OBJ_LEV_LOW:
402 case AUDIT_OBJ_LEV_HIGH:
403 case AUDIT_WATCH: 392 case AUDIT_WATCH:
404 case AUDIT_DIR: 393 case AUDIT_DIR:
405 case AUDIT_FILTERKEY: 394 case AUDIT_FILTERKEY:
406 break;
407 case AUDIT_LOGINUID_SET: 395 case AUDIT_LOGINUID_SET:
408 if ((f->val != 0) && (f->val != 1))
409 return -EINVAL;
410 /* FALL THROUGH */
411 case AUDIT_ARCH: 396 case AUDIT_ARCH:
412 case AUDIT_FSTYPE: 397 case AUDIT_FSTYPE:
398 case AUDIT_PERM:
399 case AUDIT_FILETYPE:
400 case AUDIT_FIELD_COMPARE:
401 case AUDIT_EXE:
402 /* only equal and not equal valid ops */
413 if (f->op != Audit_not_equal && f->op != Audit_equal) 403 if (f->op != Audit_not_equal && f->op != Audit_equal)
414 return -EINVAL; 404 return -EINVAL;
415 break; 405 break;
406 default:
407 /* field not recognized */
408 return -EINVAL;
409 }
410
411 /* Check for select valid field values */
412 switch (f->type) {
413 case AUDIT_LOGINUID_SET:
414 if ((f->val != 0) && (f->val != 1))
415 return -EINVAL;
416 break;
416 case AUDIT_PERM: 417 case AUDIT_PERM:
417 if (f->val & ~15) 418 if (f->val & ~15)
418 return -EINVAL; 419 return -EINVAL;
@@ -425,11 +426,14 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
425 if (f->val > AUDIT_MAX_FIELD_COMPARE) 426 if (f->val > AUDIT_MAX_FIELD_COMPARE)
426 return -EINVAL; 427 return -EINVAL;
427 break; 428 break;
428 case AUDIT_EXE: 429 case AUDIT_SADDR_FAM:
429 if (f->op != Audit_not_equal && f->op != Audit_equal) 430 if (f->val >= AF_MAX)
430 return -EINVAL; 431 return -EINVAL;
431 break; 432 break;
433 default:
434 break;
432 } 435 }
436
433 return 0; 437 return 0;
434} 438}
435 439
@@ -1203,7 +1207,6 @@ int audit_comparator(u32 left, u32 op, u32 right)
1203 case Audit_bittest: 1207 case Audit_bittest:
1204 return ((left & right) == right); 1208 return ((left & right) == right);
1205 default: 1209 default:
1206 BUG();
1207 return 0; 1210 return 0;
1208 } 1211 }
1209} 1212}
@@ -1226,7 +1229,6 @@ int audit_uid_comparator(kuid_t left, u32 op, kuid_t right)
1226 case Audit_bitmask: 1229 case Audit_bitmask:
1227 case Audit_bittest: 1230 case Audit_bittest:
1228 default: 1231 default:
1229 BUG();
1230 return 0; 1232 return 0;
1231 } 1233 }
1232} 1234}
@@ -1249,7 +1251,6 @@ int audit_gid_comparator(kgid_t left, u32 op, kgid_t right)
1249 case Audit_bitmask: 1251 case Audit_bitmask:
1250 case Audit_bittest: 1252 case Audit_bittest:
1251 default: 1253 default:
1252 BUG();
1253 return 0; 1254 return 0;
1254 } 1255 }
1255} 1256}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 95ae27edd417..4effe01ebbe2 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -601,12 +601,20 @@ static int audit_filter_rules(struct task_struct *tsk,
601 } 601 }
602 break; 602 break;
603 case AUDIT_WATCH: 603 case AUDIT_WATCH:
604 if (name) 604 if (name) {
605 result = audit_watch_compare(rule->watch, name->ino, name->dev); 605 result = audit_watch_compare(rule->watch,
606 name->ino,
607 name->dev);
608 if (f->op == Audit_not_equal)
609 result = !result;
610 }
606 break; 611 break;
607 case AUDIT_DIR: 612 case AUDIT_DIR:
608 if (ctx) 613 if (ctx) {
609 result = match_tree_refs(ctx, rule->tree); 614 result = match_tree_refs(ctx, rule->tree);
615 if (f->op == Audit_not_equal)
616 result = !result;
617 }
610 break; 618 break;
611 case AUDIT_LOGINUID: 619 case AUDIT_LOGINUID:
612 result = audit_uid_comparator(audit_get_loginuid(tsk), 620 result = audit_uid_comparator(audit_get_loginuid(tsk),
@@ -615,6 +623,11 @@ static int audit_filter_rules(struct task_struct *tsk,
615 case AUDIT_LOGINUID_SET: 623 case AUDIT_LOGINUID_SET:
616 result = audit_comparator(audit_loginuid_set(tsk), f->op, f->val); 624 result = audit_comparator(audit_loginuid_set(tsk), f->op, f->val);
617 break; 625 break;
626 case AUDIT_SADDR_FAM:
627 if (ctx->sockaddr)
628 result = audit_comparator(ctx->sockaddr->ss_family,
629 f->op, f->val);
630 break;
618 case AUDIT_SUBJ_USER: 631 case AUDIT_SUBJ_USER:
619 case AUDIT_SUBJ_ROLE: 632 case AUDIT_SUBJ_ROLE:
620 case AUDIT_SUBJ_TYPE: 633 case AUDIT_SUBJ_TYPE:
@@ -684,9 +697,13 @@ static int audit_filter_rules(struct task_struct *tsk,
684 break; 697 break;
685 case AUDIT_PERM: 698 case AUDIT_PERM:
686 result = audit_match_perm(ctx, f->val); 699 result = audit_match_perm(ctx, f->val);
700 if (f->op == Audit_not_equal)
701 result = !result;
687 break; 702 break;
688 case AUDIT_FILETYPE: 703 case AUDIT_FILETYPE:
689 result = audit_match_filetype(ctx, f->val); 704 result = audit_match_filetype(ctx, f->val);
705 if (f->op == Audit_not_equal)
706 result = !result;
690 break; 707 break;
691 case AUDIT_FIELD_COMPARE: 708 case AUDIT_FIELD_COMPARE:
692 result = audit_field_compare(tsk, cred, f, ctx, name); 709 result = audit_field_compare(tsk, cred, f, ctx, name);
@@ -2360,30 +2377,17 @@ void __audit_ptrace(struct task_struct *t)
2360} 2377}
2361 2378
2362/** 2379/**
2363 * audit_signal_info - record signal info for shutting down audit subsystem 2380 * audit_signal_info_syscall - record signal info for syscalls
2364 * @sig: signal value
2365 * @t: task being signaled 2381 * @t: task being signaled
2366 * 2382 *
2367 * If the audit subsystem is being terminated, record the task (pid) 2383 * If the audit subsystem is being terminated, record the task (pid)
2368 * and uid that is doing that. 2384 * and uid that is doing that.
2369 */ 2385 */
2370int audit_signal_info(int sig, struct task_struct *t) 2386int audit_signal_info_syscall(struct task_struct *t)
2371{ 2387{
2372 struct audit_aux_data_pids *axp; 2388 struct audit_aux_data_pids *axp;
2373 struct audit_context *ctx = audit_context(); 2389 struct audit_context *ctx = audit_context();
2374 kuid_t uid = current_uid(), auid, t_uid = task_uid(t); 2390 kuid_t t_uid = task_uid(t);
2375
2376 if (auditd_test_task(t) &&
2377 (sig == SIGTERM || sig == SIGHUP ||
2378 sig == SIGUSR1 || sig == SIGUSR2)) {
2379 audit_sig_pid = task_tgid_nr(current);
2380 auid = audit_get_loginuid(current);
2381 if (uid_valid(auid))
2382 audit_sig_uid = auid;
2383 else
2384 audit_sig_uid = uid;
2385 security_task_getsecid(current, &audit_sig_sid);
2386 }
2387 2391
2388 if (!audit_signals || audit_dummy_context()) 2392 if (!audit_signals || audit_dummy_context())
2389 return 0; 2393 return 0;
diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c
index a563c8fdad0d..a2a97fa3071b 100644
--- a/kernel/backtracetest.c
+++ b/kernel/backtracetest.c
@@ -1,13 +1,9 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * Simple stack backtrace regression test module 3 * Simple stack backtrace regression test module
3 * 4 *
4 * (C) Copyright 2008 Intel Corporation 5 * (C) Copyright 2008 Intel Corporation
5 * Author: Arjan van de Ven <arjan@linux.intel.com> 6 * Author: Arjan van de Ven <arjan@linux.intel.com>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; version 2
10 * of the License.
11 */ 7 */
12 8
13#include <linux/completion.h> 9#include <linux/completion.h>
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 4c2fa3ac56f6..29d781061cd5 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,5 +1,6 @@
1# SPDX-License-Identifier: GPL-2.0 1# SPDX-License-Identifier: GPL-2.0
2obj-y := core.o 2obj-y := core.o
3CFLAGS_core.o += $(call cc-disable-warning, override-init)
3 4
4obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o 5obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o
5obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o 6obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 584636c9e2eb..1c65ce0098a9 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -1,14 +1,6 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com 2/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
2 * Copyright (c) 2016,2017 Facebook 3 * Copyright (c) 2016,2017 Facebook
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of version 2 of the GNU General Public
6 * License as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 */ 4 */
13#include <linux/bpf.h> 5#include <linux/bpf.h>
14#include <linux/btf.h> 6#include <linux/btf.h>
@@ -83,6 +75,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
83 u32 elem_size, index_mask, max_entries; 75 u32 elem_size, index_mask, max_entries;
84 bool unpriv = !capable(CAP_SYS_ADMIN); 76 bool unpriv = !capable(CAP_SYS_ADMIN);
85 u64 cost, array_size, mask64; 77 u64 cost, array_size, mask64;
78 struct bpf_map_memory mem;
86 struct bpf_array *array; 79 struct bpf_array *array;
87 80
88 elem_size = round_up(attr->value_size, 8); 81 elem_size = round_up(attr->value_size, 8);
@@ -116,32 +109,29 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
116 109
117 /* make sure there is no u32 overflow later in round_up() */ 110 /* make sure there is no u32 overflow later in round_up() */
118 cost = array_size; 111 cost = array_size;
119 if (cost >= U32_MAX - PAGE_SIZE) 112 if (percpu)
120 return ERR_PTR(-ENOMEM);
121 if (percpu) {
122 cost += (u64)attr->max_entries * elem_size * num_possible_cpus(); 113 cost += (u64)attr->max_entries * elem_size * num_possible_cpus();
123 if (cost >= U32_MAX - PAGE_SIZE)
124 return ERR_PTR(-ENOMEM);
125 }
126 cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
127 114
128 ret = bpf_map_precharge_memlock(cost); 115 ret = bpf_map_charge_init(&mem, cost);
129 if (ret < 0) 116 if (ret < 0)
130 return ERR_PTR(ret); 117 return ERR_PTR(ret);
131 118
132 /* allocate all map elements and zero-initialize them */ 119 /* allocate all map elements and zero-initialize them */
133 array = bpf_map_area_alloc(array_size, numa_node); 120 array = bpf_map_area_alloc(array_size, numa_node);
134 if (!array) 121 if (!array) {
122 bpf_map_charge_finish(&mem);
135 return ERR_PTR(-ENOMEM); 123 return ERR_PTR(-ENOMEM);
124 }
136 array->index_mask = index_mask; 125 array->index_mask = index_mask;
137 array->map.unpriv_array = unpriv; 126 array->map.unpriv_array = unpriv;
138 127
139 /* copy mandatory map attributes */ 128 /* copy mandatory map attributes */
140 bpf_map_init_from_attr(&array->map, attr); 129 bpf_map_init_from_attr(&array->map, attr);
141 array->map.pages = cost; 130 bpf_map_charge_move(&array->map.memory, &mem);
142 array->elem_size = elem_size; 131 array->elem_size = elem_size;
143 132
144 if (percpu && bpf_array_alloc_percpu(array)) { 133 if (percpu && bpf_array_alloc_percpu(array)) {
134 bpf_map_charge_finish(&array->map.memory);
145 bpf_map_area_free(array); 135 bpf_map_area_free(array);
146 return ERR_PTR(-ENOMEM); 136 return ERR_PTR(-ENOMEM);
147 } 137 }
diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c
index e6ef4401a138..1b6b9349cb85 100644
--- a/kernel/bpf/bpf_lru_list.c
+++ b/kernel/bpf/bpf_lru_list.c
@@ -1,8 +1,5 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* Copyright (c) 2016 Facebook 2/* Copyright (c) 2016 Facebook
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 */ 3 */
7#include <linux/cpumask.h> 4#include <linux/cpumask.h>
8#include <linux/spinlock.h> 5#include <linux/spinlock.h>
diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h
index 7d4f89b7cb84..f02504640e18 100644
--- a/kernel/bpf/bpf_lru_list.h
+++ b/kernel/bpf/bpf_lru_list.h
@@ -1,8 +1,5 @@
1/* SPDX-License-Identifier: GPL-2.0-only */
1/* Copyright (c) 2016 Facebook 2/* Copyright (c) 2016 Facebook
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 */ 3 */
7#ifndef __BPF_LRU_LIST_H_ 4#ifndef __BPF_LRU_LIST_H_
8#define __BPF_LRU_LIST_H_ 5#define __BPF_LRU_LIST_H_
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index cad09858a5f2..546ebee39e2a 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -1928,8 +1928,8 @@ static int btf_array_resolve(struct btf_verifier_env *env,
1928 /* Check array->index_type */ 1928 /* Check array->index_type */
1929 index_type_id = array->index_type; 1929 index_type_id = array->index_type;
1930 index_type = btf_type_by_id(btf, index_type_id); 1930 index_type = btf_type_by_id(btf, index_type_id);
1931 if (btf_type_is_resolve_source_only(index_type) || 1931 if (btf_type_nosize_or_null(index_type) ||
1932 btf_type_nosize_or_null(index_type)) { 1932 btf_type_is_resolve_source_only(index_type)) {
1933 btf_verifier_log_type(env, v->t, "Invalid index"); 1933 btf_verifier_log_type(env, v->t, "Invalid index");
1934 return -EINVAL; 1934 return -EINVAL;
1935 } 1935 }
@@ -1948,8 +1948,8 @@ static int btf_array_resolve(struct btf_verifier_env *env,
1948 /* Check array->type */ 1948 /* Check array->type */
1949 elem_type_id = array->type; 1949 elem_type_id = array->type;
1950 elem_type = btf_type_by_id(btf, elem_type_id); 1950 elem_type = btf_type_by_id(btf, elem_type_id);
1951 if (btf_type_is_resolve_source_only(elem_type) || 1951 if (btf_type_nosize_or_null(elem_type) ||
1952 btf_type_nosize_or_null(elem_type)) { 1952 btf_type_is_resolve_source_only(elem_type)) {
1953 btf_verifier_log_type(env, v->t, 1953 btf_verifier_log_type(env, v->t,
1954 "Invalid elem"); 1954 "Invalid elem");
1955 return -EINVAL; 1955 return -EINVAL;
@@ -2170,8 +2170,8 @@ static int btf_struct_resolve(struct btf_verifier_env *env,
2170 const struct btf_type *member_type = btf_type_by_id(env->btf, 2170 const struct btf_type *member_type = btf_type_by_id(env->btf,
2171 member_type_id); 2171 member_type_id);
2172 2172
2173 if (btf_type_is_resolve_source_only(member_type) || 2173 if (btf_type_nosize_or_null(member_type) ||
2174 btf_type_nosize_or_null(member_type)) { 2174 btf_type_is_resolve_source_only(member_type)) {
2175 btf_verifier_log_member(env, v->t, member, 2175 btf_verifier_log_member(env, v->t, member,
2176 "Invalid member"); 2176 "Invalid member");
2177 return -EINVAL; 2177 return -EINVAL;
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index fcde0f7b2585..0a00eaca6fae 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1,11 +1,8 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * Functions to manage eBPF programs attached to cgroups 3 * Functions to manage eBPF programs attached to cgroups
3 * 4 *
4 * Copyright (c) 2016 Daniel Mack 5 * Copyright (c) 2016 Daniel Mack
5 *
6 * This file is subject to the terms and conditions of version 2 of the GNU
7 * General Public License. See the file COPYING in the main directory of the
8 * Linux distribution for more details.
9 */ 6 */
10 7
11#include <linux/kernel.h> 8#include <linux/kernel.h>
@@ -18,19 +15,34 @@
18#include <linux/bpf.h> 15#include <linux/bpf.h>
19#include <linux/bpf-cgroup.h> 16#include <linux/bpf-cgroup.h>
20#include <net/sock.h> 17#include <net/sock.h>
18#include <net/bpf_sk_storage.h>
19
20#include "../cgroup/cgroup-internal.h"
21 21
22DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key); 22DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key);
23EXPORT_SYMBOL(cgroup_bpf_enabled_key); 23EXPORT_SYMBOL(cgroup_bpf_enabled_key);
24 24
25void cgroup_bpf_offline(struct cgroup *cgrp)
26{
27 cgroup_get(cgrp);
28 percpu_ref_kill(&cgrp->bpf.refcnt);
29}
30
25/** 31/**
26 * cgroup_bpf_put() - put references of all bpf programs 32 * cgroup_bpf_release() - put references of all bpf programs and
27 * @cgrp: the cgroup to modify 33 * release all cgroup bpf data
34 * @work: work structure embedded into the cgroup to modify
28 */ 35 */
29void cgroup_bpf_put(struct cgroup *cgrp) 36static void cgroup_bpf_release(struct work_struct *work)
30{ 37{
38 struct cgroup *cgrp = container_of(work, struct cgroup,
39 bpf.release_work);
31 enum bpf_cgroup_storage_type stype; 40 enum bpf_cgroup_storage_type stype;
41 struct bpf_prog_array *old_array;
32 unsigned int type; 42 unsigned int type;
33 43
44 mutex_lock(&cgroup_mutex);
45
34 for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) { 46 for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) {
35 struct list_head *progs = &cgrp->bpf.progs[type]; 47 struct list_head *progs = &cgrp->bpf.progs[type];
36 struct bpf_prog_list *pl, *tmp; 48 struct bpf_prog_list *pl, *tmp;
@@ -45,8 +57,29 @@ void cgroup_bpf_put(struct cgroup *cgrp)
45 kfree(pl); 57 kfree(pl);
46 static_branch_dec(&cgroup_bpf_enabled_key); 58 static_branch_dec(&cgroup_bpf_enabled_key);
47 } 59 }
48 bpf_prog_array_free(cgrp->bpf.effective[type]); 60 old_array = rcu_dereference_protected(
61 cgrp->bpf.effective[type],
62 lockdep_is_held(&cgroup_mutex));
63 bpf_prog_array_free(old_array);
49 } 64 }
65
66 mutex_unlock(&cgroup_mutex);
67
68 percpu_ref_exit(&cgrp->bpf.refcnt);
69 cgroup_put(cgrp);
70}
71
72/**
73 * cgroup_bpf_release_fn() - callback used to schedule releasing
74 * of bpf cgroup data
75 * @ref: percpu ref counter structure
76 */
77static void cgroup_bpf_release_fn(struct percpu_ref *ref)
78{
79 struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt);
80
81 INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release);
82 queue_work(system_wq, &cgrp->bpf.release_work);
50} 83}
51 84
52/* count number of elements in the list. 85/* count number of elements in the list.
@@ -101,7 +134,7 @@ static bool hierarchy_allows_attach(struct cgroup *cgrp,
101 */ 134 */
102static int compute_effective_progs(struct cgroup *cgrp, 135static int compute_effective_progs(struct cgroup *cgrp,
103 enum bpf_attach_type type, 136 enum bpf_attach_type type,
104 struct bpf_prog_array __rcu **array) 137 struct bpf_prog_array **array)
105{ 138{
106 enum bpf_cgroup_storage_type stype; 139 enum bpf_cgroup_storage_type stype;
107 struct bpf_prog_array *progs; 140 struct bpf_prog_array *progs;
@@ -139,17 +172,16 @@ static int compute_effective_progs(struct cgroup *cgrp,
139 } 172 }
140 } while ((p = cgroup_parent(p))); 173 } while ((p = cgroup_parent(p)));
141 174
142 rcu_assign_pointer(*array, progs); 175 *array = progs;
143 return 0; 176 return 0;
144} 177}
145 178
146static void activate_effective_progs(struct cgroup *cgrp, 179static void activate_effective_progs(struct cgroup *cgrp,
147 enum bpf_attach_type type, 180 enum bpf_attach_type type,
148 struct bpf_prog_array __rcu *array) 181 struct bpf_prog_array *old_array)
149{ 182{
150 struct bpf_prog_array __rcu *old_array; 183 rcu_swap_protected(cgrp->bpf.effective[type], old_array,
151 184 lockdep_is_held(&cgroup_mutex));
152 old_array = xchg(&cgrp->bpf.effective[type], array);
153 /* free prog array after grace period, since __cgroup_bpf_run_*() 185 /* free prog array after grace period, since __cgroup_bpf_run_*()
154 * might be still walking the array 186 * might be still walking the array
155 */ 187 */
@@ -166,8 +198,13 @@ int cgroup_bpf_inherit(struct cgroup *cgrp)
166 * that array below is variable length 198 * that array below is variable length
167 */ 199 */
168#define NR ARRAY_SIZE(cgrp->bpf.effective) 200#define NR ARRAY_SIZE(cgrp->bpf.effective)
169 struct bpf_prog_array __rcu *arrays[NR] = {}; 201 struct bpf_prog_array *arrays[NR] = {};
170 int i; 202 int ret, i;
203
204 ret = percpu_ref_init(&cgrp->bpf.refcnt, cgroup_bpf_release_fn, 0,
205 GFP_KERNEL);
206 if (ret)
207 return ret;
171 208
172 for (i = 0; i < NR; i++) 209 for (i = 0; i < NR; i++)
173 INIT_LIST_HEAD(&cgrp->bpf.progs[i]); 210 INIT_LIST_HEAD(&cgrp->bpf.progs[i]);
@@ -183,6 +220,9 @@ int cgroup_bpf_inherit(struct cgroup *cgrp)
183cleanup: 220cleanup:
184 for (i = 0; i < NR; i++) 221 for (i = 0; i < NR; i++)
185 bpf_prog_array_free(arrays[i]); 222 bpf_prog_array_free(arrays[i]);
223
224 percpu_ref_exit(&cgrp->bpf.refcnt);
225
186 return -ENOMEM; 226 return -ENOMEM;
187} 227}
188 228
@@ -196,6 +236,9 @@ static int update_effective_progs(struct cgroup *cgrp,
196 css_for_each_descendant_pre(css, &cgrp->self) { 236 css_for_each_descendant_pre(css, &cgrp->self) {
197 struct cgroup *desc = container_of(css, struct cgroup, self); 237 struct cgroup *desc = container_of(css, struct cgroup, self);
198 238
239 if (percpu_ref_is_zero(&desc->bpf.refcnt))
240 continue;
241
199 err = compute_effective_progs(desc, type, &desc->bpf.inactive); 242 err = compute_effective_progs(desc, type, &desc->bpf.inactive);
200 if (err) 243 if (err)
201 goto cleanup; 244 goto cleanup;
@@ -205,6 +248,14 @@ static int update_effective_progs(struct cgroup *cgrp,
205 css_for_each_descendant_pre(css, &cgrp->self) { 248 css_for_each_descendant_pre(css, &cgrp->self) {
206 struct cgroup *desc = container_of(css, struct cgroup, self); 249 struct cgroup *desc = container_of(css, struct cgroup, self);
207 250
251 if (percpu_ref_is_zero(&desc->bpf.refcnt)) {
252 if (unlikely(desc->bpf.inactive)) {
253 bpf_prog_array_free(desc->bpf.inactive);
254 desc->bpf.inactive = NULL;
255 }
256 continue;
257 }
258
208 activate_effective_progs(desc, type, desc->bpf.inactive); 259 activate_effective_progs(desc, type, desc->bpf.inactive);
209 desc->bpf.inactive = NULL; 260 desc->bpf.inactive = NULL;
210 } 261 }
@@ -444,10 +495,14 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
444 enum bpf_attach_type type = attr->query.attach_type; 495 enum bpf_attach_type type = attr->query.attach_type;
445 struct list_head *progs = &cgrp->bpf.progs[type]; 496 struct list_head *progs = &cgrp->bpf.progs[type];
446 u32 flags = cgrp->bpf.flags[type]; 497 u32 flags = cgrp->bpf.flags[type];
498 struct bpf_prog_array *effective;
447 int cnt, ret = 0, i; 499 int cnt, ret = 0, i;
448 500
501 effective = rcu_dereference_protected(cgrp->bpf.effective[type],
502 lockdep_is_held(&cgroup_mutex));
503
449 if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) 504 if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE)
450 cnt = bpf_prog_array_length(cgrp->bpf.effective[type]); 505 cnt = bpf_prog_array_length(effective);
451 else 506 else
452 cnt = prog_list_length(progs); 507 cnt = prog_list_length(progs);
453 508
@@ -464,8 +519,7 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
464 } 519 }
465 520
466 if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) { 521 if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) {
467 return bpf_prog_array_copy_to_user(cgrp->bpf.effective[type], 522 return bpf_prog_array_copy_to_user(effective, prog_ids, cnt);
468 prog_ids, cnt);
469 } else { 523 } else {
470 struct bpf_prog_list *pl; 524 struct bpf_prog_list *pl;
471 u32 id; 525 u32 id;
@@ -548,8 +602,16 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr,
548 * The program type passed in via @type must be suitable for network 602 * The program type passed in via @type must be suitable for network
549 * filtering. No further check is performed to assert that. 603 * filtering. No further check is performed to assert that.
550 * 604 *
551 * This function will return %-EPERM if any if an attached program was found 605 * For egress packets, this function can return:
552 * and if it returned != 1 during execution. In all other cases, 0 is returned. 606 * NET_XMIT_SUCCESS (0) - continue with packet output
607 * NET_XMIT_DROP (1) - drop packet and notify TCP to call cwr
608 * NET_XMIT_CN (2) - continue with packet output and notify TCP
609 * to call cwr
610 * -EPERM - drop packet
611 *
612 * For ingress packets, this function will return -EPERM if any
613 * attached program was found and if it returned != 1 during execution.
614 * Otherwise 0 is returned.
553 */ 615 */
554int __cgroup_bpf_run_filter_skb(struct sock *sk, 616int __cgroup_bpf_run_filter_skb(struct sock *sk,
555 struct sk_buff *skb, 617 struct sk_buff *skb,
@@ -575,12 +637,19 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
575 /* compute pointers for the bpf prog */ 637 /* compute pointers for the bpf prog */
576 bpf_compute_and_save_data_end(skb, &saved_data_end); 638 bpf_compute_and_save_data_end(skb, &saved_data_end);
577 639
578 ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb, 640 if (type == BPF_CGROUP_INET_EGRESS) {
579 __bpf_prog_run_save_cb); 641 ret = BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY(
642 cgrp->bpf.effective[type], skb, __bpf_prog_run_save_cb);
643 } else {
644 ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb,
645 __bpf_prog_run_save_cb);
646 ret = (ret == 1 ? 0 : -EPERM);
647 }
580 bpf_restore_data_end(skb, saved_data_end); 648 bpf_restore_data_end(skb, saved_data_end);
581 __skb_pull(skb, offset); 649 __skb_pull(skb, offset);
582 skb->sk = save_sk; 650 skb->sk = save_sk;
583 return ret == 1 ? 0 : -EPERM; 651
652 return ret;
584} 653}
585EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb); 654EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
586 655
@@ -870,6 +939,190 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
870} 939}
871EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl); 940EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl);
872 941
942#ifdef CONFIG_NET
943static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,
944 enum bpf_attach_type attach_type)
945{
946 struct bpf_prog_array *prog_array;
947 bool empty;
948
949 rcu_read_lock();
950 prog_array = rcu_dereference(cgrp->bpf.effective[attach_type]);
951 empty = bpf_prog_array_is_empty(prog_array);
952 rcu_read_unlock();
953
954 return empty;
955}
956
957static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
958{
959 if (unlikely(max_optlen > PAGE_SIZE) || max_optlen < 0)
960 return -EINVAL;
961
962 ctx->optval = kzalloc(max_optlen, GFP_USER);
963 if (!ctx->optval)
964 return -ENOMEM;
965
966 ctx->optval_end = ctx->optval + max_optlen;
967 ctx->optlen = max_optlen;
968
969 return 0;
970}
971
972static void sockopt_free_buf(struct bpf_sockopt_kern *ctx)
973{
974 kfree(ctx->optval);
975}
976
977int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
978 int *optname, char __user *optval,
979 int *optlen, char **kernel_optval)
980{
981 struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
982 struct bpf_sockopt_kern ctx = {
983 .sk = sk,
984 .level = *level,
985 .optname = *optname,
986 };
987 int ret;
988
989 /* Opportunistic check to see whether we have any BPF program
990 * attached to the hook so we don't waste time allocating
991 * memory and locking the socket.
992 */
993 if (!cgroup_bpf_enabled ||
994 __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT))
995 return 0;
996
997 ret = sockopt_alloc_buf(&ctx, *optlen);
998 if (ret)
999 return ret;
1000
1001 if (copy_from_user(ctx.optval, optval, *optlen) != 0) {
1002 ret = -EFAULT;
1003 goto out;
1004 }
1005
1006 lock_sock(sk);
1007 ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_SETSOCKOPT],
1008 &ctx, BPF_PROG_RUN);
1009 release_sock(sk);
1010
1011 if (!ret) {
1012 ret = -EPERM;
1013 goto out;
1014 }
1015
1016 if (ctx.optlen == -1) {
1017 /* optlen set to -1, bypass kernel */
1018 ret = 1;
1019 } else if (ctx.optlen > *optlen || ctx.optlen < -1) {
1020 /* optlen is out of bounds */
1021 ret = -EFAULT;
1022 } else {
1023 /* optlen within bounds, run kernel handler */
1024 ret = 0;
1025
1026 /* export any potential modifications */
1027 *level = ctx.level;
1028 *optname = ctx.optname;
1029 *optlen = ctx.optlen;
1030 *kernel_optval = ctx.optval;
1031 }
1032
1033out:
1034 if (ret)
1035 sockopt_free_buf(&ctx);
1036 return ret;
1037}
1038EXPORT_SYMBOL(__cgroup_bpf_run_filter_setsockopt);
1039
1040int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
1041 int optname, char __user *optval,
1042 int __user *optlen, int max_optlen,
1043 int retval)
1044{
1045 struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1046 struct bpf_sockopt_kern ctx = {
1047 .sk = sk,
1048 .level = level,
1049 .optname = optname,
1050 .retval = retval,
1051 };
1052 int ret;
1053
1054 /* Opportunistic check to see whether we have any BPF program
1055 * attached to the hook so we don't waste time allocating
1056 * memory and locking the socket.
1057 */
1058 if (!cgroup_bpf_enabled ||
1059 __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT))
1060 return retval;
1061
1062 ret = sockopt_alloc_buf(&ctx, max_optlen);
1063 if (ret)
1064 return ret;
1065
1066 if (!retval) {
1067 /* If kernel getsockopt finished successfully,
1068 * copy whatever was returned to the user back
1069 * into our temporary buffer. Set optlen to the
1070 * one that kernel returned as well to let
1071 * BPF programs inspect the value.
1072 */
1073
1074 if (get_user(ctx.optlen, optlen)) {
1075 ret = -EFAULT;
1076 goto out;
1077 }
1078
1079 if (ctx.optlen > max_optlen)
1080 ctx.optlen = max_optlen;
1081
1082 if (copy_from_user(ctx.optval, optval, ctx.optlen) != 0) {
1083 ret = -EFAULT;
1084 goto out;
1085 }
1086 }
1087
1088 lock_sock(sk);
1089 ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT],
1090 &ctx, BPF_PROG_RUN);
1091 release_sock(sk);
1092
1093 if (!ret) {
1094 ret = -EPERM;
1095 goto out;
1096 }
1097
1098 if (ctx.optlen > max_optlen) {
1099 ret = -EFAULT;
1100 goto out;
1101 }
1102
1103 /* BPF programs only allowed to set retval to 0, not some
1104 * arbitrary value.
1105 */
1106 if (ctx.retval != 0 && ctx.retval != retval) {
1107 ret = -EFAULT;
1108 goto out;
1109 }
1110
1111 if (copy_to_user(optval, ctx.optval, ctx.optlen) ||
1112 put_user(ctx.optlen, optlen)) {
1113 ret = -EFAULT;
1114 goto out;
1115 }
1116
1117 ret = ctx.retval;
1118
1119out:
1120 sockopt_free_buf(&ctx);
1121 return ret;
1122}
1123EXPORT_SYMBOL(__cgroup_bpf_run_filter_getsockopt);
1124#endif
1125
873static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp, 1126static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp,
874 size_t *lenp) 1127 size_t *lenp)
875{ 1128{
@@ -1130,3 +1383,155 @@ const struct bpf_verifier_ops cg_sysctl_verifier_ops = {
1130 1383
1131const struct bpf_prog_ops cg_sysctl_prog_ops = { 1384const struct bpf_prog_ops cg_sysctl_prog_ops = {
1132}; 1385};
1386
1387static const struct bpf_func_proto *
1388cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
1389{
1390 switch (func_id) {
1391#ifdef CONFIG_NET
1392 case BPF_FUNC_sk_storage_get:
1393 return &bpf_sk_storage_get_proto;
1394 case BPF_FUNC_sk_storage_delete:
1395 return &bpf_sk_storage_delete_proto;
1396#endif
1397#ifdef CONFIG_INET
1398 case BPF_FUNC_tcp_sock:
1399 return &bpf_tcp_sock_proto;
1400#endif
1401 default:
1402 return cgroup_base_func_proto(func_id, prog);
1403 }
1404}
1405
1406static bool cg_sockopt_is_valid_access(int off, int size,
1407 enum bpf_access_type type,
1408 const struct bpf_prog *prog,
1409 struct bpf_insn_access_aux *info)
1410{
1411 const int size_default = sizeof(__u32);
1412
1413 if (off < 0 || off >= sizeof(struct bpf_sockopt))
1414 return false;
1415
1416 if (off % size != 0)
1417 return false;
1418
1419 if (type == BPF_WRITE) {
1420 switch (off) {
1421 case offsetof(struct bpf_sockopt, retval):
1422 if (size != size_default)
1423 return false;
1424 return prog->expected_attach_type ==
1425 BPF_CGROUP_GETSOCKOPT;
1426 case offsetof(struct bpf_sockopt, optname):
1427 /* fallthrough */
1428 case offsetof(struct bpf_sockopt, level):
1429 if (size != size_default)
1430 return false;
1431 return prog->expected_attach_type ==
1432 BPF_CGROUP_SETSOCKOPT;
1433 case offsetof(struct bpf_sockopt, optlen):
1434 return size == size_default;
1435 default:
1436 return false;
1437 }
1438 }
1439
1440 switch (off) {
1441 case offsetof(struct bpf_sockopt, sk):
1442 if (size != sizeof(__u64))
1443 return false;
1444 info->reg_type = PTR_TO_SOCKET;
1445 break;
1446 case offsetof(struct bpf_sockopt, optval):
1447 if (size != sizeof(__u64))
1448 return false;
1449 info->reg_type = PTR_TO_PACKET;
1450 break;
1451 case offsetof(struct bpf_sockopt, optval_end):
1452 if (size != sizeof(__u64))
1453 return false;
1454 info->reg_type = PTR_TO_PACKET_END;
1455 break;
1456 case offsetof(struct bpf_sockopt, retval):
1457 if (size != size_default)
1458 return false;
1459 return prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT;
1460 default:
1461 if (size != size_default)
1462 return false;
1463 break;
1464 }
1465 return true;
1466}
1467
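The write rules above are easiest to read from the program side: a setsockopt program may rewrite level, optname and optlen but not retval, while a getsockopt program may write retval and optlen only. A hedged illustration, not from the patch; the socket option constants are the common asm-generic values and are only for show:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("cgroup/setsockopt")
int rewrite_optname(struct bpf_sockopt *ctx)
{
	/* Allowed here because expected_attach_type == BPF_CGROUP_SETSOCKOPT;
	 * a write to ctx->retval in this program would be rejected by
	 * cg_sockopt_is_valid_access().
	 */
	if (ctx->level == 1 /* SOL_SOCKET */ && ctx->optname == 15 /* SO_REUSEPORT */)
		ctx->optname = 2;	/* SO_REUSEADDR */

	return 1;			/* let the (possibly rewritten) setsockopt proceed */
}

char _license[] SEC("license") = "GPL";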
1468#define CG_SOCKOPT_ACCESS_FIELD(T, F) \
1469 T(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F), \
1470 si->dst_reg, si->src_reg, \
1471 offsetof(struct bpf_sockopt_kern, F))
1472
1473static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type,
1474 const struct bpf_insn *si,
1475 struct bpf_insn *insn_buf,
1476 struct bpf_prog *prog,
1477 u32 *target_size)
1478{
1479 struct bpf_insn *insn = insn_buf;
1480
1481 switch (si->off) {
1482 case offsetof(struct bpf_sockopt, sk):
1483 *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, sk);
1484 break;
1485 case offsetof(struct bpf_sockopt, level):
1486 if (type == BPF_WRITE)
1487 *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, level);
1488 else
1489 *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, level);
1490 break;
1491 case offsetof(struct bpf_sockopt, optname):
1492 if (type == BPF_WRITE)
1493 *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optname);
1494 else
1495 *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optname);
1496 break;
1497 case offsetof(struct bpf_sockopt, optlen):
1498 if (type == BPF_WRITE)
1499 *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optlen);
1500 else
1501 *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optlen);
1502 break;
1503 case offsetof(struct bpf_sockopt, retval):
1504 if (type == BPF_WRITE)
1505 *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, retval);
1506 else
1507 *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, retval);
1508 break;
1509 case offsetof(struct bpf_sockopt, optval):
1510 *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval);
1511 break;
1512 case offsetof(struct bpf_sockopt, optval_end):
1513 *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval_end);
1514 break;
1515 }
1516
1517 return insn - insn_buf;
1518}
1519
1520static int cg_sockopt_get_prologue(struct bpf_insn *insn_buf,
1521 bool direct_write,
1522 const struct bpf_prog *prog)
1523{
1524 /* Nothing to do for sockopt argument. The data is kzalloc'ated.
1525 */
1526 return 0;
1527}
1528
1529const struct bpf_verifier_ops cg_sockopt_verifier_ops = {
1530 .get_func_proto = cg_sockopt_func_proto,
1531 .is_valid_access = cg_sockopt_is_valid_access,
1532 .convert_ctx_access = cg_sockopt_convert_ctx_access,
1533 .gen_prologue = cg_sockopt_get_prologue,
1534};
1535
1536const struct bpf_prog_ops cg_sockopt_prog_ops = {
1537};
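To round out the picture, attaching one of these programs is a plain BPF_PROG_ATTACH against a cgroup fd. A sketch assuming libbpf; prog_fd and cgroup_fd are obtained elsewhere, for example via bpf_object__load() and open() on a cgroup v2 directory:

#include <bpf/bpf.h>

int attach_sockopt_progs(int getsockopt_prog_fd, int setsockopt_prog_fd,
			 int cgroup_fd)
{
	int err;

	err = bpf_prog_attach(getsockopt_prog_fd, cgroup_fd,
			      BPF_CGROUP_GETSOCKOPT, BPF_F_ALLOW_MULTI);
	if (err)
		return err;

	return bpf_prog_attach(setsockopt_prog_fd, cgroup_fd,
			       BPF_CGROUP_SETSOCKOPT, BPF_F_ALLOW_MULTI);
}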
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 242a643af82f..16079550db6d 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
1/* 2/*
2 * Linux Socket Filter - Kernel level socket filtering 3 * Linux Socket Filter - Kernel level socket filtering
3 * 4 *
@@ -12,11 +13,6 @@
12 * Alexei Starovoitov <ast@plumgrid.com> 13 * Alexei Starovoitov <ast@plumgrid.com>
13 * Daniel Borkmann <dborkman@redhat.com> 14 * Daniel Borkmann <dborkman@redhat.com>
14 * 15 *
15 * This program is free software; you can redistribute it and/or
16 * modify it under the terms of the GNU General Public License
17 * as published by the Free Software Foundation; either version
18 * 2 of the License, or (at your option) any later version.
19 *
20 * Andi Kleen - Fix a few bad bugs and races. 16 * Andi Kleen - Fix a few bad bugs and races.
21 * Kris Katterjohn - Added many additional checks in bpf_check_classic() 17 * Kris Katterjohn - Added many additional checks in bpf_check_classic()
22 */ 18 */
@@ -1368,10 +1364,10 @@ select_insn:
1368 insn++; 1364 insn++;
1369 CONT; 1365 CONT;
1370 ALU_ARSH_X: 1366 ALU_ARSH_X:
1371 DST = (u64) (u32) ((*(s32 *) &DST) >> SRC); 1367 DST = (u64) (u32) (((s32) DST) >> SRC);
1372 CONT; 1368 CONT;
1373 ALU_ARSH_K: 1369 ALU_ARSH_K:
1374 DST = (u64) (u32) ((*(s32 *) &DST) >> IMM); 1370 DST = (u64) (u32) (((s32) DST) >> IMM);
1375 CONT; 1371 CONT;
1376 ALU64_ARSH_X: 1372 ALU64_ARSH_X:
1377 (*(s64 *) &DST) >>= SRC; 1373 (*(s64 *) &DST) >>= SRC;
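On little-endian hosts both ALU_ARSH forms arithmetic-shift the low 32 bits of DST and zero-extend the result to 64 bits; the difference is that *(s32 *)&DST picks up whichever four bytes come first in memory (the high half on big-endian), while (s32)DST always truncates to the low 32 bits, which is presumably what this hunk fixes. A stand-alone check of the two expressions, not kernel code:

#include <stdint.h>
#include <stdio.h>

static uint64_t arsh32_old(uint64_t dst, uint32_t shift)
{
	return (uint64_t)(uint32_t)((*(int32_t *)&dst) >> shift);
}

static uint64_t arsh32_new(uint64_t dst, uint32_t shift)
{
	return (uint64_t)(uint32_t)(((int32_t)dst) >> shift);
}

int main(void)
{
	uint64_t dst = 0xffffffff80000000ULL;	/* negative value in the low word */

	/* Prints "f8000000 f8000000" on little-endian; on big-endian the
	 * old form would have read the high word instead.
	 */
	printf("%llx %llx\n",
	       (unsigned long long)arsh32_old(dst, 4),
	       (unsigned long long)arsh32_new(dst, 4));
	return 0;
}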
@@ -1795,38 +1791,42 @@ struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
1795 return &empty_prog_array.hdr; 1791 return &empty_prog_array.hdr;
1796} 1792}
1797 1793
1798void bpf_prog_array_free(struct bpf_prog_array __rcu *progs) 1794void bpf_prog_array_free(struct bpf_prog_array *progs)
1799{ 1795{
1800 if (!progs || 1796 if (!progs || progs == &empty_prog_array.hdr)
1801 progs == (struct bpf_prog_array __rcu *)&empty_prog_array.hdr)
1802 return; 1797 return;
1803 kfree_rcu(progs, rcu); 1798 kfree_rcu(progs, rcu);
1804} 1799}
1805 1800
1806int bpf_prog_array_length(struct bpf_prog_array __rcu *array) 1801int bpf_prog_array_length(struct bpf_prog_array *array)
1807{ 1802{
1808 struct bpf_prog_array_item *item; 1803 struct bpf_prog_array_item *item;
1809 u32 cnt = 0; 1804 u32 cnt = 0;
1810 1805
1811 rcu_read_lock(); 1806 for (item = array->items; item->prog; item++)
1812 item = rcu_dereference(array)->items;
1813 for (; item->prog; item++)
1814 if (item->prog != &dummy_bpf_prog.prog) 1807 if (item->prog != &dummy_bpf_prog.prog)
1815 cnt++; 1808 cnt++;
1816 rcu_read_unlock();
1817 return cnt; 1809 return cnt;
1818} 1810}
1819 1811
1812bool bpf_prog_array_is_empty(struct bpf_prog_array *array)
1813{
1814 struct bpf_prog_array_item *item;
1815
1816 for (item = array->items; item->prog; item++)
1817 if (item->prog != &dummy_bpf_prog.prog)
1818 return false;
1819 return true;
1820}
1820 1821
1821static bool bpf_prog_array_copy_core(struct bpf_prog_array __rcu *array, 1822static bool bpf_prog_array_copy_core(struct bpf_prog_array *array,
1822 u32 *prog_ids, 1823 u32 *prog_ids,
1823 u32 request_cnt) 1824 u32 request_cnt)
1824{ 1825{
1825 struct bpf_prog_array_item *item; 1826 struct bpf_prog_array_item *item;
1826 int i = 0; 1827 int i = 0;
1827 1828
1828 item = rcu_dereference_check(array, 1)->items; 1829 for (item = array->items; item->prog; item++) {
1829 for (; item->prog; item++) {
1830 if (item->prog == &dummy_bpf_prog.prog) 1830 if (item->prog == &dummy_bpf_prog.prog)
1831 continue; 1831 continue;
1832 prog_ids[i] = item->prog->aux->id; 1832 prog_ids[i] = item->prog->aux->id;
@@ -1839,7 +1839,7 @@ static bool bpf_prog_array_copy_core(struct bpf_prog_array __rcu *array,
1839 return !!(item->prog); 1839 return !!(item->prog);
1840} 1840}
1841 1841
1842int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *array, 1842int bpf_prog_array_copy_to_user(struct bpf_prog_array *array,
1843 __u32 __user *prog_ids, u32 cnt) 1843 __u32 __user *prog_ids, u32 cnt)
1844{ 1844{
1845 unsigned long err = 0; 1845 unsigned long err = 0;
@@ -1850,18 +1850,12 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *array,
1850 * cnt = bpf_prog_array_length(); 1850 * cnt = bpf_prog_array_length();
1851 * if (cnt > 0) 1851 * if (cnt > 0)
1852 * bpf_prog_array_copy_to_user(..., cnt); 1852 * bpf_prog_array_copy_to_user(..., cnt);
1853 * so below kcalloc doesn't need extra cnt > 0 check, but 1853 * so below kcalloc doesn't need extra cnt > 0 check.
1854 * bpf_prog_array_length() releases rcu lock and
1855 * prog array could have been swapped with empty or larger array,
1856 * so always copy 'cnt' prog_ids to the user.
1857 * In a rare race the user will see zero prog_ids
1858 */ 1854 */
1859 ids = kcalloc(cnt, sizeof(u32), GFP_USER | __GFP_NOWARN); 1855 ids = kcalloc(cnt, sizeof(u32), GFP_USER | __GFP_NOWARN);
1860 if (!ids) 1856 if (!ids)
1861 return -ENOMEM; 1857 return -ENOMEM;
1862 rcu_read_lock();
1863 nospc = bpf_prog_array_copy_core(array, ids, cnt); 1858 nospc = bpf_prog_array_copy_core(array, ids, cnt);
1864 rcu_read_unlock();
1865 err = copy_to_user(prog_ids, ids, cnt * sizeof(u32)); 1859 err = copy_to_user(prog_ids, ids, cnt * sizeof(u32));
1866 kfree(ids); 1860 kfree(ids);
1867 if (err) 1861 if (err)
@@ -1871,19 +1865,19 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *array,
1871 return 0; 1865 return 0;
1872} 1866}
1873 1867
1874void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *array, 1868void bpf_prog_array_delete_safe(struct bpf_prog_array *array,
1875 struct bpf_prog *old_prog) 1869 struct bpf_prog *old_prog)
1876{ 1870{
1877 struct bpf_prog_array_item *item = array->items; 1871 struct bpf_prog_array_item *item;
1878 1872
1879 for (; item->prog; item++) 1873 for (item = array->items; item->prog; item++)
1880 if (item->prog == old_prog) { 1874 if (item->prog == old_prog) {
1881 WRITE_ONCE(item->prog, &dummy_bpf_prog.prog); 1875 WRITE_ONCE(item->prog, &dummy_bpf_prog.prog);
1882 break; 1876 break;
1883 } 1877 }
1884} 1878}
1885 1879
1886int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, 1880int bpf_prog_array_copy(struct bpf_prog_array *old_array,
1887 struct bpf_prog *exclude_prog, 1881 struct bpf_prog *exclude_prog,
1888 struct bpf_prog *include_prog, 1882 struct bpf_prog *include_prog,
1889 struct bpf_prog_array **new_array) 1883 struct bpf_prog_array **new_array)
@@ -1947,7 +1941,7 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
1947 return 0; 1941 return 0;
1948} 1942}
1949 1943
1950int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array, 1944int bpf_prog_array_copy_info(struct bpf_prog_array *array,
1951 u32 *prog_ids, u32 request_cnt, 1945 u32 *prog_ids, u32 request_cnt,
1952 u32 *prog_cnt) 1946 u32 *prog_cnt)
1953{ 1947{
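With the __rcu annotation and the internal rcu_read_lock()/rcu_dereference() calls dropped from these helpers, the RCU pointer is expected to be resolved by the caller, either under rcu_read_lock() or while holding the mutex that protects updates, before the plain bpf_prog_array pointer is passed down. A sketch of that calling convention; the names are illustrative, not from the patch:

#include <linux/bpf.h>
#include <linux/rcupdate.h>

/* 'progs' is the __rcu-annotated pointer embedded in some containing
 * object, for example cgroup_bpf.effective[type].
 */
static int count_attached_progs(struct bpf_prog_array __rcu **progs)
{
	struct bpf_prog_array *array;
	int cnt;

	rcu_read_lock();
	array = rcu_dereference(*progs);
	cnt = array ? bpf_prog_array_length(array) : 0;
	rcu_read_unlock();

	return cnt;
}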
@@ -2090,6 +2084,15 @@ bool __weak bpf_helper_changes_pkt_data(void *func)
2090 return false; 2084 return false;
2091} 2085}
2092 2086
2087/* Return TRUE if the JIT backend wants verifier to enable sub-register usage
2088 * analysis code and wants explicit zero extension inserted by verifier.
2089 * Otherwise, return FALSE.
2090 */
2091bool __weak bpf_jit_needs_zext(void)
2092{
2093 return false;
2094}
2095
2093/* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call 2096/* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call
2094 * skb_copy_bits(), so provide a weak definition of it for NET-less config. 2097 * skb_copy_bits(), so provide a weak definition of it for NET-less config.
2095 */ 2098 */
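The new __weak bpf_jit_needs_zext() defaults to false; an arch JIT that wants the verifier to insert explicit zero-extension instructions for 32-bit sub-register writes, instead of emitting them itself, overrides it. A minimal sketch of such an override, assumed to live in the arch's bpf_jit_comp.c with the declaration coming from linux/filter.h:

#include <linux/filter.h>

/* Opt this JIT into the verifier's explicit zero-extension insertion
 * for 32-bit sub-register defines.
 */
bool bpf_jit_needs_zext(void)
{
	return true;
}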
@@ -2101,10 +2104,10 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
2101 2104
2102DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key); 2105DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
2103EXPORT_SYMBOL(bpf_stats_enabled_key); 2106EXPORT_SYMBOL(bpf_stats_enabled_key);
2104int sysctl_bpf_stats_enabled __read_mostly;
2105 2107
2106/* All definitions of tracepoints related to BPF. */ 2108/* All definitions of tracepoints related to BPF. */
2107#define CREATE_TRACE_POINTS 2109#define CREATE_TRACE_POINTS
2108#include <linux/bpf_trace.h> 2110#include <linux/bpf_trace.h>
2109 2111
2110EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception); 2112EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception);
2113EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_bulk_tx);
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index cf727d77c6c6..ef49e17ae47c 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -1,7 +1,7 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* bpf/cpumap.c 2/* bpf/cpumap.c
2 * 3 *
3 * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. 4 * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
4 * Released under terms in GPL version 2. See COPYING.
5 */ 5 */
6 6
7/* The 'cpumap' is primarily used as a backend map for XDP BPF helper 7/* The 'cpumap' is primarily used as a backend map for XDP BPF helper
@@ -32,14 +32,19 @@
32 32
33/* General idea: XDP packets getting XDP redirected to another CPU, 33/* General idea: XDP packets getting XDP redirected to another CPU,
34 * will maximum be stored/queued for one driver ->poll() call. It is 34 * will maximum be stored/queued for one driver ->poll() call. It is
35 * guaranteed that setting flush bit and flush operation happen on 35 * guaranteed that queueing the frame and the flush operation happen on
36 * same CPU. Thus, cpu_map_flush operation can deduct via this_cpu_ptr() 36 * same CPU. Thus, cpu_map_flush operation can deduct via this_cpu_ptr()
37 * which queue in bpf_cpu_map_entry contains packets. 37 * which queue in bpf_cpu_map_entry contains packets.
38 */ 38 */
39 39
40#define CPU_MAP_BULK_SIZE 8 /* 8 == one cacheline on 64-bit archs */ 40#define CPU_MAP_BULK_SIZE 8 /* 8 == one cacheline on 64-bit archs */
41struct bpf_cpu_map_entry;
42struct bpf_cpu_map;
43
41struct xdp_bulk_queue { 44struct xdp_bulk_queue {
42 void *q[CPU_MAP_BULK_SIZE]; 45 void *q[CPU_MAP_BULK_SIZE];
46 struct list_head flush_node;
47 struct bpf_cpu_map_entry *obj;
43 unsigned int count; 48 unsigned int count;
44}; 49};
45 50
@@ -52,6 +57,8 @@ struct bpf_cpu_map_entry {
52 /* XDP can run multiple RX-ring queues, need __percpu enqueue store */ 57 /* XDP can run multiple RX-ring queues, need __percpu enqueue store */
53 struct xdp_bulk_queue __percpu *bulkq; 58 struct xdp_bulk_queue __percpu *bulkq;
54 59
60 struct bpf_cpu_map *cmap;
61
55 /* Queue with potential multi-producers, and single-consumer kthread */ 62 /* Queue with potential multi-producers, and single-consumer kthread */
56 struct ptr_ring *queue; 63 struct ptr_ring *queue;
57 struct task_struct *kthread; 64 struct task_struct *kthread;
@@ -65,23 +72,17 @@ struct bpf_cpu_map {
65 struct bpf_map map; 72 struct bpf_map map;
66 /* Below members specific for map type */ 73 /* Below members specific for map type */
67 struct bpf_cpu_map_entry **cpu_map; 74 struct bpf_cpu_map_entry **cpu_map;
68 unsigned long __percpu *flush_needed; 75 struct list_head __percpu *flush_list;
69}; 76};
70 77
71static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, 78static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx);
72 struct xdp_bulk_queue *bq, bool in_napi_ctx);
73
74static u64 cpu_map_bitmap_size(const union bpf_attr *attr)
75{
76 return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long);
77}
78 79
79static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) 80static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
80{ 81{
81 struct bpf_cpu_map *cmap; 82 struct bpf_cpu_map *cmap;
82 int err = -ENOMEM; 83 int err = -ENOMEM;
84 int ret, cpu;
83 u64 cost; 85 u64 cost;
84 int ret;
85 86
86 if (!capable(CAP_SYS_ADMIN)) 87 if (!capable(CAP_SYS_ADMIN))
87 return ERR_PTR(-EPERM); 88 return ERR_PTR(-EPERM);
@@ -105,23 +106,21 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
105 106
106 /* make sure page count doesn't overflow */ 107 /* make sure page count doesn't overflow */
107 cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *); 108 cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *);
108 cost += cpu_map_bitmap_size(attr) * num_possible_cpus(); 109 cost += sizeof(struct list_head) * num_possible_cpus();
109 if (cost >= U32_MAX - PAGE_SIZE)
110 goto free_cmap;
111 cmap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
112 110
113 /* Notice returns -EPERM on if map size is larger than memlock limit */ 111 /* Notice returns -EPERM on if map size is larger than memlock limit */
114 ret = bpf_map_precharge_memlock(cmap->map.pages); 112 ret = bpf_map_charge_init(&cmap->map.memory, cost);
115 if (ret) { 113 if (ret) {
116 err = ret; 114 err = ret;
117 goto free_cmap; 115 goto free_cmap;
118 } 116 }
119 117
120 /* A per cpu bitfield with a bit per possible CPU in map */ 118 cmap->flush_list = alloc_percpu(struct list_head);
121 cmap->flush_needed = __alloc_percpu(cpu_map_bitmap_size(attr), 119 if (!cmap->flush_list)
122 __alignof__(unsigned long)); 120 goto free_charge;
123 if (!cmap->flush_needed) 121
124 goto free_cmap; 122 for_each_possible_cpu(cpu)
123 INIT_LIST_HEAD(per_cpu_ptr(cmap->flush_list, cpu));
125 124
126 /* Alloc array for possible remote "destination" CPUs */ 125 /* Alloc array for possible remote "destination" CPUs */
127 cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries * 126 cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries *
@@ -132,7 +131,9 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
132 131
133 return &cmap->map; 132 return &cmap->map;
134free_percpu: 133free_percpu:
135 free_percpu(cmap->flush_needed); 134 free_percpu(cmap->flush_list);
135free_charge:
136 bpf_map_charge_finish(&cmap->map.memory);
136free_cmap: 137free_cmap:
137 kfree(cmap); 138 kfree(cmap);
138 return ERR_PTR(err); 139 return ERR_PTR(err);
@@ -209,6 +210,9 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
209 * - RX ring dev queue index (skb_record_rx_queue) 210 * - RX ring dev queue index (skb_record_rx_queue)
210 */ 211 */
211 212
213 /* Until page_pool get SKB return path, release DMA here */
214 xdp_release_frame(xdpf);
215
212 /* Allow SKB to reuse area used by xdp_frame */ 216 /* Allow SKB to reuse area used by xdp_frame */
213 xdp_scrub_frame(xdpf); 217 xdp_scrub_frame(xdpf);
214 218
@@ -332,7 +336,8 @@ static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu,
332{ 336{
333 gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; 337 gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
334 struct bpf_cpu_map_entry *rcpu; 338 struct bpf_cpu_map_entry *rcpu;
335 int numa, err; 339 struct xdp_bulk_queue *bq;
340 int numa, err, i;
336 341
337 /* Have map->numa_node, but choose node of redirect target CPU */ 342 /* Have map->numa_node, but choose node of redirect target CPU */
338 numa = cpu_to_node(cpu); 343 numa = cpu_to_node(cpu);
@@ -347,6 +352,11 @@ static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu,
347 if (!rcpu->bulkq) 352 if (!rcpu->bulkq)
348 goto free_rcu; 353 goto free_rcu;
349 354
355 for_each_possible_cpu(i) {
356 bq = per_cpu_ptr(rcpu->bulkq, i);
357 bq->obj = rcpu;
358 }
359
350 /* Alloc queue */ 360 /* Alloc queue */
351 rcpu->queue = kzalloc_node(sizeof(*rcpu->queue), gfp, numa); 361 rcpu->queue = kzalloc_node(sizeof(*rcpu->queue), gfp, numa);
352 if (!rcpu->queue) 362 if (!rcpu->queue)
@@ -403,7 +413,7 @@ static void __cpu_map_entry_free(struct rcu_head *rcu)
403 struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu); 413 struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu);
404 414
405 /* No concurrent bq_enqueue can run at this point */ 415 /* No concurrent bq_enqueue can run at this point */
406 bq_flush_to_queue(rcpu, bq, false); 416 bq_flush_to_queue(bq, false);
407 } 417 }
408 free_percpu(rcpu->bulkq); 418 free_percpu(rcpu->bulkq);
409 /* Cannot kthread_stop() here, last put free rcpu resources */ 419 /* Cannot kthread_stop() here, last put free rcpu resources */
@@ -486,6 +496,7 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
486 rcpu = __cpu_map_entry_alloc(qsize, key_cpu, map->id); 496 rcpu = __cpu_map_entry_alloc(qsize, key_cpu, map->id);
487 if (!rcpu) 497 if (!rcpu)
488 return -ENOMEM; 498 return -ENOMEM;
499 rcpu->cmap = cmap;
489 } 500 }
490 rcu_read_lock(); 501 rcu_read_lock();
491 __cpu_map_entry_replace(cmap, key_cpu, rcpu); 502 __cpu_map_entry_replace(cmap, key_cpu, rcpu);
@@ -512,14 +523,14 @@ static void cpu_map_free(struct bpf_map *map)
512 synchronize_rcu(); 523 synchronize_rcu();
513 524
514 /* To ensure all pending flush operations have completed wait for flush 525 /* To ensure all pending flush operations have completed wait for flush
515 * bitmap to indicate all flush_needed bits to be zero on _all_ cpus. 526 * list be empty on _all_ cpus. Because the above synchronize_rcu()
516 * Because the above synchronize_rcu() ensures the map is disconnected 527 * ensures the map is disconnected from the program we can assume no new
517 * from the program we can assume no new bits will be set. 528 * items will be added to the list.
518 */ 529 */
519 for_each_online_cpu(cpu) { 530 for_each_online_cpu(cpu) {
520 unsigned long *bitmap = per_cpu_ptr(cmap->flush_needed, cpu); 531 struct list_head *flush_list = per_cpu_ptr(cmap->flush_list, cpu);
521 532
522 while (!bitmap_empty(bitmap, cmap->map.max_entries)) 533 while (!list_empty(flush_list))
523 cond_resched(); 534 cond_resched();
524 } 535 }
525 536
@@ -536,7 +547,7 @@ static void cpu_map_free(struct bpf_map *map)
536 /* bq flush and cleanup happens after RCU graze-period */ 547 /* bq flush and cleanup happens after RCU graze-period */
537 __cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */ 548 __cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */
538 } 549 }
539 free_percpu(cmap->flush_needed); 550 free_percpu(cmap->flush_list);
540 bpf_map_area_free(cmap->cpu_map); 551 bpf_map_area_free(cmap->cpu_map);
541 kfree(cmap); 552 kfree(cmap);
542} 553}
@@ -588,9 +599,9 @@ const struct bpf_map_ops cpu_map_ops = {
588 .map_check_btf = map_check_no_btf, 599 .map_check_btf = map_check_no_btf,
589}; 600};
590 601
591static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, 602static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx)
592 struct xdp_bulk_queue *bq, bool in_napi_ctx)
593{ 603{
604 struct bpf_cpu_map_entry *rcpu = bq->obj;
594 unsigned int processed = 0, drops = 0; 605 unsigned int processed = 0, drops = 0;
595 const int to_cpu = rcpu->cpu; 606 const int to_cpu = rcpu->cpu;
596 struct ptr_ring *q; 607 struct ptr_ring *q;
@@ -619,6 +630,8 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
619 bq->count = 0; 630 bq->count = 0;
620 spin_unlock(&q->producer_lock); 631 spin_unlock(&q->producer_lock);
621 632
633 __list_del_clearprev(&bq->flush_node);
634
622 /* Feedback loop via tracepoints */ 635 /* Feedback loop via tracepoints */
623 trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu); 636 trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu);
624 return 0; 637 return 0;
@@ -629,10 +642,11 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
629 */ 642 */
630static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) 643static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
631{ 644{
645 struct list_head *flush_list = this_cpu_ptr(rcpu->cmap->flush_list);
632 struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq); 646 struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);
633 647
634 if (unlikely(bq->count == CPU_MAP_BULK_SIZE)) 648 if (unlikely(bq->count == CPU_MAP_BULK_SIZE))
635 bq_flush_to_queue(rcpu, bq, true); 649 bq_flush_to_queue(bq, true);
636 650
637 /* Notice, xdp_buff/page MUST be queued here, long enough for 651 /* Notice, xdp_buff/page MUST be queued here, long enough for
638 * driver to code invoking us to finished, due to driver 652 * driver to code invoking us to finished, due to driver
@@ -644,6 +658,10 @@ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
644 * operation, when completing napi->poll call. 658 * operation, when completing napi->poll call.
645 */ 659 */
646 bq->q[bq->count++] = xdpf; 660 bq->q[bq->count++] = xdpf;
661
662 if (!bq->flush_node.prev)
663 list_add(&bq->flush_node, flush_list);
664
647 return 0; 665 return 0;
648} 666}
649 667
@@ -663,41 +681,16 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
663 return 0; 681 return 0;
664} 682}
665 683
666void __cpu_map_insert_ctx(struct bpf_map *map, u32 bit)
667{
668 struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
669 unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed);
670
671 __set_bit(bit, bitmap);
672}
673
674void __cpu_map_flush(struct bpf_map *map) 684void __cpu_map_flush(struct bpf_map *map)
675{ 685{
676 struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); 686 struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
677 unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed); 687 struct list_head *flush_list = this_cpu_ptr(cmap->flush_list);
678 u32 bit; 688 struct xdp_bulk_queue *bq, *tmp;
679
680 /* The napi->poll softirq makes sure __cpu_map_insert_ctx()
681 * and __cpu_map_flush() happen on same CPU. Thus, the percpu
682 * bitmap indicate which percpu bulkq have packets.
683 */
684 for_each_set_bit(bit, bitmap, map->max_entries) {
685 struct bpf_cpu_map_entry *rcpu = READ_ONCE(cmap->cpu_map[bit]);
686 struct xdp_bulk_queue *bq;
687
688 /* This is possible if entry is removed by user space
689 * between xdp redirect and flush op.
690 */
691 if (unlikely(!rcpu))
692 continue;
693
694 __clear_bit(bit, bitmap);
695 689
696 /* Flush all frames in bulkq to real queue */ 690 list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
697 bq = this_cpu_ptr(rcpu->bulkq); 691 bq_flush_to_queue(bq, true);
698 bq_flush_to_queue(rcpu, bq, true);
699 692
700 /* If already running, costs spin_lock_irqsave + smb_mb */ 693 /* If already running, costs spin_lock_irqsave + smb_mb */
701 wake_up_process(rcpu->kthread); 694 wake_up_process(bq->obj->kthread);
702 } 695 }
703} 696}
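The net effect of the cpumap changes is that, instead of scanning a per-CPU bitmap sized by max_entries, the flush path walks a per-CPU list containing only the bulk queues that actually hold frames: bq_enqueue() links a queue at most once (the !bq->flush_node.prev test relies on the zeroed percpu allocation), and bq_flush_to_queue() unlinks it with __list_del_clearprev() so it can be re-armed. The devmap hunks that follow apply the same pattern. Boiled down, it looks roughly like this sketch built on kernel list primitives, not the patch itself:

#include <linux/list.h>
#include <linux/percpu.h>

struct bulk_queue {
	struct list_head flush_node;	/* linked while frames are pending */
	unsigned int count;
	void *q[8];
};

/* Each per-CPU head must be INIT_LIST_HEAD()'d at init time; the bulk
 * queues themselves start zeroed, so flush_node.prev == NULL means
 * "not on any flush list yet".
 */
static DEFINE_PER_CPU(struct list_head, flush_list);

static void enqueue(struct bulk_queue *bq, void *frame)
{
	bq->q[bq->count++] = frame;

	if (!bq->flush_node.prev)
		list_add(&bq->flush_node, this_cpu_ptr(&flush_list));
}

static void flush(void)
{
	struct bulk_queue *bq, *tmp;

	list_for_each_entry_safe(bq, tmp, this_cpu_ptr(&flush_list), flush_node) {
		/* hand bq->q[0..count) to its destination here, then reset */
		bq->count = 0;
		__list_del_clearprev(&bq->flush_node);	/* prev back to NULL */
	}
}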
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 191b79948424..d83cf8ccc872 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -1,13 +1,5 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io 2/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 *
7 * This program is distributed in the hope that it will be useful, but
8 * WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
10 * General Public License for more details.
11 */ 3 */
12 4
13/* Devmaps primary use is as a backend map for XDP BPF helper call 5/* Devmaps primary use is as a backend map for XDP BPF helper call
@@ -25,9 +17,8 @@
25 * datapath always has a valid copy. However, the datapath does a "flush" 17 * datapath always has a valid copy. However, the datapath does a "flush"
26 * operation that pushes any pending packets in the driver outside the RCU 18 * operation that pushes any pending packets in the driver outside the RCU
27 * critical section. Each bpf_dtab_netdev tracks these pending operations using 19 * critical section. Each bpf_dtab_netdev tracks these pending operations using
28 * an atomic per-cpu bitmap. The bpf_dtab_netdev object will not be destroyed 20 * a per-cpu flush list. The bpf_dtab_netdev object will not be destroyed until
29 * until all bits are cleared indicating outstanding flush operations have 21 * this list is empty, indicating outstanding flush operations have completed.
30 * completed.
31 * 22 *
32 * BPF syscalls may race with BPF program calls on any of the update, delete 23 * BPF syscalls may race with BPF program calls on any of the update, delete
33 * or lookup operations. As noted above the xchg() operation also keep the 24 * or lookup operations. As noted above the xchg() operation also keep the
@@ -56,9 +47,13 @@
56 (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) 47 (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
57 48
58#define DEV_MAP_BULK_SIZE 16 49#define DEV_MAP_BULK_SIZE 16
50struct bpf_dtab_netdev;
51
59struct xdp_bulk_queue { 52struct xdp_bulk_queue {
60 struct xdp_frame *q[DEV_MAP_BULK_SIZE]; 53 struct xdp_frame *q[DEV_MAP_BULK_SIZE];
54 struct list_head flush_node;
61 struct net_device *dev_rx; 55 struct net_device *dev_rx;
56 struct bpf_dtab_netdev *obj;
62 unsigned int count; 57 unsigned int count;
63}; 58};
64 59
@@ -73,22 +68,17 @@ struct bpf_dtab_netdev {
73struct bpf_dtab { 68struct bpf_dtab {
74 struct bpf_map map; 69 struct bpf_map map;
75 struct bpf_dtab_netdev **netdev_map; 70 struct bpf_dtab_netdev **netdev_map;
76 unsigned long __percpu *flush_needed; 71 struct list_head __percpu *flush_list;
77 struct list_head list; 72 struct list_head list;
78}; 73};
79 74
80static DEFINE_SPINLOCK(dev_map_lock); 75static DEFINE_SPINLOCK(dev_map_lock);
81static LIST_HEAD(dev_map_list); 76static LIST_HEAD(dev_map_list);
82 77
83static u64 dev_map_bitmap_size(const union bpf_attr *attr)
84{
85 return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long);
86}
87
88static struct bpf_map *dev_map_alloc(union bpf_attr *attr) 78static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
89{ 79{
90 struct bpf_dtab *dtab; 80 struct bpf_dtab *dtab;
91 int err = -EINVAL; 81 int err, cpu;
92 u64 cost; 82 u64 cost;
93 83
94 if (!capable(CAP_NET_ADMIN)) 84 if (!capable(CAP_NET_ADMIN))
@@ -99,6 +89,11 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
99 attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK) 89 attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
100 return ERR_PTR(-EINVAL); 90 return ERR_PTR(-EINVAL);
101 91
92 /* Lookup returns a pointer straight to dev->ifindex, so make sure the
93 * verifier prevents writes from the BPF side
94 */
95 attr->map_flags |= BPF_F_RDONLY_PROG;
96
102 dtab = kzalloc(sizeof(*dtab), GFP_USER); 97 dtab = kzalloc(sizeof(*dtab), GFP_USER);
103 if (!dtab) 98 if (!dtab)
104 return ERR_PTR(-ENOMEM); 99 return ERR_PTR(-ENOMEM);
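Forcing BPF_F_RDONLY_PROG here pairs with the lookup support added elsewhere in this series: an XDP program can call bpf_map_lookup_elem() on a devmap and, per the comment above, gets a pointer straight at dev->ifindex, so the verifier must treat it as read-only. A rough sketch of how that reads from the program side, assuming the legacy bpf_map_def map definition style used by the selftests of this era:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct bpf_map_def SEC("maps") tx_port = {
	.type		= BPF_MAP_TYPE_DEVMAP,
	.key_size	= sizeof(__u32),
	.value_size	= sizeof(__u32),
	.max_entries	= 4,
};

SEC("xdp")
int xdp_peek_egress(struct xdp_md *ctx)
{
	__u32 key = 0;
	__u32 *ifindex;

	/* Read-only view of the target netdev's ifindex; a store through
	 * this pointer would be rejected because the map is flagged
	 * BPF_F_RDONLY_PROG above.
	 */
	ifindex = bpf_map_lookup_elem(&tx_port, &key);
	if (!ifindex)
		return XDP_PASS;

	return bpf_redirect_map(&tx_port, key, 0);
}

char _license[] SEC("license") = "GPL";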
@@ -107,39 +102,39 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
107 102
108 /* make sure page count doesn't overflow */ 103 /* make sure page count doesn't overflow */
109 cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); 104 cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
110 cost += dev_map_bitmap_size(attr) * num_possible_cpus(); 105 cost += sizeof(struct list_head) * num_possible_cpus();
111 if (cost >= U32_MAX - PAGE_SIZE)
112 goto free_dtab;
113 106
114 dtab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; 107 /* if map size is larger than memlock limit, reject it */
115 108 err = bpf_map_charge_init(&dtab->map.memory, cost);
116 /* if map size is larger than memlock limit, reject it early */
117 err = bpf_map_precharge_memlock(dtab->map.pages);
118 if (err) 109 if (err)
119 goto free_dtab; 110 goto free_dtab;
120 111
121 err = -ENOMEM; 112 err = -ENOMEM;
122 113
123 /* A per cpu bitfield with a bit per possible net device */ 114 dtab->flush_list = alloc_percpu(struct list_head);
124 dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr), 115 if (!dtab->flush_list)
125 __alignof__(unsigned long), 116 goto free_charge;
126 GFP_KERNEL | __GFP_NOWARN); 117
127 if (!dtab->flush_needed) 118 for_each_possible_cpu(cpu)
128 goto free_dtab; 119 INIT_LIST_HEAD(per_cpu_ptr(dtab->flush_list, cpu));
129 120
130 dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries * 121 dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
131 sizeof(struct bpf_dtab_netdev *), 122 sizeof(struct bpf_dtab_netdev *),
132 dtab->map.numa_node); 123 dtab->map.numa_node);
133 if (!dtab->netdev_map) 124 if (!dtab->netdev_map)
134 goto free_dtab; 125 goto free_percpu;
135 126
136 spin_lock(&dev_map_lock); 127 spin_lock(&dev_map_lock);
137 list_add_tail_rcu(&dtab->list, &dev_map_list); 128 list_add_tail_rcu(&dtab->list, &dev_map_list);
138 spin_unlock(&dev_map_lock); 129 spin_unlock(&dev_map_lock);
139 130
140 return &dtab->map; 131 return &dtab->map;
132
133free_percpu:
134 free_percpu(dtab->flush_list);
135free_charge:
136 bpf_map_charge_finish(&dtab->map.memory);
141free_dtab: 137free_dtab:
142 free_percpu(dtab->flush_needed);
143 kfree(dtab); 138 kfree(dtab);
144 return ERR_PTR(err); 139 return ERR_PTR(err);
145} 140}
@@ -164,15 +159,18 @@ static void dev_map_free(struct bpf_map *map)
164 bpf_clear_redirect_map(map); 159 bpf_clear_redirect_map(map);
165 synchronize_rcu(); 160 synchronize_rcu();
166 161
162 /* Make sure prior __dev_map_entry_free() have completed. */
163 rcu_barrier();
164
167 /* To ensure all pending flush operations have completed wait for flush 165 /* To ensure all pending flush operations have completed wait for flush
168 * bitmap to indicate all flush_needed bits to be zero on _all_ cpus. 166 * list to empty on _all_ cpus.
169 * Because the above synchronize_rcu() ensures the map is disconnected 167 * Because the above synchronize_rcu() ensures the map is disconnected
170 * from the program we can assume no new bits will be set. 168 * from the program we can assume no new items will be added.
171 */ 169 */
172 for_each_online_cpu(cpu) { 170 for_each_online_cpu(cpu) {
173 unsigned long *bitmap = per_cpu_ptr(dtab->flush_needed, cpu); 171 struct list_head *flush_list = per_cpu_ptr(dtab->flush_list, cpu);
174 172
175 while (!bitmap_empty(bitmap, dtab->map.max_entries)) 173 while (!list_empty(flush_list))
176 cond_resched(); 174 cond_resched();
177 } 175 }
178 176
@@ -183,11 +181,12 @@ static void dev_map_free(struct bpf_map *map)
183 if (!dev) 181 if (!dev)
184 continue; 182 continue;
185 183
184 free_percpu(dev->bulkq);
186 dev_put(dev->dev); 185 dev_put(dev->dev);
187 kfree(dev); 186 kfree(dev);
188 } 187 }
189 188
190 free_percpu(dtab->flush_needed); 189 free_percpu(dtab->flush_list);
191 bpf_map_area_free(dtab->netdev_map); 190 bpf_map_area_free(dtab->netdev_map);
192 kfree(dtab); 191 kfree(dtab);
193} 192}
@@ -209,18 +208,10 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
209 return 0; 208 return 0;
210} 209}
211 210
212void __dev_map_insert_ctx(struct bpf_map *map, u32 bit) 211static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags,
213{
214 struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
215 unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed);
216
217 __set_bit(bit, bitmap);
218}
219
220static int bq_xmit_all(struct bpf_dtab_netdev *obj,
221 struct xdp_bulk_queue *bq, u32 flags,
222 bool in_napi_ctx) 212 bool in_napi_ctx)
223{ 213{
214 struct bpf_dtab_netdev *obj = bq->obj;
224 struct net_device *dev = obj->dev; 215 struct net_device *dev = obj->dev;
225 int sent = 0, drops = 0, err = 0; 216 int sent = 0, drops = 0, err = 0;
226 int i; 217 int i;
@@ -247,6 +238,7 @@ out:
247 trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit, 238 trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit,
248 sent, drops, bq->dev_rx, dev, err); 239 sent, drops, bq->dev_rx, dev, err);
249 bq->dev_rx = NULL; 240 bq->dev_rx = NULL;
241 __list_del_clearprev(&bq->flush_node);
250 return 0; 242 return 0;
251error: 243error:
252 /* If ndo_xdp_xmit fails with an errno, no frames have been 244 /* If ndo_xdp_xmit fails with an errno, no frames have been
@@ -269,30 +261,19 @@ error:
269 * from the driver before returning from its napi->poll() routine. The poll() 261 * from the driver before returning from its napi->poll() routine. The poll()
270 * routine is called either from busy_poll context or net_rx_action signaled 262 * routine is called either from busy_poll context or net_rx_action signaled
271 * from NET_RX_SOFTIRQ. Either way the poll routine must complete before the 263 * from NET_RX_SOFTIRQ. Either way the poll routine must complete before the
272 * net device can be torn down. On devmap tear down we ensure the ctx bitmap 264 * net device can be torn down. On devmap tear down we ensure the flush list
273 * is zeroed before completing to ensure all flush operations have completed. 265 * is empty before completing to ensure all flush operations have completed.
274 */ 266 */
275void __dev_map_flush(struct bpf_map *map) 267void __dev_map_flush(struct bpf_map *map)
276{ 268{
277 struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); 269 struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
278 unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed); 270 struct list_head *flush_list = this_cpu_ptr(dtab->flush_list);
279 u32 bit; 271 struct xdp_bulk_queue *bq, *tmp;
280
281 for_each_set_bit(bit, bitmap, map->max_entries) {
282 struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]);
283 struct xdp_bulk_queue *bq;
284
285 /* This is possible if the dev entry is removed by user space
286 * between xdp redirect and flush op.
287 */
288 if (unlikely(!dev))
289 continue;
290
291 __clear_bit(bit, bitmap);
292 272
293 bq = this_cpu_ptr(dev->bulkq); 273 rcu_read_lock();
294 bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, true); 274 list_for_each_entry_safe(bq, tmp, flush_list, flush_node)
295 } 275 bq_xmit_all(bq, XDP_XMIT_FLUSH, true);
276 rcu_read_unlock();
296} 277}
297 278
298/* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or 279/* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or
@@ -318,10 +299,11 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf,
318 struct net_device *dev_rx) 299 struct net_device *dev_rx)
319 300
320{ 301{
302 struct list_head *flush_list = this_cpu_ptr(obj->dtab->flush_list);
321 struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq); 303 struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq);
322 304
323 if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) 305 if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
324 bq_xmit_all(obj, bq, 0, true); 306 bq_xmit_all(bq, 0, true);
325 307
326 /* Ingress dev_rx will be the same for all xdp_frame's in 308 /* Ingress dev_rx will be the same for all xdp_frame's in
327 * bulk_queue, because bq stored per-CPU and must be flushed 309 * bulk_queue, because bq stored per-CPU and must be flushed
@@ -331,6 +313,10 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf,
331 bq->dev_rx = dev_rx; 313 bq->dev_rx = dev_rx;
332 314
333 bq->q[bq->count++] = xdpf; 315 bq->q[bq->count++] = xdpf;
316
317 if (!bq->flush_node.prev)
318 list_add(&bq->flush_node, flush_list);
319
334 return 0; 320 return 0;
335} 321}
336 322
@@ -381,17 +367,14 @@ static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
381{ 367{
382 if (dev->dev->netdev_ops->ndo_xdp_xmit) { 368 if (dev->dev->netdev_ops->ndo_xdp_xmit) {
383 struct xdp_bulk_queue *bq; 369 struct xdp_bulk_queue *bq;
384 unsigned long *bitmap;
385
386 int cpu; 370 int cpu;
387 371
372 rcu_read_lock();
388 for_each_online_cpu(cpu) { 373 for_each_online_cpu(cpu) {
389 bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu);
390 __clear_bit(dev->bit, bitmap);
391
392 bq = per_cpu_ptr(dev->bulkq, cpu); 374 bq = per_cpu_ptr(dev->bulkq, cpu);
393 bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, false); 375 bq_xmit_all(bq, XDP_XMIT_FLUSH, false);
394 } 376 }
377 rcu_read_unlock();
395 } 378 }
396} 379}
397 380
@@ -436,8 +419,10 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
436 struct net *net = current->nsproxy->net_ns; 419 struct net *net = current->nsproxy->net_ns;
437 gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; 420 gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
438 struct bpf_dtab_netdev *dev, *old_dev; 421 struct bpf_dtab_netdev *dev, *old_dev;
439 u32 i = *(u32 *)key;
440 u32 ifindex = *(u32 *)value; 422 u32 ifindex = *(u32 *)value;
423 struct xdp_bulk_queue *bq;
424 u32 i = *(u32 *)key;
425 int cpu;
441 426
442 if (unlikely(map_flags > BPF_EXIST)) 427 if (unlikely(map_flags > BPF_EXIST))
443 return -EINVAL; 428 return -EINVAL;
@@ -460,6 +445,11 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
460 return -ENOMEM; 445 return -ENOMEM;
461 } 446 }
462 447
448 for_each_possible_cpu(cpu) {
449 bq = per_cpu_ptr(dev->bulkq, cpu);
450 bq->obj = dev;
451 }
452
463 dev->dev = dev_get_by_index(net, ifindex); 453 dev->dev = dev_get_by_index(net, ifindex);
464 if (!dev->dev) { 454 if (!dev->dev) {
465 free_percpu(dev->bulkq); 455 free_percpu(dev->bulkq);
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
index d9ce383c0f9c..b44d8c447afd 100644
--- a/kernel/bpf/disasm.c
+++ b/kernel/bpf/disasm.c
@@ -1,14 +1,6 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com 2/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
2 * Copyright (c) 2016 Facebook 3 * Copyright (c) 2016 Facebook
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of version 2 of the GNU General Public
6 * License as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 */ 4 */
13 5
14#include <linux/bpf.h> 6#include <linux/bpf.h>
diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h
index e1324a834a24..e546b18d27da 100644
--- a/kernel/bpf/disasm.h
+++ b/kernel/bpf/disasm.h
@@ -1,14 +1,6 @@
1/* SPDX-License-Identifier: GPL-2.0-only */
1/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com 2/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
2 * Copyright (c) 2016 Facebook 3 * Copyright (c) 2016 Facebook
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of version 2 of the GNU General Public
6 * License as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 */ 4 */
13 5
14#ifndef __BPF_DISASM_H__ 6#ifndef __BPF_DISASM_H__
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 192d32e77db3..22066a62c8c9 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -1,14 +1,6 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com 2/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
2 * Copyright (c) 2016 Facebook 3 * Copyright (c) 2016 Facebook
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of version 2 of the GNU General Public
6 * License as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 */ 4 */
13#include <linux/bpf.h> 5#include <linux/bpf.h>
14#include <linux/btf.h> 6#include <linux/btf.h>
@@ -360,14 +352,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
360 else 352 else
361 cost += (u64) htab->elem_size * num_possible_cpus(); 353 cost += (u64) htab->elem_size * num_possible_cpus();
362 354
363 if (cost >= U32_MAX - PAGE_SIZE) 355 /* if map size is larger than memlock limit, reject it */
364 /* make sure page count doesn't overflow */ 356 err = bpf_map_charge_init(&htab->map.memory, cost);
365 goto free_htab;
366
367 htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
368
369 /* if map size is larger than memlock limit, reject it early */
370 err = bpf_map_precharge_memlock(htab->map.pages);
371 if (err) 357 if (err)
372 goto free_htab; 358 goto free_htab;
373 359
@@ -376,7 +362,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
376 sizeof(struct bucket), 362 sizeof(struct bucket),
377 htab->map.numa_node); 363 htab->map.numa_node);
378 if (!htab->buckets) 364 if (!htab->buckets)
379 goto free_htab; 365 goto free_charge;
380 366
381 if (htab->map.map_flags & BPF_F_ZERO_SEED) 367 if (htab->map.map_flags & BPF_F_ZERO_SEED)
382 htab->hashrnd = 0; 368 htab->hashrnd = 0;
@@ -409,6 +395,8 @@ free_prealloc:
409 prealloc_destroy(htab); 395 prealloc_destroy(htab);
410free_buckets: 396free_buckets:
411 bpf_map_area_free(htab->buckets); 397 bpf_map_area_free(htab->buckets);
398free_charge:
399 bpf_map_charge_finish(&htab->map.memory);
412free_htab: 400free_htab:
413 kfree(htab); 401 kfree(htab);
414 return ERR_PTR(err); 402 return ERR_PTR(err);
@@ -527,18 +515,30 @@ static u32 htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
527 return insn - insn_buf; 515 return insn - insn_buf;
528} 516}
529 517
530static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key) 518static __always_inline void *__htab_lru_map_lookup_elem(struct bpf_map *map,
519 void *key, const bool mark)
531{ 520{
532 struct htab_elem *l = __htab_map_lookup_elem(map, key); 521 struct htab_elem *l = __htab_map_lookup_elem(map, key);
533 522
534 if (l) { 523 if (l) {
535 bpf_lru_node_set_ref(&l->lru_node); 524 if (mark)
525 bpf_lru_node_set_ref(&l->lru_node);
536 return l->key + round_up(map->key_size, 8); 526 return l->key + round_up(map->key_size, 8);
537 } 527 }
538 528
539 return NULL; 529 return NULL;
540} 530}
541 531
532static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
533{
534 return __htab_lru_map_lookup_elem(map, key, true);
535}
536
537static void *htab_lru_map_lookup_elem_sys(struct bpf_map *map, void *key)
538{
539 return __htab_lru_map_lookup_elem(map, key, false);
540}
541
542static u32 htab_lru_map_gen_lookup(struct bpf_map *map, 542static u32 htab_lru_map_gen_lookup(struct bpf_map *map,
543 struct bpf_insn *insn_buf) 543 struct bpf_insn *insn_buf)
544{ 544{
@@ -1250,6 +1250,7 @@ const struct bpf_map_ops htab_lru_map_ops = {
1250 .map_free = htab_map_free, 1250 .map_free = htab_map_free,
1251 .map_get_next_key = htab_map_get_next_key, 1251 .map_get_next_key = htab_map_get_next_key,
1252 .map_lookup_elem = htab_lru_map_lookup_elem, 1252 .map_lookup_elem = htab_lru_map_lookup_elem,
1253 .map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys,
1253 .map_update_elem = htab_lru_map_update_elem, 1254 .map_update_elem = htab_lru_map_update_elem,
1254 .map_delete_elem = htab_lru_map_delete_elem, 1255 .map_delete_elem = htab_lru_map_delete_elem,
1255 .map_gen_lookup = htab_lru_map_gen_lookup, 1256 .map_gen_lookup = htab_lru_map_gen_lookup,
@@ -1281,7 +1282,6 @@ static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key)
1281 1282
1282int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) 1283int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
1283{ 1284{
1284 struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1285 struct htab_elem *l; 1285 struct htab_elem *l;
1286 void __percpu *pptr; 1286 void __percpu *pptr;
1287 int ret = -ENOENT; 1287 int ret = -ENOENT;
@@ -1297,8 +1297,9 @@ int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
1297 l = __htab_map_lookup_elem(map, key); 1297 l = __htab_map_lookup_elem(map, key);
1298 if (!l) 1298 if (!l)
1299 goto out; 1299 goto out;
1300 if (htab_is_lru(htab)) 1300 /* We do not mark LRU map element here in order to not mess up
1301 bpf_lru_node_set_ref(&l->lru_node); 1301 * eviction heuristics when user space does a map walk.
1302 */
1302 pptr = htab_elem_get_ptr(l, map->key_size); 1303 pptr = htab_elem_get_ptr(l, map->key_size);
1303 for_each_possible_cpu(cpu) { 1304 for_each_possible_cpu(cpu) {
1304 bpf_long_memcpy(value + off, 1305 bpf_long_memcpy(value + off,
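The practical effect is on user-space map walks: a bpf_map_lookup_elem() issued through the syscall is routed to the new map_lookup_elem_sys_only callback when a map provides one, so dumping an LRU hash no longer promotes every visited element and no longer shields stale entries from eviction. A sketch of such a walk; it assumes libbpf, an LRU hash map fd obtained elsewhere, and a 4-byte value purely for illustration:

#include <bpf/bpf.h>
#include <linux/types.h>

static void dump_lru_map(int map_fd)
{
	__u32 cur, next, value;
	__u32 *prev = NULL;

	while (bpf_map_get_next_key(map_fd, prev, &next) == 0) {
		/* Syscall-side lookup: no longer sets the LRU reference
		 * bit, so this walk does not skew eviction decisions.
		 */
		if (bpf_map_lookup_elem(map_fd, &next, &value) == 0)
			/* consume key/value here */;

		cur = next;
		prev = &cur;
	}
}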
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 4266ffde07ca..5e28718928ca 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1,13 +1,5 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com 2/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 *
7 * This program is distributed in the hope that it will be useful, but
8 * WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
10 * General Public License for more details.
11 */ 3 */
12#include <linux/bpf.h> 4#include <linux/bpf.h>
13#include <linux/rcupdate.h> 5#include <linux/rcupdate.h>
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index bc53e5b20ddc..cc0d0cf114e3 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * Minimal file system backend for holding eBPF maps and programs, 3 * Minimal file system backend for holding eBPF maps and programs,
3 * used by bpf(2) object pinning. 4 * used by bpf(2) object pinning.
@@ -5,10 +6,6 @@
5 * Authors: 6 * Authors:
6 * 7 *
7 * Daniel Borkmann <daniel@iogearbox.net> 8 * Daniel Borkmann <daniel@iogearbox.net>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * version 2 as published by the Free Software Foundation.
12 */ 9 */
13 10
14#include <linux/init.h> 11#include <linux/init.h>
@@ -518,7 +515,7 @@ out:
518static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type) 515static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type)
519{ 516{
520 struct bpf_prog *prog; 517 struct bpf_prog *prog;
521 int ret = inode_permission(inode, MAY_READ | MAY_WRITE); 518 int ret = inode_permission(inode, MAY_READ);
522 if (ret) 519 if (ret)
523 return ERR_PTR(ret); 520 return ERR_PTR(ret);
524 521
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index 980e8f1f6cb5..addd6fdceec8 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -272,6 +272,8 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
272{ 272{
273 int numa_node = bpf_map_attr_numa_node(attr); 273 int numa_node = bpf_map_attr_numa_node(attr);
274 struct bpf_cgroup_storage_map *map; 274 struct bpf_cgroup_storage_map *map;
275 struct bpf_map_memory mem;
276 int ret;
275 277
276 if (attr->key_size != sizeof(struct bpf_cgroup_storage_key)) 278 if (attr->key_size != sizeof(struct bpf_cgroup_storage_key))
277 return ERR_PTR(-EINVAL); 279 return ERR_PTR(-EINVAL);
@@ -290,13 +292,18 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
290 /* max_entries is not used and enforced to be 0 */ 292 /* max_entries is not used and enforced to be 0 */
291 return ERR_PTR(-EINVAL); 293 return ERR_PTR(-EINVAL);
292 294
295 ret = bpf_map_charge_init(&mem, sizeof(struct bpf_cgroup_storage_map));
296 if (ret < 0)
297 return ERR_PTR(ret);
298
293 map = kmalloc_node(sizeof(struct bpf_cgroup_storage_map), 299 map = kmalloc_node(sizeof(struct bpf_cgroup_storage_map),
294 __GFP_ZERO | GFP_USER, numa_node); 300 __GFP_ZERO | GFP_USER, numa_node);
295 if (!map) 301 if (!map) {
302 bpf_map_charge_finish(&mem);
296 return ERR_PTR(-ENOMEM); 303 return ERR_PTR(-ENOMEM);
304 }
297 305
298 map->map.pages = round_up(sizeof(struct bpf_cgroup_storage_map), 306 bpf_map_charge_move(&map->map.memory, &mem);
299 PAGE_SIZE) >> PAGE_SHIFT;
300 307
301 /* copy mandatory map attributes */ 308 /* copy mandatory map attributes */
302 bpf_map_init_from_attr(&map->map, attr); 309 bpf_map_init_from_attr(&map->map, attr);
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index e61630c2e50b..56e6c75d354d 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -1,12 +1,9 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * Longest prefix match list implementation 3 * Longest prefix match list implementation
3 * 4 *
4 * Copyright (c) 2016,2017 Daniel Mack 5 * Copyright (c) 2016,2017 Daniel Mack
5 * Copyright (c) 2016 David Herrmann 6 * Copyright (c) 2016 David Herrmann
6 *
7 * This file is subject to the terms and conditions of version 2 of the GNU
8 * General Public License. See the file COPYING in the main directory of the
9 * Linux distribution for more details.
10 */ 7 */
11 8
12#include <linux/bpf.h> 9#include <linux/bpf.h>
@@ -573,14 +570,8 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
573 cost_per_node = sizeof(struct lpm_trie_node) + 570 cost_per_node = sizeof(struct lpm_trie_node) +
574 attr->value_size + trie->data_size; 571 attr->value_size + trie->data_size;
575 cost += (u64) attr->max_entries * cost_per_node; 572 cost += (u64) attr->max_entries * cost_per_node;
576 if (cost >= U32_MAX - PAGE_SIZE) {
577 ret = -E2BIG;
578 goto out_err;
579 }
580
581 trie->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
582 573
583 ret = bpf_map_precharge_memlock(trie->map.pages); 574 ret = bpf_map_charge_init(&trie->map.memory, cost);
584 if (ret) 575 if (ret)
585 goto out_err; 576 goto out_err;
586 577
@@ -716,9 +707,14 @@ find_leftmost:
716 * have exact two children, so this function will never return NULL. 707 * have exact two children, so this function will never return NULL.
717 */ 708 */
718 for (node = search_root; node;) { 709 for (node = search_root; node;) {
719 if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) 710 if (node->flags & LPM_TREE_NODE_FLAG_IM) {
711 node = rcu_dereference(node->child[0]);
712 } else {
720 next_node = node; 713 next_node = node;
721 node = rcu_dereference(node->child[0]); 714 node = rcu_dereference(node->child[0]);
715 if (!node)
716 node = rcu_dereference(next_node->child[1]);
717 }
722 } 718 }
723do_copy: 719do_copy:
724 next_key->prefixlen = next_node->prefixlen; 720 next_key->prefixlen = next_node->prefixlen;
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index 3dff41403583..fab4fb134547 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -1,8 +1,5 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* Copyright (c) 2017 Facebook 2/* Copyright (c) 2017 Facebook
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 */ 3 */
7#include <linux/slab.h> 4#include <linux/slab.h>
8#include <linux/bpf.h> 5#include <linux/bpf.h>
diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h
index 6183db9ec08c..a507bf6ef8b9 100644
--- a/kernel/bpf/map_in_map.h
+++ b/kernel/bpf/map_in_map.h
@@ -1,8 +1,5 @@
1/* SPDX-License-Identifier: GPL-2.0-only */
1/* Copyright (c) 2017 Facebook 2/* Copyright (c) 2017 Facebook
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 */ 3 */
7#ifndef __MAP_IN_MAP_H__ 4#ifndef __MAP_IN_MAP_H__
8#define __MAP_IN_MAP_H__ 5#define __MAP_IN_MAP_H__
diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c
index 0c1b4ba9e90e..6e090140b924 100644
--- a/kernel/bpf/percpu_freelist.c
+++ b/kernel/bpf/percpu_freelist.c
@@ -1,8 +1,5 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* Copyright (c) 2016 Facebook 2/* Copyright (c) 2016 Facebook
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 */ 3 */
7#include "percpu_freelist.h" 4#include "percpu_freelist.h"
8 5
diff --git a/kernel/bpf/percpu_freelist.h b/kernel/bpf/percpu_freelist.h
index c3960118e617..fbf8a8a28979 100644
--- a/kernel/bpf/percpu_freelist.h
+++ b/kernel/bpf/percpu_freelist.h
@@ -1,8 +1,5 @@
1/* SPDX-License-Identifier: GPL-2.0-only */
1/* Copyright (c) 2016 Facebook 2/* Copyright (c) 2016 Facebook
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 */ 3 */
7#ifndef __PERCPU_FREELIST_H__ 4#ifndef __PERCPU_FREELIST_H__
8#define __PERCPU_FREELIST_H__ 5#define __PERCPU_FREELIST_H__
diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
index 0b140d236889..f697647ceb54 100644
--- a/kernel/bpf/queue_stack_maps.c
+++ b/kernel/bpf/queue_stack_maps.c
@@ -67,29 +67,28 @@ static int queue_stack_map_alloc_check(union bpf_attr *attr)
67static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr) 67static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr)
68{ 68{
69 int ret, numa_node = bpf_map_attr_numa_node(attr); 69 int ret, numa_node = bpf_map_attr_numa_node(attr);
70 struct bpf_map_memory mem = {0};
70 struct bpf_queue_stack *qs; 71 struct bpf_queue_stack *qs;
71 u64 size, queue_size, cost; 72 u64 size, queue_size, cost;
72 73
73 size = (u64) attr->max_entries + 1; 74 size = (u64) attr->max_entries + 1;
74 cost = queue_size = sizeof(*qs) + size * attr->value_size; 75 cost = queue_size = sizeof(*qs) + size * attr->value_size;
75 if (cost >= U32_MAX - PAGE_SIZE)
76 return ERR_PTR(-E2BIG);
77 76
78 cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; 77 ret = bpf_map_charge_init(&mem, cost);
79
80 ret = bpf_map_precharge_memlock(cost);
81 if (ret < 0) 78 if (ret < 0)
82 return ERR_PTR(ret); 79 return ERR_PTR(ret);
83 80
84 qs = bpf_map_area_alloc(queue_size, numa_node); 81 qs = bpf_map_area_alloc(queue_size, numa_node);
85 if (!qs) 82 if (!qs) {
83 bpf_map_charge_finish(&mem);
86 return ERR_PTR(-ENOMEM); 84 return ERR_PTR(-ENOMEM);
85 }
87 86
88 memset(qs, 0, sizeof(*qs)); 87 memset(qs, 0, sizeof(*qs));
89 88
90 bpf_map_init_from_attr(&qs->map, attr); 89 bpf_map_init_from_attr(&qs->map, attr);
91 90
92 qs->map.pages = cost; 91 bpf_map_charge_move(&qs->map.memory, &mem);
93 qs->size = size; 92 qs->size = size;
94 93
95 raw_spin_lock_init(&qs->lock); 94 raw_spin_lock_init(&qs->lock);
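The map allocators in this diff all converge on the same accounting shape: compute the cost in bytes, charge the memlock rlimit up front with bpf_map_charge_init() (which now does its own page rounding and overflow check, hence the dropped U32_MAX - PAGE_SIZE tests), release the charge with bpf_map_charge_finish() on any later failure, and hand it to the map with bpf_map_charge_move() once allocation succeeds. Condensed into one hypothetical allocator, the pattern is:

#include <linux/bpf.h>
#include <linux/err.h>

struct example_map {
	struct bpf_map map;
	/* map payload follows */
};

static struct bpf_map *example_map_alloc(union bpf_attr *attr)
{
	int numa_node = bpf_map_attr_numa_node(attr);
	struct bpf_map_memory mem = {0};
	struct example_map *m;
	u64 cost;
	int err;

	cost = sizeof(*m) + (u64)attr->max_entries * attr->value_size;

	/* Charge before allocating; oversized or over-limit maps are rejected here. */
	err = bpf_map_charge_init(&mem, cost);
	if (err)
		return ERR_PTR(err);

	m = bpf_map_area_alloc(cost, numa_node);
	if (!m) {
		bpf_map_charge_finish(&mem);	/* give the charge back */
		return ERR_PTR(-ENOMEM);
	}

	bpf_map_init_from_attr(&m->map, attr);
	bpf_map_charge_move(&m->map.memory, &mem);	/* the map owns the charge now */

	return &m->map;
}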
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
index 18e225de80ff..50c083ba978c 100644
--- a/kernel/bpf/reuseport_array.c
+++ b/kernel/bpf/reuseport_array.c
@@ -151,7 +151,8 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr)
 {
 	int err, numa_node = bpf_map_attr_numa_node(attr);
 	struct reuseport_array *array;
-	u64 cost, array_size;
+	struct bpf_map_memory mem;
+	u64 array_size;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return ERR_PTR(-EPERM);
@@ -159,24 +160,20 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr)
 	array_size = sizeof(*array);
 	array_size += (u64)attr->max_entries * sizeof(struct sock *);
 
-	/* make sure there is no u32 overflow later in round_up() */
-	cost = array_size;
-	if (cost >= U32_MAX - PAGE_SIZE)
-		return ERR_PTR(-ENOMEM);
-	cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
-
-	err = bpf_map_precharge_memlock(cost);
+	err = bpf_map_charge_init(&mem, array_size);
 	if (err)
 		return ERR_PTR(err);
 
 	/* allocate all map elements and zero-initialize them */
 	array = bpf_map_area_alloc(array_size, numa_node);
-	if (!array)
+	if (!array) {
+		bpf_map_charge_finish(&mem);
 		return ERR_PTR(-ENOMEM);
+	}
 
 	/* copy mandatory map attributes */
 	bpf_map_init_from_attr(&array->map, attr);
-	array->map.pages = cost;
+	bpf_map_charge_move(&array->map.memory, &mem);
 
 	return &array->map;
 }
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 950ab2f28922..052580c33d26 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -1,8 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /* Copyright (c) 2016 Facebook
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
  */
 #include <linux/bpf.h>
 #include <linux/jhash.h>
@@ -89,6 +86,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 {
 	u32 value_size = attr->value_size;
 	struct bpf_stack_map *smap;
+	struct bpf_map_memory mem;
 	u64 cost, n_buckets;
 	int err;
 
@@ -116,40 +114,37 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 	n_buckets = roundup_pow_of_two(attr->max_entries);
 
 	cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap);
-	if (cost >= U32_MAX - PAGE_SIZE)
-		return ERR_PTR(-E2BIG);
+	cost += n_buckets * (value_size + sizeof(struct stack_map_bucket));
+	err = bpf_map_charge_init(&mem, cost);
+	if (err)
+		return ERR_PTR(err);
 
 	smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr));
-	if (!smap)
+	if (!smap) {
+		bpf_map_charge_finish(&mem);
 		return ERR_PTR(-ENOMEM);
-
-	err = -E2BIG;
-	cost += n_buckets * (value_size + sizeof(struct stack_map_bucket));
-	if (cost >= U32_MAX - PAGE_SIZE)
-		goto free_smap;
+	}
 
 	bpf_map_init_from_attr(&smap->map, attr);
 	smap->map.value_size = value_size;
 	smap->n_buckets = n_buckets;
-	smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
-
-	err = bpf_map_precharge_memlock(smap->map.pages);
-	if (err)
-		goto free_smap;
 
 	err = get_callchain_buffers(sysctl_perf_event_max_stack);
 	if (err)
-		goto free_smap;
+		goto free_charge;
 
 	err = prealloc_elems_and_freelist(smap);
 	if (err)
 		goto put_buffers;
 
+	bpf_map_charge_move(&smap->map.memory, &mem);
+
 	return &smap->map;
 
 put_buffers:
 	put_callchain_buffers();
-free_smap:
+free_charge:
+	bpf_map_charge_finish(&mem);
 	bpf_map_area_free(smap);
 	return ERR_PTR(err);
 }
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index ad3ccf82f31d..5d141f16f6fa 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1,13 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
  */
 #include <linux/bpf.h>
 #include <linux/bpf_trace.h>
@@ -188,19 +180,6 @@ void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr)
 	map->numa_node = bpf_map_attr_numa_node(attr);
 }
 
-int bpf_map_precharge_memlock(u32 pages)
-{
-	struct user_struct *user = get_current_user();
-	unsigned long memlock_limit, cur;
-
-	memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-	cur = atomic_long_read(&user->locked_vm);
-	free_uid(user);
-	if (cur + pages > memlock_limit)
-		return -EPERM;
-	return 0;
-}
-
 static int bpf_charge_memlock(struct user_struct *user, u32 pages)
 {
 	unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
@@ -214,45 +193,62 @@ static int bpf_charge_memlock(struct user_struct *user, u32 pages)
 
 static void bpf_uncharge_memlock(struct user_struct *user, u32 pages)
 {
-	atomic_long_sub(pages, &user->locked_vm);
+	if (user)
+		atomic_long_sub(pages, &user->locked_vm);
 }
 
-static int bpf_map_init_memlock(struct bpf_map *map)
+int bpf_map_charge_init(struct bpf_map_memory *mem, size_t size)
 {
-	struct user_struct *user = get_current_user();
+	u32 pages = round_up(size, PAGE_SIZE) >> PAGE_SHIFT;
+	struct user_struct *user;
 	int ret;
 
-	ret = bpf_charge_memlock(user, map->pages);
+	if (size >= U32_MAX - PAGE_SIZE)
+		return -E2BIG;
+
+	user = get_current_user();
+	ret = bpf_charge_memlock(user, pages);
 	if (ret) {
 		free_uid(user);
 		return ret;
 	}
-	map->user = user;
-	return ret;
+
+	mem->pages = pages;
+	mem->user = user;
+
+	return 0;
 }
 
-static void bpf_map_release_memlock(struct bpf_map *map)
+void bpf_map_charge_finish(struct bpf_map_memory *mem)
 {
-	struct user_struct *user = map->user;
-	bpf_uncharge_memlock(user, map->pages);
-	free_uid(user);
+	bpf_uncharge_memlock(mem->user, mem->pages);
+	free_uid(mem->user);
+}
+
+void bpf_map_charge_move(struct bpf_map_memory *dst,
+			 struct bpf_map_memory *src)
+{
+	*dst = *src;
+
+	/* Make sure src will not be used for the redundant uncharging. */
+	memset(src, 0, sizeof(struct bpf_map_memory));
 }
 
 int bpf_map_charge_memlock(struct bpf_map *map, u32 pages)
 {
 	int ret;
 
-	ret = bpf_charge_memlock(map->user, pages);
+	ret = bpf_charge_memlock(map->memory.user, pages);
 	if (ret)
 		return ret;
-	map->pages += pages;
+	map->memory.pages += pages;
 	return ret;
 }
 
 void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages)
 {
-	bpf_uncharge_memlock(map->user, pages);
-	map->pages -= pages;
+	bpf_uncharge_memlock(map->memory.user, pages);
+	map->memory.pages -= pages;
 }
 
 static int bpf_map_alloc_id(struct bpf_map *map)
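All of the converted map allocators earlier in this diff follow the same shape with this new API: charge the full size up front, allocate, hand the charge to the map on success, and drop it on every error path. A minimal sketch of that pattern for a hypothetical map type (struct my_map and my_map_alloc are illustrative only, not part of this series; this is kernel-internal code that would only build in-tree against <linux/bpf.h>):

struct my_map {
	struct bpf_map map;	/* embedded generic map, must come first */
	/* ...implementation-specific state... */
};

static struct bpf_map *my_map_alloc(union bpf_attr *attr)
{
	u64 cost = sizeof(struct my_map) +
		   (u64)attr->max_entries * attr->value_size;
	struct bpf_map_memory mem;
	struct my_map *m;
	int err;

	/* charge against RLIMIT_MEMLOCK before any allocation happens */
	err = bpf_map_charge_init(&mem, cost);
	if (err)
		return ERR_PTR(err);

	m = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr));
	if (!m) {
		/* nothing owns the charge yet, so release it here */
		bpf_map_charge_finish(&mem);
		return ERR_PTR(-ENOMEM);
	}

	bpf_map_init_from_attr(&m->map, attr);
	/* transfer the charge; bpf_map_free_deferred() releases it later */
	bpf_map_charge_move(&m->map.memory, &mem);
	return &m->map;
}
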
@@ -303,11 +299,13 @@ void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
 static void bpf_map_free_deferred(struct work_struct *work)
 {
 	struct bpf_map *map = container_of(work, struct bpf_map, work);
+	struct bpf_map_memory mem;
 
-	bpf_map_release_memlock(map);
+	bpf_map_charge_move(&mem, &map->memory);
 	security_bpf_map_free(map);
 	/* implementation dependent freeing */
 	map->ops->map_free(map);
+	bpf_map_charge_finish(&mem);
 }
 
 static void bpf_map_put_uref(struct bpf_map *map)
@@ -395,7 +393,7 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
 		   map->value_size,
 		   map->max_entries,
 		   map->map_flags,
-		   map->pages * 1ULL << PAGE_SHIFT,
+		   map->memory.pages * 1ULL << PAGE_SHIFT,
 		   map->id,
 		   READ_ONCE(map->frozen));
 
@@ -549,6 +547,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
 static int map_create(union bpf_attr *attr)
 {
 	int numa_node = bpf_map_attr_numa_node(attr);
+	struct bpf_map_memory mem;
 	struct bpf_map *map;
 	int f_flags;
 	int err;
@@ -573,7 +572,7 @@ static int map_create(union bpf_attr *attr)
 
 	err = bpf_obj_name_cpy(map->name, attr->map_name);
 	if (err)
-		goto free_map_nouncharge;
+		goto free_map;
 
 	atomic_set(&map->refcnt, 1);
 	atomic_set(&map->usercnt, 1);
@@ -583,20 +582,20 @@ static int map_create(union bpf_attr *attr)
 
 	if (!attr->btf_value_type_id) {
 		err = -EINVAL;
-		goto free_map_nouncharge;
+		goto free_map;
 	}
 
 	btf = btf_get_by_fd(attr->btf_fd);
 	if (IS_ERR(btf)) {
 		err = PTR_ERR(btf);
-		goto free_map_nouncharge;
+		goto free_map;
 	}
 
 	err = map_check_btf(map, btf, attr->btf_key_type_id,
 			    attr->btf_value_type_id);
 	if (err) {
 		btf_put(btf);
-		goto free_map_nouncharge;
+		goto free_map;
 	}
 
 	map->btf = btf;
@@ -608,15 +607,11 @@ static int map_create(union bpf_attr *attr)
 
 	err = security_bpf_map_alloc(map);
 	if (err)
-		goto free_map_nouncharge;
-
-	err = bpf_map_init_memlock(map);
-	if (err)
-		goto free_map_sec;
+		goto free_map;
 
 	err = bpf_map_alloc_id(map);
 	if (err)
-		goto free_map;
+		goto free_map_sec;
 
 	err = bpf_map_new_fd(map, f_flags);
 	if (err < 0) {
@@ -632,13 +627,13 @@ static int map_create(union bpf_attr *attr)
 
 	return err;
 
-free_map:
-	bpf_map_release_memlock(map);
 free_map_sec:
 	security_bpf_map_free(map);
-free_map_nouncharge:
+free_map:
 	btf_put(map->btf);
+	bpf_map_charge_move(&mem, &map->memory);
 	map->ops->map_free(map);
+	bpf_map_charge_finish(&mem);
 	return err;
 }
 
@@ -808,7 +803,10 @@ static int map_lookup_elem(union bpf_attr *attr)
 		err = map->ops->map_peek_elem(map, value);
 	} else {
 		rcu_read_lock();
-		ptr = map->ops->map_lookup_elem(map, key);
+		if (map->ops->map_lookup_elem_sys_only)
+			ptr = map->ops->map_lookup_elem_sys_only(map, key);
+		else
+			ptr = map->ops->map_lookup_elem(map, key);
 		if (IS_ERR(ptr)) {
 			err = PTR_ERR(ptr);
 		} else if (!ptr) {
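The map_lookup_elem_sys_only() hook above lets a map type give the BPF_MAP_LOOKUP_ELEM syscall its own read path, e.g. so an LRU-style map does not promote an element just because userspace peeked at it. A sketch of how a map implementation might wire it up (the my_map_* names are hypothetical; only the ops plumbing is shown):

static void *my_map_lookup_elem(struct bpf_map *map, void *key)
{
	/* lookup used by programs; may touch eviction/usage state */
	return NULL;	/* placeholder */
}

static void *my_map_lookup_elem_sys_only(struct bpf_map *map, void *key)
{
	/* lookup used by the syscall path; strictly side-effect free */
	return NULL;	/* placeholder */
}

const struct bpf_map_ops my_map_ops = {
	.map_lookup_elem	  = my_map_lookup_elem,
	.map_lookup_elem_sys_only = my_map_lookup_elem_sys_only,
	/* ...remaining callbacks... */
};
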
@@ -1578,6 +1576,24 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type,
 		case BPF_CGROUP_INET6_CONNECT:
 		case BPF_CGROUP_UDP4_SENDMSG:
 		case BPF_CGROUP_UDP6_SENDMSG:
+		case BPF_CGROUP_UDP4_RECVMSG:
+		case BPF_CGROUP_UDP6_RECVMSG:
+			return 0;
+		default:
+			return -EINVAL;
+		}
+	case BPF_PROG_TYPE_CGROUP_SKB:
+		switch (expected_attach_type) {
+		case BPF_CGROUP_INET_INGRESS:
+		case BPF_CGROUP_INET_EGRESS:
+			return 0;
+		default:
+			return -EINVAL;
+		}
+	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
+		switch (expected_attach_type) {
+		case BPF_CGROUP_SETSOCKOPT:
+		case BPF_CGROUP_GETSOCKOPT:
 			return 0;
 		default:
 			return -EINVAL;
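For the new sock_addr hooks this means userspace has to load the program with a matching expected_attach_type, otherwise the check above returns -EINVAL. A hedged sketch using the raw bpf(2) syscall (a trivial "return 1" program, no error handling; assumes a uapi linux/bpf.h new enough to define BPF_CGROUP_UDP4_RECVMSG):

#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int load_recvmsg4_prog(void)
{
	struct bpf_insn insns[] = {
		{ .code = BPF_ALU64 | BPF_MOV | BPF_K,
		  .dst_reg = BPF_REG_0, .imm = 1 },	/* r0 = 1 */
		{ .code = BPF_JMP | BPF_EXIT },		/* exit    */
	};
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
	attr.expected_attach_type = BPF_CGROUP_UDP4_RECVMSG;
	attr.insns = (__u64)(unsigned long)insns;
	attr.insn_cnt = sizeof(insns) / sizeof(insns[0]);
	attr.license = (__u64)(unsigned long)"GPL";

	return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
}
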
@@ -1601,7 +1617,9 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
 	if (CHECK_ATTR(BPF_PROG_LOAD))
 		return -EINVAL;
 
-	if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT | BPF_F_ANY_ALIGNMENT))
+	if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT |
+				 BPF_F_ANY_ALIGNMENT |
+				 BPF_F_TEST_RND_HI32))
 		return -EINVAL;
 
 	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
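BPF_F_TEST_RND_HI32 is a pure testing flag that pairs with the verifier's new 32-bit zero-extension tracking: when set, the upper 32 bits of sub-register definitions get randomized, so selftests can catch code that wrongly relies on them being zero. Continuing the load sketch above, it only needs one extra line before the syscall (assumes the flag exists in the installed uapi header):

	attr.prog_flags |= BPF_F_TEST_RND_HI32;
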
@@ -1671,7 +1689,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
 	if (err < 0)
 		goto free_prog;
 
-	prog->aux->load_time = ktime_get_boot_ns();
+	prog->aux->load_time = ktime_get_boottime_ns();
 	err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name);
 	if (err)
 		goto free_prog;
@@ -1830,7 +1848,12 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
 	switch (prog->type) {
 	case BPF_PROG_TYPE_CGROUP_SOCK:
 	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
+	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
 		return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
+	case BPF_PROG_TYPE_CGROUP_SKB:
+		return prog->enforce_expected_attach_type &&
+		       prog->expected_attach_type != attach_type ?
+		       -EINVAL : 0;
 	default:
 		return 0;
 	}
@@ -1872,6 +1895,8 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 	case BPF_CGROUP_INET6_CONNECT:
 	case BPF_CGROUP_UDP4_SENDMSG:
 	case BPF_CGROUP_UDP6_SENDMSG:
+	case BPF_CGROUP_UDP4_RECVMSG:
+	case BPF_CGROUP_UDP6_RECVMSG:
 		ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
 		break;
 	case BPF_CGROUP_SOCK_OPS:
@@ -1896,6 +1921,10 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 	case BPF_CGROUP_SYSCTL:
 		ptype = BPF_PROG_TYPE_CGROUP_SYSCTL;
 		break;
+	case BPF_CGROUP_GETSOCKOPT:
+	case BPF_CGROUP_SETSOCKOPT:
+		ptype = BPF_PROG_TYPE_CGROUP_SOCKOPT;
+		break;
 	default:
 		return -EINVAL;
 	}
@@ -1957,6 +1986,8 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 	case BPF_CGROUP_INET6_CONNECT:
 	case BPF_CGROUP_UDP4_SENDMSG:
 	case BPF_CGROUP_UDP6_SENDMSG:
+	case BPF_CGROUP_UDP4_RECVMSG:
+	case BPF_CGROUP_UDP6_RECVMSG:
 		ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
 		break;
 	case BPF_CGROUP_SOCK_OPS:
@@ -1977,6 +2008,10 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 	case BPF_CGROUP_SYSCTL:
 		ptype = BPF_PROG_TYPE_CGROUP_SYSCTL;
 		break;
+	case BPF_CGROUP_GETSOCKOPT:
+	case BPF_CGROUP_SETSOCKOPT:
+		ptype = BPF_PROG_TYPE_CGROUP_SOCKOPT;
+		break;
 	default:
 		return -EINVAL;
 	}
@@ -2008,9 +2043,13 @@ static int bpf_prog_query(const union bpf_attr *attr,
 	case BPF_CGROUP_INET6_CONNECT:
 	case BPF_CGROUP_UDP4_SENDMSG:
 	case BPF_CGROUP_UDP6_SENDMSG:
+	case BPF_CGROUP_UDP4_RECVMSG:
+	case BPF_CGROUP_UDP6_RECVMSG:
 	case BPF_CGROUP_SOCK_OPS:
 	case BPF_CGROUP_DEVICE:
 	case BPF_CGROUP_SYSCTL:
+	case BPF_CGROUP_GETSOCKOPT:
+	case BPF_CGROUP_SETSOCKOPT:
 		break;
 	case BPF_LIRC_MODE2:
 		return lirc_prog_query(attr, uattr);
diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c
index 938d41211be7..ca52b9642943 100644
--- a/kernel/bpf/tnum.c
+++ b/kernel/bpf/tnum.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /* tnum: tracked (or tristate) numbers
  *
  * A tnum tracks knowledge about the bits of a value. Each bit can be either
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 95f9354495ad..a2e763703c30 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1,15 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
  * Copyright (c) 2016 Facebook
  * Copyright (c) 2018 Covalent IO, Inc. http://covalent.io
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
  */
 #include <uapi/linux/btf.h>
 #include <linux/kernel.h>
@@ -176,7 +168,7 @@ struct bpf_verifier_stack_elem {
 	struct bpf_verifier_stack_elem *next;
 };
 
-#define BPF_COMPLEXITY_LIMIT_STACK	1024
+#define BPF_COMPLEXITY_LIMIT_JMP_SEQ	8192
 #define BPF_COMPLEXITY_LIMIT_STATES	64
 
 #define BPF_MAP_PTR_UNPRIV	1UL
@@ -334,7 +326,8 @@ static bool type_is_sk_pointer(enum bpf_reg_type type)
 {
 	return type == PTR_TO_SOCKET ||
 		type == PTR_TO_SOCK_COMMON ||
-		type == PTR_TO_TCP_SOCK;
+		type == PTR_TO_TCP_SOCK ||
+		type == PTR_TO_XDP_SOCK;
 }
 
 static bool reg_type_may_be_null(enum bpf_reg_type type)
@@ -406,6 +399,7 @@ static const char * const reg_type_str[] = {
 	[PTR_TO_TCP_SOCK]	= "tcp_sock",
 	[PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null",
 	[PTR_TO_TP_BUFFER]	= "tp_buffer",
+	[PTR_TO_XDP_SOCK]	= "xdp_sock",
 };
 
 static char slot_type_char[] = {
@@ -453,12 +447,12 @@ static void print_verifier_state(struct bpf_verifier_env *env,
 		verbose(env, " R%d", i);
 		print_liveness(env, reg->live);
 		verbose(env, "=%s", reg_type_str[t]);
+		if (t == SCALAR_VALUE && reg->precise)
+			verbose(env, "P");
 		if ((t == SCALAR_VALUE || t == PTR_TO_STACK) &&
 		    tnum_is_const(reg->var_off)) {
 			/* reg->off should be 0 for SCALAR_VALUE */
 			verbose(env, "%lld", reg->var_off.value + reg->off);
-			if (t == PTR_TO_STACK)
-				verbose(env, ",call_%d", func(env, reg)->callsite);
 		} else {
 			verbose(env, "(id=%d", reg->id);
 			if (reg_type_may_be_refcounted_or_null(t))
@@ -520,11 +514,17 @@ static void print_verifier_state(struct bpf_verifier_env *env,
 			continue;
 		verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
 		print_liveness(env, state->stack[i].spilled_ptr.live);
-		if (state->stack[i].slot_type[0] == STACK_SPILL)
-			verbose(env, "=%s",
-				reg_type_str[state->stack[i].spilled_ptr.type]);
-		else
+		if (state->stack[i].slot_type[0] == STACK_SPILL) {
+			reg = &state->stack[i].spilled_ptr;
+			t = reg->type;
+			verbose(env, "=%s", reg_type_str[t]);
+			if (t == SCALAR_VALUE && reg->precise)
+				verbose(env, "P");
+			if (t == SCALAR_VALUE && tnum_is_const(reg->var_off))
+				verbose(env, "%lld", reg->var_off.value + reg->off);
+		} else {
 			verbose(env, "=%s", types_buf);
+		}
 	}
 	if (state->acquired_refs && state->refs[0].id) {
 		verbose(env, " refs=%d", state->refs[0].id);
@@ -673,6 +673,13 @@ static void free_func_state(struct bpf_func_state *state)
 	kfree(state);
 }
 
+static void clear_jmp_history(struct bpf_verifier_state *state)
+{
+	kfree(state->jmp_history);
+	state->jmp_history = NULL;
+	state->jmp_history_cnt = 0;
+}
+
 static void free_verifier_state(struct bpf_verifier_state *state,
 				bool free_self)
 {
@@ -682,6 +689,7 @@ static void free_verifier_state(struct bpf_verifier_state *state,
 		free_func_state(state->frame[i]);
 		state->frame[i] = NULL;
 	}
+	clear_jmp_history(state);
 	if (free_self)
 		kfree(state);
 }
@@ -709,8 +717,18 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
 			      const struct bpf_verifier_state *src)
 {
 	struct bpf_func_state *dst;
+	u32 jmp_sz = sizeof(struct bpf_idx_pair) * src->jmp_history_cnt;
 	int i, err;
 
+	if (dst_state->jmp_history_cnt < src->jmp_history_cnt) {
+		kfree(dst_state->jmp_history);
+		dst_state->jmp_history = kmalloc(jmp_sz, GFP_USER);
+		if (!dst_state->jmp_history)
+			return -ENOMEM;
+	}
+	memcpy(dst_state->jmp_history, src->jmp_history, jmp_sz);
+	dst_state->jmp_history_cnt = src->jmp_history_cnt;
+
 	/* if dst has more stack frames than src frame, free them */
 	for (i = src->curframe + 1; i <= dst_state->curframe; i++) {
 		free_func_state(dst_state->frame[i]);
@@ -719,6 +737,10 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
 	dst_state->speculative = src->speculative;
 	dst_state->curframe = src->curframe;
 	dst_state->active_spin_lock = src->active_spin_lock;
+	dst_state->branches = src->branches;
+	dst_state->parent = src->parent;
+	dst_state->first_insn_idx = src->first_insn_idx;
+	dst_state->last_insn_idx = src->last_insn_idx;
 	for (i = 0; i <= src->curframe; i++) {
 		dst = dst_state->frame[i];
 		if (!dst) {
@@ -734,6 +756,23 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
 	return 0;
 }
 
+static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
+{
+	while (st) {
+		u32 br = --st->branches;
+
+		/* WARN_ON(br > 1) technically makes sense here,
+		 * but see comment in push_stack(), hence:
+		 */
+		WARN_ONCE((int)br < 0,
+			  "BUG update_branch_counts:branches_to_explore=%d\n",
+			  br);
+		if (br)
+			break;
+		st = st->parent;
+	}
+}
+
 static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx,
 		     int *insn_idx)
 {
@@ -782,10 +821,23 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
 	if (err)
 		goto err;
 	elem->st.speculative |= speculative;
-	if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) {
-		verbose(env, "BPF program is too complex\n");
+	if (env->stack_size > BPF_COMPLEXITY_LIMIT_JMP_SEQ) {
+		verbose(env, "The sequence of %d jumps is too complex.\n",
+			env->stack_size);
 		goto err;
 	}
+	if (elem->st.parent) {
+		++elem->st.parent->branches;
+		/* WARN_ON(branches > 2) technically makes sense here,
+		 * but
+		 * 1. speculative states will bump 'branches' for non-branch
+		 *    instructions
+		 * 2. is_state_visited() heuristics may decide not to create
+		 *    a new state for a sequence of branches and all such current
+		 *    and cloned states will be pointing to a single parent state
+		 *    which might have large 'branches' count.
+		 */
+	}
 	return &elem->st;
 err:
 	free_verifier_state(env->cur_state, true);
@@ -933,6 +985,9 @@ static void __mark_reg_unbounded(struct bpf_reg_state *reg)
 	reg->smax_value = S64_MAX;
 	reg->umin_value = 0;
 	reg->umax_value = U64_MAX;
+
+	/* constant backtracking is enabled for root only for now */
+	reg->precise = capable(CAP_SYS_ADMIN) ? false : true;
 }
 
 /* Mark a register as having a completely unknown (scalar) value. */
@@ -981,6 +1036,7 @@ static void mark_reg_not_init(struct bpf_verifier_env *env,
 	__mark_reg_not_init(regs + regno);
 }
 
+#define DEF_NOT_SUBREG	(0)
 static void init_reg_state(struct bpf_verifier_env *env,
 			   struct bpf_func_state *state)
 {
@@ -991,6 +1047,7 @@ static void init_reg_state(struct bpf_verifier_env *env,
 		mark_reg_not_init(env, regs, i);
 		regs[i].live = REG_LIVE_NONE;
 		regs[i].parent = NULL;
+		regs[i].subreg_def = DEF_NOT_SUBREG;
 	}
 
 	/* frame pointer */
@@ -1136,7 +1193,7 @@ next:
  */
 static int mark_reg_read(struct bpf_verifier_env *env,
 			 const struct bpf_reg_state *state,
-			 struct bpf_reg_state *parent)
+			 struct bpf_reg_state *parent, u8 flag)
 {
 	bool writes = parent == state->parent; /* Observe write marks */
 	int cnt = 0;
@@ -1151,17 +1208,26 @@ static int mark_reg_read(struct bpf_verifier_env *env,
 				parent->var_off.value, parent->off);
 			return -EFAULT;
 		}
-		if (parent->live & REG_LIVE_READ)
+		/* The first condition is more likely to be true than the
+		 * second, check it first.
+		 */
+		if ((parent->live & REG_LIVE_READ) == flag ||
+		    parent->live & REG_LIVE_READ64)
 			/* The parentage chain never changes and
 			 * this parent was already marked as LIVE_READ.
 			 * There is no need to keep walking the chain again and
 			 * keep re-marking all parents as LIVE_READ.
 			 * This case happens when the same register is read
 			 * multiple times without writes into it in-between.
+			 * Also, if parent has the stronger REG_LIVE_READ64 set,
+			 * then no need to set the weak REG_LIVE_READ32.
 			 */
 			break;
 		/* ... then we depend on parent's value */
-		parent->live |= REG_LIVE_READ;
+		parent->live |= flag;
+		/* REG_LIVE_READ64 overrides REG_LIVE_READ32. */
+		if (flag == REG_LIVE_READ64)
+			parent->live &= ~REG_LIVE_READ32;
 		state = parent;
 		parent = state->parent;
 		writes = true;
@@ -1173,12 +1239,129 @@ static int mark_reg_read(struct bpf_verifier_env *env,
1173 return 0; 1239 return 0;
1174} 1240}
1175 1241
1242/* This function is supposed to be used by the following 32-bit optimization
1243 * code only. It returns TRUE if the source or destination register operates
1244 * on 64-bit, otherwise return FALSE.
1245 */
1246static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn,
1247 u32 regno, struct bpf_reg_state *reg, enum reg_arg_type t)
1248{
1249 u8 code, class, op;
1250
1251 code = insn->code;
1252 class = BPF_CLASS(code);
1253 op = BPF_OP(code);
1254 if (class == BPF_JMP) {
1255 /* BPF_EXIT for "main" will reach here. Return TRUE
1256 * conservatively.
1257 */
1258 if (op == BPF_EXIT)
1259 return true;
1260 if (op == BPF_CALL) {
1261 /* BPF to BPF call will reach here because of marking
1262 * caller saved clobber with DST_OP_NO_MARK for which we
1263 * don't care about the register def because they are anyway
1264 * marked as NOT_INIT already.
1265 */
1266 if (insn->src_reg == BPF_PSEUDO_CALL)
1267 return false;
1268 /* Helper call will reach here because of arg type
1269 * check, conservatively return TRUE.
1270 */
1271 if (t == SRC_OP)
1272 return true;
1273
1274 return false;
1275 }
1276 }
1277
1278 if (class == BPF_ALU64 || class == BPF_JMP ||
1279 /* BPF_END always use BPF_ALU class. */
1280 (class == BPF_ALU && op == BPF_END && insn->imm == 64))
1281 return true;
1282
1283 if (class == BPF_ALU || class == BPF_JMP32)
1284 return false;
1285
1286 if (class == BPF_LDX) {
1287 if (t != SRC_OP)
1288 return BPF_SIZE(code) == BPF_DW;
1289 /* LDX source must be ptr. */
1290 return true;
1291 }
1292
1293 if (class == BPF_STX) {
1294 if (reg->type != SCALAR_VALUE)
1295 return true;
1296 return BPF_SIZE(code) == BPF_DW;
1297 }
1298
1299 if (class == BPF_LD) {
1300 u8 mode = BPF_MODE(code);
1301
1302 /* LD_IMM64 */
1303 if (mode == BPF_IMM)
1304 return true;
1305
1306 /* Both LD_IND and LD_ABS return 32-bit data. */
1307 if (t != SRC_OP)
1308 return false;
1309
1310 /* Implicit ctx ptr. */
1311 if (regno == BPF_REG_6)
1312 return true;
1313
1314 /* Explicit source could be any width. */
1315 return true;
1316 }
1317
1318 if (class == BPF_ST)
1319 /* The only source register for BPF_ST is a ptr. */
1320 return true;
1321
1322 /* Conservatively return true at default. */
1323 return true;
1324}
1325
1326/* Return TRUE if INSN doesn't have explicit value define. */
1327static bool insn_no_def(struct bpf_insn *insn)
1328{
1329 u8 class = BPF_CLASS(insn->code);
1330
1331 return (class == BPF_JMP || class == BPF_JMP32 ||
1332 class == BPF_STX || class == BPF_ST);
1333}
1334
1335/* Return TRUE if INSN has defined any 32-bit value explicitly. */
1336static bool insn_has_def32(struct bpf_verifier_env *env, struct bpf_insn *insn)
1337{
1338 if (insn_no_def(insn))
1339 return false;
1340
1341 return !is_reg64(env, insn, insn->dst_reg, NULL, DST_OP);
1342}
1343
1344static void mark_insn_zext(struct bpf_verifier_env *env,
1345 struct bpf_reg_state *reg)
1346{
1347 s32 def_idx = reg->subreg_def;
1348
1349 if (def_idx == DEF_NOT_SUBREG)
1350 return;
1351
1352 env->insn_aux_data[def_idx - 1].zext_dst = true;
1353 /* The dst will be zero extended, so won't be sub-register anymore. */
1354 reg->subreg_def = DEF_NOT_SUBREG;
1355}
1356
1176static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, 1357static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
1177 enum reg_arg_type t) 1358 enum reg_arg_type t)
1178{ 1359{
1179 struct bpf_verifier_state *vstate = env->cur_state; 1360 struct bpf_verifier_state *vstate = env->cur_state;
1180 struct bpf_func_state *state = vstate->frame[vstate->curframe]; 1361 struct bpf_func_state *state = vstate->frame[vstate->curframe];
1362 struct bpf_insn *insn = env->prog->insnsi + env->insn_idx;
1181 struct bpf_reg_state *reg, *regs = state->regs; 1363 struct bpf_reg_state *reg, *regs = state->regs;
1364 bool rw64;
1182 1365
1183 if (regno >= MAX_BPF_REG) { 1366 if (regno >= MAX_BPF_REG) {
1184 verbose(env, "R%d is invalid\n", regno); 1367 verbose(env, "R%d is invalid\n", regno);
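The helpers added above exist so the verifier can tell 32-bit definitions apart from 64-bit ones; the read marks (REG_LIVE_READ32 vs. REG_LIVE_READ64) then decide whether a definition really needs its upper half zero-extended. A small illustration using the kernel's insn macros from include/linux/filter.h (the comments describe how these instructions are classified; this is an illustration, not a test case from this series):

static const struct bpf_insn zext_example[] = {
	BPF_MOV32_IMM(BPF_REG_1, 5),		/* 32-bit def: r1's subreg_def is recorded   */
	BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 1),	/* 64-bit read of r1: REG_LIVE_READ64, so    */
						/* zext_dst is set on the defining insn and  */
						/* an explicit zero-extension is inserted if */
						/* the JIT does not provide one for free     */
	BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
	BPF_EXIT_INSN(),
};
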
@@ -1186,6 +1369,7 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
1186 } 1369 }
1187 1370
1188 reg = &regs[regno]; 1371 reg = &regs[regno];
1372 rw64 = is_reg64(env, insn, regno, reg, t);
1189 if (t == SRC_OP) { 1373 if (t == SRC_OP) {
1190 /* check whether register used as source operand can be read */ 1374 /* check whether register used as source operand can be read */
1191 if (reg->type == NOT_INIT) { 1375 if (reg->type == NOT_INIT) {
@@ -1196,7 +1380,11 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
1196 if (regno == BPF_REG_FP) 1380 if (regno == BPF_REG_FP)
1197 return 0; 1381 return 0;
1198 1382
1199 return mark_reg_read(env, reg, reg->parent); 1383 if (rw64)
1384 mark_insn_zext(env, reg);
1385
1386 return mark_reg_read(env, reg, reg->parent,
1387 rw64 ? REG_LIVE_READ64 : REG_LIVE_READ32);
1200 } else { 1388 } else {
1201 /* check whether register used as dest operand can be written to */ 1389 /* check whether register used as dest operand can be written to */
1202 if (regno == BPF_REG_FP) { 1390 if (regno == BPF_REG_FP) {
@@ -1204,12 +1392,441 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
1204 return -EACCES; 1392 return -EACCES;
1205 } 1393 }
1206 reg->live |= REG_LIVE_WRITTEN; 1394 reg->live |= REG_LIVE_WRITTEN;
1395 reg->subreg_def = rw64 ? DEF_NOT_SUBREG : env->insn_idx + 1;
1207 if (t == DST_OP) 1396 if (t == DST_OP)
1208 mark_reg_unknown(env, regs, regno); 1397 mark_reg_unknown(env, regs, regno);
1209 } 1398 }
1210 return 0; 1399 return 0;
1211} 1400}
1212 1401
1402/* for any branch, call, exit record the history of jmps in the given state */
1403static int push_jmp_history(struct bpf_verifier_env *env,
1404 struct bpf_verifier_state *cur)
1405{
1406 u32 cnt = cur->jmp_history_cnt;
1407 struct bpf_idx_pair *p;
1408
1409 cnt++;
1410 p = krealloc(cur->jmp_history, cnt * sizeof(*p), GFP_USER);
1411 if (!p)
1412 return -ENOMEM;
1413 p[cnt - 1].idx = env->insn_idx;
1414 p[cnt - 1].prev_idx = env->prev_insn_idx;
1415 cur->jmp_history = p;
1416 cur->jmp_history_cnt = cnt;
1417 return 0;
1418}
1419
1420/* Backtrack one insn at a time. If idx is not at the top of recorded
1421 * history then previous instruction came from straight line execution.
1422 */
1423static int get_prev_insn_idx(struct bpf_verifier_state *st, int i,
1424 u32 *history)
1425{
1426 u32 cnt = *history;
1427
1428 if (cnt && st->jmp_history[cnt - 1].idx == i) {
1429 i = st->jmp_history[cnt - 1].prev_idx;
1430 (*history)--;
1431 } else {
1432 i--;
1433 }
1434 return i;
1435}
1436
1437/* For given verifier state backtrack_insn() is called from the last insn to
1438 * the first insn. Its purpose is to compute a bitmask of registers and
1439 * stack slots that needs precision in the parent verifier state.
1440 */
1441static int backtrack_insn(struct bpf_verifier_env *env, int idx,
1442 u32 *reg_mask, u64 *stack_mask)
1443{
1444 const struct bpf_insn_cbs cbs = {
1445 .cb_print = verbose,
1446 .private_data = env,
1447 };
1448 struct bpf_insn *insn = env->prog->insnsi + idx;
1449 u8 class = BPF_CLASS(insn->code);
1450 u8 opcode = BPF_OP(insn->code);
1451 u8 mode = BPF_MODE(insn->code);
1452 u32 dreg = 1u << insn->dst_reg;
1453 u32 sreg = 1u << insn->src_reg;
1454 u32 spi;
1455
1456 if (insn->code == 0)
1457 return 0;
1458 if (env->log.level & BPF_LOG_LEVEL) {
1459 verbose(env, "regs=%x stack=%llx before ", *reg_mask, *stack_mask);
1460 verbose(env, "%d: ", idx);
1461 print_bpf_insn(&cbs, insn, env->allow_ptr_leaks);
1462 }
1463
1464 if (class == BPF_ALU || class == BPF_ALU64) {
1465 if (!(*reg_mask & dreg))
1466 return 0;
1467 if (opcode == BPF_MOV) {
1468 if (BPF_SRC(insn->code) == BPF_X) {
1469 /* dreg = sreg
1470 * dreg needs precision after this insn
1471 * sreg needs precision before this insn
1472 */
1473 *reg_mask &= ~dreg;
1474 *reg_mask |= sreg;
1475 } else {
1476 /* dreg = K
1477 * dreg needs precision after this insn.
1478 * Corresponding register is already marked
1479 * as precise=true in this verifier state.
1480 * No further markings in parent are necessary
1481 */
1482 *reg_mask &= ~dreg;
1483 }
1484 } else {
1485 if (BPF_SRC(insn->code) == BPF_X) {
1486 /* dreg += sreg
1487 * both dreg and sreg need precision
1488 * before this insn
1489 */
1490 *reg_mask |= sreg;
1491 } /* else dreg += K
1492 * dreg still needs precision before this insn
1493 */
1494 }
1495 } else if (class == BPF_LDX) {
1496 if (!(*reg_mask & dreg))
1497 return 0;
1498 *reg_mask &= ~dreg;
1499
1500 /* scalars can only be spilled into stack w/o losing precision.
1501 * Load from any other memory can be zero extended.
1502 * The desire to keep that precision is already indicated
1503 * by 'precise' mark in corresponding register of this state.
1504 * No further tracking necessary.
1505 */
1506 if (insn->src_reg != BPF_REG_FP)
1507 return 0;
1508 if (BPF_SIZE(insn->code) != BPF_DW)
1509 return 0;
1510
1511 /* dreg = *(u64 *)[fp - off] was a fill from the stack.
1512 * that [fp - off] slot contains scalar that needs to be
1513 * tracked with precision
1514 */
1515 spi = (-insn->off - 1) / BPF_REG_SIZE;
1516 if (spi >= 64) {
1517 verbose(env, "BUG spi %d\n", spi);
1518 WARN_ONCE(1, "verifier backtracking bug");
1519 return -EFAULT;
1520 }
1521 *stack_mask |= 1ull << spi;
1522 } else if (class == BPF_STX) {
1523 if (*reg_mask & dreg)
1524 /* stx shouldn't be using _scalar_ dst_reg
1525 * to access memory. It means backtracking
1526 * encountered a case of pointer subtraction.
1527 */
1528 return -ENOTSUPP;
1529 /* scalars can only be spilled into stack */
1530 if (insn->dst_reg != BPF_REG_FP)
1531 return 0;
1532 if (BPF_SIZE(insn->code) != BPF_DW)
1533 return 0;
1534 spi = (-insn->off - 1) / BPF_REG_SIZE;
1535 if (spi >= 64) {
1536 verbose(env, "BUG spi %d\n", spi);
1537 WARN_ONCE(1, "verifier backtracking bug");
1538 return -EFAULT;
1539 }
1540 if (!(*stack_mask & (1ull << spi)))
1541 return 0;
1542 *stack_mask &= ~(1ull << spi);
1543 *reg_mask |= sreg;
1544 } else if (class == BPF_JMP || class == BPF_JMP32) {
1545 if (opcode == BPF_CALL) {
1546 if (insn->src_reg == BPF_PSEUDO_CALL)
1547 return -ENOTSUPP;
1548 /* regular helper call sets R0 */
1549 *reg_mask &= ~1;
1550 if (*reg_mask & 0x3f) {
1551 /* if backtracing was looking for registers R1-R5
1552 * they should have been found already.
1553 */
1554 verbose(env, "BUG regs %x\n", *reg_mask);
1555 WARN_ONCE(1, "verifier backtracking bug");
1556 return -EFAULT;
1557 }
1558 } else if (opcode == BPF_EXIT) {
1559 return -ENOTSUPP;
1560 }
1561 } else if (class == BPF_LD) {
1562 if (!(*reg_mask & dreg))
1563 return 0;
1564 *reg_mask &= ~dreg;
1565 /* It's ld_imm64 or ld_abs or ld_ind.
1566 * For ld_imm64 no further tracking of precision
1567 * into parent is necessary
1568 */
1569 if (mode == BPF_IND || mode == BPF_ABS)
1570 /* to be analyzed */
1571 return -ENOTSUPP;
1572 } else if (class == BPF_ST) {
1573 if (*reg_mask & dreg)
1574 /* likely pointer subtraction */
1575 return -ENOTSUPP;
1576 }
1577 return 0;
1578}
1579
1580/* the scalar precision tracking algorithm:
1581 * . at the start all registers have precise=false.
1582 * . scalar ranges are tracked as normal through alu and jmp insns.
1583 * . once precise value of the scalar register is used in:
1584 * . ptr + scalar alu
1585 * . if (scalar cond K|scalar)
1586 * . helper_call(.., scalar, ...) where ARG_CONST is expected
1587 * backtrack through the verifier states and mark all registers and
1588 * stack slots with spilled constants that these scalar registers
1589 * should be precise.
1590 * . during state pruning two registers (or spilled stack slots)
1591 * are equivalent if both are not precise.
1592 *
1593 * Note the verifier cannot simply walk register parentage chain,
1594 * since many different registers and stack slots could have been
1595 * used to compute single precise scalar.
1596 *
1597 * The approach of starting with precise=true for all registers and then
1598 * backtrack to mark a register as not precise when the verifier detects
1599 * that program doesn't care about specific value (e.g., when helper
1600 * takes register as ARG_ANYTHING parameter) is not safe.
1601 *
1602 * It's ok to walk single parentage chain of the verifier states.
1603 * It's possible that this backtracking will go all the way till 1st insn.
1604 * All other branches will be explored for needing precision later.
1605 *
1606 * The backtracking needs to deal with cases like:
1607 * R8=map_value(id=0,off=0,ks=4,vs=1952,imm=0) R9_w=map_value(id=0,off=40,ks=4,vs=1952,imm=0)
1608 * r9 -= r8
1609 * r5 = r9
1610 * if r5 > 0x79f goto pc+7
1611 * R5_w=inv(id=0,umax_value=1951,var_off=(0x0; 0x7ff))
1612 * r5 += 1
1613 * ...
1614 * call bpf_perf_event_output#25
1615 * where .arg5_type = ARG_CONST_SIZE_OR_ZERO
1616 *
1617 * and this case:
1618 * r6 = 1
1619 * call foo // uses callee's r6 inside to compute r0
1620 * r0 += r6
1621 * if r0 == 0 goto
1622 *
1623 * to track above reg_mask/stack_mask needs to be independent for each frame.
1624 *
1625 * Also if parent's curframe > frame where backtracking started,
1626 * the verifier needs to mark registers in both frames, otherwise callees
1627 * may incorrectly prune callers. This is similar to
1628 * commit 7640ead93924 ("bpf: verifier: make sure callees don't prune with caller differences")
1629 *
1630 * For now backtracking falls back into conservative marking.
1631 */
1632static void mark_all_scalars_precise(struct bpf_verifier_env *env,
1633 struct bpf_verifier_state *st)
1634{
1635 struct bpf_func_state *func;
1636 struct bpf_reg_state *reg;
1637 int i, j;
1638
1639 /* big hammer: mark all scalars precise in this path.
1640 * pop_stack may still get !precise scalars.
1641 */
1642 for (; st; st = st->parent)
1643 for (i = 0; i <= st->curframe; i++) {
1644 func = st->frame[i];
1645 for (j = 0; j < BPF_REG_FP; j++) {
1646 reg = &func->regs[j];
1647 if (reg->type != SCALAR_VALUE)
1648 continue;
1649 reg->precise = true;
1650 }
1651 for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) {
1652 if (func->stack[j].slot_type[0] != STACK_SPILL)
1653 continue;
1654 reg = &func->stack[j].spilled_ptr;
1655 if (reg->type != SCALAR_VALUE)
1656 continue;
1657 reg->precise = true;
1658 }
1659 }
1660}
1661
1662static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,
1663 int spi)
1664{
1665 struct bpf_verifier_state *st = env->cur_state;
1666 int first_idx = st->first_insn_idx;
1667 int last_idx = env->insn_idx;
1668 struct bpf_func_state *func;
1669 struct bpf_reg_state *reg;
1670 u32 reg_mask = regno >= 0 ? 1u << regno : 0;
1671 u64 stack_mask = spi >= 0 ? 1ull << spi : 0;
1672 bool skip_first = true;
1673 bool new_marks = false;
1674 int i, err;
1675
1676 if (!env->allow_ptr_leaks)
1677 /* backtracking is root only for now */
1678 return 0;
1679
1680 func = st->frame[st->curframe];
1681 if (regno >= 0) {
1682 reg = &func->regs[regno];
1683 if (reg->type != SCALAR_VALUE) {
1684 WARN_ONCE(1, "backtracing misuse");
1685 return -EFAULT;
1686 }
1687 if (!reg->precise)
1688 new_marks = true;
1689 else
1690 reg_mask = 0;
1691 reg->precise = true;
1692 }
1693
1694 while (spi >= 0) {
1695 if (func->stack[spi].slot_type[0] != STACK_SPILL) {
1696 stack_mask = 0;
1697 break;
1698 }
1699 reg = &func->stack[spi].spilled_ptr;
1700 if (reg->type != SCALAR_VALUE) {
1701 stack_mask = 0;
1702 break;
1703 }
1704 if (!reg->precise)
1705 new_marks = true;
1706 else
1707 stack_mask = 0;
1708 reg->precise = true;
1709 break;
1710 }
1711
1712 if (!new_marks)
1713 return 0;
1714 if (!reg_mask && !stack_mask)
1715 return 0;
1716 for (;;) {
1717 DECLARE_BITMAP(mask, 64);
1718 u32 history = st->jmp_history_cnt;
1719
1720 if (env->log.level & BPF_LOG_LEVEL)
1721 verbose(env, "last_idx %d first_idx %d\n", last_idx, first_idx);
1722 for (i = last_idx;;) {
1723 if (skip_first) {
1724 err = 0;
1725 skip_first = false;
1726 } else {
1727 err = backtrack_insn(env, i, &reg_mask, &stack_mask);
1728 }
1729 if (err == -ENOTSUPP) {
1730 mark_all_scalars_precise(env, st);
1731 return 0;
1732 } else if (err) {
1733 return err;
1734 }
1735 if (!reg_mask && !stack_mask)
1736 /* Found assignment(s) into tracked register in this state.
1737 * Since this state is already marked, just return.
1738 * Nothing to be tracked further in the parent state.
1739 */
1740 return 0;
1741 if (i == first_idx)
1742 break;
1743 i = get_prev_insn_idx(st, i, &history);
1744 if (i >= env->prog->len) {
1745 /* This can happen if backtracking reached insn 0
1746 * and there are still reg_mask or stack_mask
1747 * to backtrack.
1748 * It means the backtracking missed the spot where
1749 * particular register was initialized with a constant.
1750 */
1751 verbose(env, "BUG backtracking idx %d\n", i);
1752 WARN_ONCE(1, "verifier backtracking bug");
1753 return -EFAULT;
1754 }
1755 }
1756 st = st->parent;
1757 if (!st)
1758 break;
1759
1760 new_marks = false;
1761 func = st->frame[st->curframe];
1762 bitmap_from_u64(mask, reg_mask);
1763 for_each_set_bit(i, mask, 32) {
1764 reg = &func->regs[i];
1765 if (reg->type != SCALAR_VALUE) {
1766 reg_mask &= ~(1u << i);
1767 continue;
1768 }
1769 if (!reg->precise)
1770 new_marks = true;
1771 reg->precise = true;
1772 }
1773
1774 bitmap_from_u64(mask, stack_mask);
1775 for_each_set_bit(i, mask, 64) {
1776 if (i >= func->allocated_stack / BPF_REG_SIZE) {
1777 /* This can happen if backtracking
1778 * is propagating stack precision where
1779 * caller has larger stack frame
1780 * than callee, but backtrack_insn() should
1781 * have returned -ENOTSUPP.
1782 */
1783 verbose(env, "BUG spi %d stack_size %d\n",
1784 i, func->allocated_stack);
1785 WARN_ONCE(1, "verifier backtracking bug");
1786 return -EFAULT;
1787 }
1788
1789 if (func->stack[i].slot_type[0] != STACK_SPILL) {
1790 stack_mask &= ~(1ull << i);
1791 continue;
1792 }
1793 reg = &func->stack[i].spilled_ptr;
1794 if (reg->type != SCALAR_VALUE) {
1795 stack_mask &= ~(1ull << i);
1796 continue;
1797 }
1798 if (!reg->precise)
1799 new_marks = true;
1800 reg->precise = true;
1801 }
1802 if (env->log.level & BPF_LOG_LEVEL) {
1803 print_verifier_state(env, func);
1804 verbose(env, "parent %s regs=%x stack=%llx marks\n",
1805 new_marks ? "didn't have" : "already had",
1806 reg_mask, stack_mask);
1807 }
1808
1809 if (!reg_mask && !stack_mask)
1810 break;
1811 if (!new_marks)
1812 break;
1813
1814 last_idx = st->last_insn_idx;
1815 first_idx = st->first_insn_idx;
1816 }
1817 return 0;
1818}
1819
1820static int mark_chain_precision(struct bpf_verifier_env *env, int regno)
1821{
1822 return __mark_chain_precision(env, regno, -1);
1823}
1824
1825static int mark_chain_precision_stack(struct bpf_verifier_env *env, int spi)
1826{
1827 return __mark_chain_precision(env, -1, spi);
1828}
1829
1213static bool is_spillable_regtype(enum bpf_reg_type type) 1830static bool is_spillable_regtype(enum bpf_reg_type type)
1214{ 1831{
1215 switch (type) { 1832 switch (type) {
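To make the precision machinery above concrete: it only kicks in when the exact bounds of a scalar are what make the program safe, for instance when the scalar ends up as the size argument of a helper. A hedged sketch in BPF C (compiled with clang -target bpf; SEC() and the helper declaration come from the selftests' bpf_helpers.h; this is not a test from this series):

#include <linux/bpf.h>
#include "bpf_helpers.h"

SEC("socket")
int len_clamp_example(struct __sk_buff *skb)
{
	char buf[64];
	__u64 len = skb->len;		/* unknown scalar */

	if (len < 1 || len > sizeof(buf))
		len = 1;		/* now provably in [1, 64] */

	/* 'len' feeds a size argument, so the verifier backtracks and marks
	 * everything that produced it as precise; unrelated scalars stay
	 * imprecise and state pruning can still treat them as equivalent.
	 */
	if (bpf_skb_load_bytes(skb, 0, buf, len) < 0)
		return 0;
	return buf[0];
}

char _license[] SEC("license") = "GPL";
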
@@ -1228,6 +1845,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
1228 case PTR_TO_SOCK_COMMON_OR_NULL: 1845 case PTR_TO_SOCK_COMMON_OR_NULL:
1229 case PTR_TO_TCP_SOCK: 1846 case PTR_TO_TCP_SOCK:
1230 case PTR_TO_TCP_SOCK_OR_NULL: 1847 case PTR_TO_TCP_SOCK_OR_NULL:
1848 case PTR_TO_XDP_SOCK:
1231 return true; 1849 return true;
1232 default: 1850 default:
1233 return false; 1851 return false;
@@ -1240,6 +1858,23 @@ static bool register_is_null(struct bpf_reg_state *reg)
1240 return reg->type == SCALAR_VALUE && tnum_equals_const(reg->var_off, 0); 1858 return reg->type == SCALAR_VALUE && tnum_equals_const(reg->var_off, 0);
1241} 1859}
1242 1860
1861static bool register_is_const(struct bpf_reg_state *reg)
1862{
1863 return reg->type == SCALAR_VALUE && tnum_is_const(reg->var_off);
1864}
1865
1866static void save_register_state(struct bpf_func_state *state,
1867 int spi, struct bpf_reg_state *reg)
1868{
1869 int i;
1870
1871 state->stack[spi].spilled_ptr = *reg;
1872 state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
1873
1874 for (i = 0; i < BPF_REG_SIZE; i++)
1875 state->stack[spi].slot_type[i] = STACK_SPILL;
1876}
1877
1243/* check_stack_read/write functions track spill/fill of registers, 1878/* check_stack_read/write functions track spill/fill of registers,
1244 * stack boundary and alignment are checked in check_mem_access() 1879 * stack boundary and alignment are checked in check_mem_access()
1245 */ 1880 */
@@ -1249,7 +1884,8 @@ static int check_stack_write(struct bpf_verifier_env *env,
1249{ 1884{
1250 struct bpf_func_state *cur; /* state of the current function */ 1885 struct bpf_func_state *cur; /* state of the current function */
1251 int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err; 1886 int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err;
1252 enum bpf_reg_type type; 1887 u32 dst_reg = env->prog->insnsi[insn_idx].dst_reg;
1888 struct bpf_reg_state *reg = NULL;
1253 1889
1254 err = realloc_func_state(state, round_up(slot + 1, BPF_REG_SIZE), 1890 err = realloc_func_state(state, round_up(slot + 1, BPF_REG_SIZE),
1255 state->acquired_refs, true); 1891 state->acquired_refs, true);
@@ -1266,27 +1902,48 @@ static int check_stack_write(struct bpf_verifier_env *env,
1266 } 1902 }
1267 1903
1268 cur = env->cur_state->frame[env->cur_state->curframe]; 1904 cur = env->cur_state->frame[env->cur_state->curframe];
1269 if (value_regno >= 0 && 1905 if (value_regno >= 0)
1270 is_spillable_regtype((type = cur->regs[value_regno].type))) { 1906 reg = &cur->regs[value_regno];
1271 1907
1908 if (reg && size == BPF_REG_SIZE && register_is_const(reg) &&
1909 !register_is_null(reg) && env->allow_ptr_leaks) {
1910 if (dst_reg != BPF_REG_FP) {
1911 /* The backtracking logic can only recognize explicit
1912 * stack slot address like [fp - 8]. Other spill of
1913 * scalar via a different register has to be conservative.
1914 * Backtrack from here and mark all registers as precise
1915 * that contributed into 'reg' being a constant.
1916 */
1917 err = mark_chain_precision(env, value_regno);
1918 if (err)
1919 return err;
1920 }
1921 save_register_state(state, spi, reg);
1922 } else if (reg && is_spillable_regtype(reg->type)) {
1272 /* register containing pointer is being spilled into stack */ 1923 /* register containing pointer is being spilled into stack */
1273 if (size != BPF_REG_SIZE) { 1924 if (size != BPF_REG_SIZE) {
1925 verbose_linfo(env, insn_idx, "; ");
1274 verbose(env, "invalid size of register spill\n"); 1926 verbose(env, "invalid size of register spill\n");
1275 return -EACCES; 1927 return -EACCES;
1276 } 1928 }
1277 1929
1278 if (state != cur && type == PTR_TO_STACK) { 1930 if (state != cur && reg->type == PTR_TO_STACK) {
1279 verbose(env, "cannot spill pointers to stack into stack frame of the caller\n"); 1931 verbose(env, "cannot spill pointers to stack into stack frame of the caller\n");
1280 return -EINVAL; 1932 return -EINVAL;
1281 } 1933 }
1282 1934
1283 /* save register state */ 1935 if (!env->allow_ptr_leaks) {
1284 state->stack[spi].spilled_ptr = cur->regs[value_regno]; 1936 bool sanitize = false;
1285 state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
1286 1937
1287 for (i = 0; i < BPF_REG_SIZE; i++) { 1938 if (state->stack[spi].slot_type[0] == STACK_SPILL &&
1288 if (state->stack[spi].slot_type[i] == STACK_MISC && 1939 register_is_const(&state->stack[spi].spilled_ptr))
1289 !env->allow_ptr_leaks) { 1940 sanitize = true;
1941 for (i = 0; i < BPF_REG_SIZE; i++)
1942 if (state->stack[spi].slot_type[i] == STACK_MISC) {
1943 sanitize = true;
1944 break;
1945 }
1946 if (sanitize) {
1290 int *poff = &env->insn_aux_data[insn_idx].sanitize_stack_off; 1947 int *poff = &env->insn_aux_data[insn_idx].sanitize_stack_off;
1291 int soff = (-spi - 1) * BPF_REG_SIZE; 1948 int soff = (-spi - 1) * BPF_REG_SIZE;
1292 1949
@@ -1309,8 +1966,8 @@ static int check_stack_write(struct bpf_verifier_env *env,
1309 } 1966 }
1310 *poff = soff; 1967 *poff = soff;
1311 } 1968 }
1312 state->stack[spi].slot_type[i] = STACK_SPILL;
1313 } 1969 }
1970 save_register_state(state, spi, reg);
1314 } else { 1971 } else {
1315 u8 type = STACK_MISC; 1972 u8 type = STACK_MISC;
1316 1973
@@ -1333,9 +1990,13 @@ static int check_stack_write(struct bpf_verifier_env *env,
1333 state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; 1990 state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
1334 1991
1335 /* when we zero initialize stack slots mark them as such */ 1992 /* when we zero initialize stack slots mark them as such */
1336 if (value_regno >= 0 && 1993 if (reg && register_is_null(reg)) {
1337 register_is_null(&cur->regs[value_regno])) 1994 /* backtracking doesn't work for STACK_ZERO yet. */
1995 err = mark_chain_precision(env, value_regno);
1996 if (err)
1997 return err;
1338 type = STACK_ZERO; 1998 type = STACK_ZERO;
1999 }
1339 2000
1340 /* Mark slots affected by this stack write. */ 2001 /* Mark slots affected by this stack write. */
1341 for (i = 0; i < size; i++) 2002 for (i = 0; i < size; i++)
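For readers following the new spill handling above: save_register_state() and the register_is_*() predicates it pairs with are introduced earlier in this patch. The sketch below is a restatement inferred from the call sites shown here, not an authoritative copy of those hunks.

static bool register_is_null(struct bpf_reg_state *reg)
{
	return reg->type == SCALAR_VALUE && tnum_equals_const(reg->var_off, 0);
}

static bool register_is_const(struct bpf_reg_state *reg)
{
	return reg->type == SCALAR_VALUE && tnum_is_const(reg->var_off);
}

/* Keep the whole bpf_reg_state of a spilled register, so a later fill can
 * restore the exact value (and its precision mark) instead of degrading
 * the slot to STACK_MISC.
 */
static void save_register_state(struct bpf_func_state *state,
				int spi, struct bpf_reg_state *reg)
{
	int i;

	state->stack[spi].spilled_ptr = *reg;
	state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;

	for (i = 0; i < BPF_REG_SIZE; i++)
		state->stack[spi].slot_type[i] = STACK_SPILL;
}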
@@ -1352,6 +2013,7 @@ static int check_stack_read(struct bpf_verifier_env *env,
1352 struct bpf_verifier_state *vstate = env->cur_state; 2013 struct bpf_verifier_state *vstate = env->cur_state;
1353 struct bpf_func_state *state = vstate->frame[vstate->curframe]; 2014 struct bpf_func_state *state = vstate->frame[vstate->curframe];
1354 int i, slot = -off - 1, spi = slot / BPF_REG_SIZE; 2015 int i, slot = -off - 1, spi = slot / BPF_REG_SIZE;
2016 struct bpf_reg_state *reg;
1355 u8 *stype; 2017 u8 *stype;
1356 2018
1357 if (reg_state->allocated_stack <= slot) { 2019 if (reg_state->allocated_stack <= slot) {
@@ -1360,11 +2022,21 @@ static int check_stack_read(struct bpf_verifier_env *env,
1360 return -EACCES; 2022 return -EACCES;
1361 } 2023 }
1362 stype = reg_state->stack[spi].slot_type; 2024 stype = reg_state->stack[spi].slot_type;
2025 reg = &reg_state->stack[spi].spilled_ptr;
1363 2026
1364 if (stype[0] == STACK_SPILL) { 2027 if (stype[0] == STACK_SPILL) {
1365 if (size != BPF_REG_SIZE) { 2028 if (size != BPF_REG_SIZE) {
1366 verbose(env, "invalid size of register spill\n"); 2029 if (reg->type != SCALAR_VALUE) {
1367 return -EACCES; 2030 verbose_linfo(env, env->insn_idx, "; ");
2031 verbose(env, "invalid size of register fill\n");
2032 return -EACCES;
2033 }
2034 if (value_regno >= 0) {
2035 mark_reg_unknown(env, state->regs, value_regno);
2036 state->regs[value_regno].live |= REG_LIVE_WRITTEN;
2037 }
2038 mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
2039 return 0;
1368 } 2040 }
1369 for (i = 1; i < BPF_REG_SIZE; i++) { 2041 for (i = 1; i < BPF_REG_SIZE; i++) {
1370 if (stype[(slot - i) % BPF_REG_SIZE] != STACK_SPILL) { 2042 if (stype[(slot - i) % BPF_REG_SIZE] != STACK_SPILL) {
@@ -1375,16 +2047,14 @@ static int check_stack_read(struct bpf_verifier_env *env,
1375 2047
1376 if (value_regno >= 0) { 2048 if (value_regno >= 0) {
1377 /* restore register state from stack */ 2049 /* restore register state from stack */
1378 state->regs[value_regno] = reg_state->stack[spi].spilled_ptr; 2050 state->regs[value_regno] = *reg;
1379 /* mark reg as written since spilled pointer state likely 2051 /* mark reg as written since spilled pointer state likely
1380 * has its liveness marks cleared by is_state_visited() 2052 * has its liveness marks cleared by is_state_visited()
1381 * which resets stack/reg liveness for state transitions 2053 * which resets stack/reg liveness for state transitions
1382 */ 2054 */
1383 state->regs[value_regno].live |= REG_LIVE_WRITTEN; 2055 state->regs[value_regno].live |= REG_LIVE_WRITTEN;
1384 } 2056 }
1385 mark_reg_read(env, &reg_state->stack[spi].spilled_ptr, 2057 mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
1386 reg_state->stack[spi].spilled_ptr.parent);
1387 return 0;
1388 } else { 2058 } else {
1389 int zeros = 0; 2059 int zeros = 0;
1390 2060
@@ -1399,22 +2069,32 @@ static int check_stack_read(struct bpf_verifier_env *env,
1399 off, i, size); 2069 off, i, size);
1400 return -EACCES; 2070 return -EACCES;
1401 } 2071 }
1402 mark_reg_read(env, &reg_state->stack[spi].spilled_ptr, 2072 mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
1403 reg_state->stack[spi].spilled_ptr.parent);
1404 if (value_regno >= 0) { 2073 if (value_regno >= 0) {
1405 if (zeros == size) { 2074 if (zeros == size) {
1406 /* any size read into register is zero extended, 2075 /* any size read into register is zero extended,
1407 * so the whole register == const_zero 2076 * so the whole register == const_zero
1408 */ 2077 */
1409 __mark_reg_const_zero(&state->regs[value_regno]); 2078 __mark_reg_const_zero(&state->regs[value_regno]);
2079 /* backtracking doesn't support STACK_ZERO yet,
2080 * so mark it precise here, so that later
2081 * backtracking can stop here.
2082 * Backtracking may not need this if this register
2083 * doesn't participate in pointer adjustment.
2084 * Forward propagation of precise flag is not
2085 * necessary either. This mark is only to stop
2086 * backtracking. Any register that contributed
2087 * to const 0 was marked precise before spill.
2088 */
2089 state->regs[value_regno].precise = true;
1410 } else { 2090 } else {
1411 /* have read misc data from the stack */ 2091 /* have read misc data from the stack */
1412 mark_reg_unknown(env, state->regs, value_regno); 2092 mark_reg_unknown(env, state->regs, value_regno);
1413 } 2093 }
1414 state->regs[value_regno].live |= REG_LIVE_WRITTEN; 2094 state->regs[value_regno].live |= REG_LIVE_WRITTEN;
1415 } 2095 }
1416 return 0;
1417 } 2096 }
2097 return 0;
1418} 2098}
1419 2099
1420static int check_stack_access(struct bpf_verifier_env *env, 2100static int check_stack_access(struct bpf_verifier_env *env,
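A hedged BPF-C illustration of what the constant spill/fill tracking above enables. Assumptions of the example, not of this patch: clang actually spills `bound` to fp-8, and the header paths/section name follow current libbpf conventions.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

char _license[] SEC("license") = "GPL";

SEC("socket")
int const_spill_fill(struct __sk_buff *skb)
{
	/* Forced through the stack; with precise constant-spill tracking the
	 * verifier still knows bound == 64 after it is loaded back and used
	 * to cap the value returned below.
	 */
	volatile __u64 bound = 64;
	__u64 off = skb->len & 0xff;

	if (off < bound)
		return (int)off;
	return 0;
}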
@@ -1580,6 +2260,13 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
1580 2260
1581 env->seen_direct_write = true; 2261 env->seen_direct_write = true;
1582 return true; 2262 return true;
2263
2264 case BPF_PROG_TYPE_CGROUP_SOCKOPT:
2265 if (t == BPF_WRITE)
2266 env->seen_direct_write = true;
2267
2268 return true;
2269
1583 default: 2270 default:
1584 return false; 2271 return false;
1585 } 2272 }
@@ -1706,6 +2393,9 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx,
1706 case PTR_TO_TCP_SOCK: 2393 case PTR_TO_TCP_SOCK:
1707 valid = bpf_tcp_sock_is_valid_access(off, size, t, &info); 2394 valid = bpf_tcp_sock_is_valid_access(off, size, t, &info);
1708 break; 2395 break;
2396 case PTR_TO_XDP_SOCK:
2397 valid = bpf_xdp_sock_is_valid_access(off, size, t, &info);
2398 break;
1709 default: 2399 default:
1710 valid = false; 2400 valid = false;
1711 } 2401 }
@@ -1870,6 +2560,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
1870 case PTR_TO_TCP_SOCK: 2560 case PTR_TO_TCP_SOCK:
1871 pointer_desc = "tcp_sock "; 2561 pointer_desc = "tcp_sock ";
1872 break; 2562 break;
2563 case PTR_TO_XDP_SOCK:
2564 pointer_desc = "xdp_sock ";
2565 break;
1873 default: 2566 default:
1874 break; 2567 break;
1875 } 2568 }
@@ -2109,6 +2802,12 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
2109 value_regno); 2802 value_regno);
2110 if (reg_type_may_be_null(reg_type)) 2803 if (reg_type_may_be_null(reg_type))
2111 regs[value_regno].id = ++env->id_gen; 2804 regs[value_regno].id = ++env->id_gen;
2805			/* A load of a ctx field could have an actual load
2806			 * size different from the one encoded in the insn.
2807			 * When the dst is a pointer, it is certainly not a
2808			 * sub-register.
2809			 */
2810 regs[value_regno].subreg_def = DEF_NOT_SUBREG;
2112 } 2811 }
2113 regs[value_regno].type = reg_type; 2812 regs[value_regno].type = reg_type;
2114 } 2813 }
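The subreg_def update above feeds the 32-bit zero-extension pass added later in this patch. A sketch of the consumer side, inferred from the mark_insn_zext() call in propagate_liveness() further down; treat the details as assumptions:

/* Each register remembers the index of the insn that last defined it as a
 * 32-bit sub-register (stored as insn_idx + 1, see check_alu_op()), or
 * DEF_NOT_SUBREG for a full 64-bit definition. When a 64-bit read is
 * propagated to such a register, the defining insn must zero-extend.
 */
static void mark_insn_zext(struct bpf_verifier_env *env,
			   struct bpf_reg_state *reg)
{
	s32 def_idx = reg->subreg_def;

	if (def_idx == DEF_NOT_SUBREG)
		return;

	env->insn_aux_data[def_idx - 1].zext_dst = true;
	/* The dst will be zero extended, so it is no longer a sub-register. */
	reg->subreg_def = DEF_NOT_SUBREG;
}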
@@ -2263,7 +2962,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
2263{ 2962{
2264 struct bpf_reg_state *reg = reg_state(env, regno); 2963 struct bpf_reg_state *reg = reg_state(env, regno);
2265 struct bpf_func_state *state = func(env, reg); 2964 struct bpf_func_state *state = func(env, reg);
2266 int err, min_off, max_off, i, slot, spi; 2965 int err, min_off, max_off, i, j, slot, spi;
2267 2966
2268 if (reg->type != PTR_TO_STACK) { 2967 if (reg->type != PTR_TO_STACK) {
2269 /* Allow zero-byte read from NULL, regardless of pointer type */ 2968 /* Allow zero-byte read from NULL, regardless of pointer type */
@@ -2351,6 +3050,14 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
2351 *stype = STACK_MISC; 3050 *stype = STACK_MISC;
2352 goto mark; 3051 goto mark;
2353 } 3052 }
3053 if (state->stack[spi].slot_type[0] == STACK_SPILL &&
3054 state->stack[spi].spilled_ptr.type == SCALAR_VALUE) {
3055 __mark_reg_unknown(&state->stack[spi].spilled_ptr);
3056 for (j = 0; j < BPF_REG_SIZE; j++)
3057 state->stack[spi].slot_type[j] = STACK_MISC;
3058 goto mark;
3059 }
3060
2354err: 3061err:
2355 if (tnum_is_const(reg->var_off)) { 3062 if (tnum_is_const(reg->var_off)) {
2356 verbose(env, "invalid indirect read from stack off %d+%d size %d\n", 3063 verbose(env, "invalid indirect read from stack off %d+%d size %d\n",
@@ -2368,7 +3075,8 @@ mark:
2368 * the whole slot to be marked as 'read' 3075 * the whole slot to be marked as 'read'
2369 */ 3076 */
2370 mark_reg_read(env, &state->stack[spi].spilled_ptr, 3077 mark_reg_read(env, &state->stack[spi].spilled_ptr,
2371 state->stack[spi].spilled_ptr.parent); 3078 state->stack[spi].spilled_ptr.parent,
3079 REG_LIVE_READ64);
2372 } 3080 }
2373 return update_stack_depth(env, state, min_off); 3081 return update_stack_depth(env, state, min_off);
2374} 3082}
@@ -2701,6 +3409,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
2701 err = check_helper_mem_access(env, regno - 1, 3409 err = check_helper_mem_access(env, regno - 1,
2702 reg->umax_value, 3410 reg->umax_value,
2703 zero_size_allowed, meta); 3411 zero_size_allowed, meta);
3412 if (!err)
3413 err = mark_chain_precision(env, regno);
2704 } else if (arg_type_is_int_ptr(arg_type)) { 3414 } else if (arg_type_is_int_ptr(arg_type)) {
2705 int size = int_ptr_type_to_size(arg_type); 3415 int size = int_ptr_type_to_size(arg_type);
2706 3416
@@ -2749,22 +3459,23 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
2749 if (func_id != BPF_FUNC_get_local_storage) 3459 if (func_id != BPF_FUNC_get_local_storage)
2750 goto error; 3460 goto error;
2751 break; 3461 break;
2752 /* devmap returns a pointer to a live net_device ifindex that we cannot
2753 * allow to be modified from bpf side. So do not allow lookup elements
2754 * for now.
2755 */
2756 case BPF_MAP_TYPE_DEVMAP: 3462 case BPF_MAP_TYPE_DEVMAP:
2757 if (func_id != BPF_FUNC_redirect_map) 3463 if (func_id != BPF_FUNC_redirect_map &&
3464 func_id != BPF_FUNC_map_lookup_elem)
2758 goto error; 3465 goto error;
2759 break; 3466 break;
2760 /* Restrict bpf side of cpumap and xskmap, open when use-cases 3467 /* Restrict bpf side of cpumap and xskmap, open when use-cases
2761 * appear. 3468 * appear.
2762 */ 3469 */
2763 case BPF_MAP_TYPE_CPUMAP: 3470 case BPF_MAP_TYPE_CPUMAP:
2764 case BPF_MAP_TYPE_XSKMAP:
2765 if (func_id != BPF_FUNC_redirect_map) 3471 if (func_id != BPF_FUNC_redirect_map)
2766 goto error; 3472 goto error;
2767 break; 3473 break;
3474 case BPF_MAP_TYPE_XSKMAP:
3475 if (func_id != BPF_FUNC_redirect_map &&
3476 func_id != BPF_FUNC_map_lookup_elem)
3477 goto error;
3478 break;
2768 case BPF_MAP_TYPE_ARRAY_OF_MAPS: 3479 case BPF_MAP_TYPE_ARRAY_OF_MAPS:
2769 case BPF_MAP_TYPE_HASH_OF_MAPS: 3480 case BPF_MAP_TYPE_HASH_OF_MAPS:
2770 if (func_id != BPF_FUNC_map_lookup_elem) 3481 if (func_id != BPF_FUNC_map_lookup_elem)
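With DEVMAP and XSKMAP element lookups now permitted from BPF programs, an XDP program can inspect a slot before redirecting. A hedged sketch follows; the BTF-style map definition and header paths are conventions of the example rather than part of this patch, and the lookup result is the read-only PTR_TO_XDP_SOCK introduced by the verifier changes above.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

char _license[] SEC("license") = "GPL";

struct {
	__uint(type, BPF_MAP_TYPE_XSKMAP);
	__uint(max_entries, 64);
	__type(key, __u32);
	__type(value, __u32);
} xsks_map SEC(".maps");

SEC("xdp")
int xdp_sock_redirect(struct xdp_md *ctx)
{
	__u32 qid = ctx->rx_queue_index;

	/* A NULL check is all that is needed; the pointed-to xdp_sock is not
	 * writable from BPF.
	 */
	if (bpf_map_lookup_elem(&xsks_map, &qid))
		return bpf_redirect_map(&xsks_map, qid, 0);
	return XDP_PASS;
}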
@@ -3332,6 +4043,9 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
3332 check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); 4043 check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
3333 } 4044 }
3334 4045
4046 /* helper call returns 64-bit value. */
4047 regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
4048
3335 /* update return register (already marked as written above) */ 4049 /* update return register (already marked as written above) */
3336 if (fn->ret_type == RET_INTEGER) { 4050 if (fn->ret_type == RET_INTEGER) {
3337 /* sets type to SCALAR_VALUE */ 4051 /* sets type to SCALAR_VALUE */
@@ -3652,6 +4366,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
3652 case PTR_TO_SOCK_COMMON_OR_NULL: 4366 case PTR_TO_SOCK_COMMON_OR_NULL:
3653 case PTR_TO_TCP_SOCK: 4367 case PTR_TO_TCP_SOCK:
3654 case PTR_TO_TCP_SOCK_OR_NULL: 4368 case PTR_TO_TCP_SOCK_OR_NULL:
4369 case PTR_TO_XDP_SOCK:
3655 verbose(env, "R%d pointer arithmetic on %s prohibited\n", 4370 verbose(env, "R%d pointer arithmetic on %s prohibited\n",
3656 dst, reg_type_str[ptr_reg->type]); 4371 dst, reg_type_str[ptr_reg->type]);
3657 return -EACCES; 4372 return -EACCES;
@@ -4129,6 +4844,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
4129 struct bpf_reg_state *regs = state->regs, *dst_reg, *src_reg; 4844 struct bpf_reg_state *regs = state->regs, *dst_reg, *src_reg;
4130 struct bpf_reg_state *ptr_reg = NULL, off_reg = {0}; 4845 struct bpf_reg_state *ptr_reg = NULL, off_reg = {0};
4131 u8 opcode = BPF_OP(insn->code); 4846 u8 opcode = BPF_OP(insn->code);
4847 int err;
4132 4848
4133 dst_reg = &regs[insn->dst_reg]; 4849 dst_reg = &regs[insn->dst_reg];
4134 src_reg = NULL; 4850 src_reg = NULL;
@@ -4155,11 +4871,17 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
4155 * This is legal, but we have to reverse our 4871 * This is legal, but we have to reverse our
4156 * src/dest handling in computing the range 4872 * src/dest handling in computing the range
4157 */ 4873 */
4874 err = mark_chain_precision(env, insn->dst_reg);
4875 if (err)
4876 return err;
4158 return adjust_ptr_min_max_vals(env, insn, 4877 return adjust_ptr_min_max_vals(env, insn,
4159 src_reg, dst_reg); 4878 src_reg, dst_reg);
4160 } 4879 }
4161 } else if (ptr_reg) { 4880 } else if (ptr_reg) {
4162 /* pointer += scalar */ 4881 /* pointer += scalar */
4882 err = mark_chain_precision(env, insn->src_reg);
4883 if (err)
4884 return err;
4163 return adjust_ptr_min_max_vals(env, insn, 4885 return adjust_ptr_min_max_vals(env, insn,
4164 dst_reg, src_reg); 4886 dst_reg, src_reg);
4165 } 4887 }
@@ -4263,6 +4985,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
4263 */ 4985 */
4264 *dst_reg = *src_reg; 4986 *dst_reg = *src_reg;
4265 dst_reg->live |= REG_LIVE_WRITTEN; 4987 dst_reg->live |= REG_LIVE_WRITTEN;
4988 dst_reg->subreg_def = DEF_NOT_SUBREG;
4266 } else { 4989 } else {
4267 /* R1 = (u32) R2 */ 4990 /* R1 = (u32) R2 */
4268 if (is_pointer_value(env, insn->src_reg)) { 4991 if (is_pointer_value(env, insn->src_reg)) {
@@ -4273,6 +4996,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
4273 } else if (src_reg->type == SCALAR_VALUE) { 4996 } else if (src_reg->type == SCALAR_VALUE) {
4274 *dst_reg = *src_reg; 4997 *dst_reg = *src_reg;
4275 dst_reg->live |= REG_LIVE_WRITTEN; 4998 dst_reg->live |= REG_LIVE_WRITTEN;
4999 dst_reg->subreg_def = env->insn_idx + 1;
4276 } else { 5000 } else {
4277 mark_reg_unknown(env, regs, 5001 mark_reg_unknown(env, regs,
4278 insn->dst_reg); 5002 insn->dst_reg);
@@ -4889,6 +5613,9 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
4889 if (reg->map_ptr->inner_map_meta) { 5613 if (reg->map_ptr->inner_map_meta) {
4890 reg->type = CONST_PTR_TO_MAP; 5614 reg->type = CONST_PTR_TO_MAP;
4891 reg->map_ptr = reg->map_ptr->inner_map_meta; 5615 reg->map_ptr = reg->map_ptr->inner_map_meta;
5616 } else if (reg->map_ptr->map_type ==
5617 BPF_MAP_TYPE_XSKMAP) {
5618 reg->type = PTR_TO_XDP_SOCK;
4892 } else { 5619 } else {
4893 reg->type = PTR_TO_MAP_VALUE; 5620 reg->type = PTR_TO_MAP_VALUE;
4894 } 5621 }
@@ -5060,9 +5787,10 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
5060 struct bpf_verifier_state *this_branch = env->cur_state; 5787 struct bpf_verifier_state *this_branch = env->cur_state;
5061 struct bpf_verifier_state *other_branch; 5788 struct bpf_verifier_state *other_branch;
5062 struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs; 5789 struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs;
5063 struct bpf_reg_state *dst_reg, *other_branch_regs; 5790 struct bpf_reg_state *dst_reg, *other_branch_regs, *src_reg = NULL;
5064 u8 opcode = BPF_OP(insn->code); 5791 u8 opcode = BPF_OP(insn->code);
5065 bool is_jmp32; 5792 bool is_jmp32;
5793 int pred = -1;
5066 int err; 5794 int err;
5067 5795
5068 /* Only conditional jumps are expected to reach here. */ 5796 /* Only conditional jumps are expected to reach here. */
@@ -5087,6 +5815,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
5087 insn->src_reg); 5815 insn->src_reg);
5088 return -EACCES; 5816 return -EACCES;
5089 } 5817 }
5818 src_reg = &regs[insn->src_reg];
5090 } else { 5819 } else {
5091 if (insn->src_reg != BPF_REG_0) { 5820 if (insn->src_reg != BPF_REG_0) {
5092 verbose(env, "BPF_JMP/JMP32 uses reserved fields\n"); 5821 verbose(env, "BPF_JMP/JMP32 uses reserved fields\n");
@@ -5102,20 +5831,29 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
5102 dst_reg = &regs[insn->dst_reg]; 5831 dst_reg = &regs[insn->dst_reg];
5103 is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32; 5832 is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32;
5104 5833
5105 if (BPF_SRC(insn->code) == BPF_K) { 5834 if (BPF_SRC(insn->code) == BPF_K)
5106 int pred = is_branch_taken(dst_reg, insn->imm, opcode, 5835 pred = is_branch_taken(dst_reg, insn->imm,
5107 is_jmp32); 5836 opcode, is_jmp32);
5108 5837 else if (src_reg->type == SCALAR_VALUE &&
5109 if (pred == 1) { 5838 tnum_is_const(src_reg->var_off))
5110 /* only follow the goto, ignore fall-through */ 5839 pred = is_branch_taken(dst_reg, src_reg->var_off.value,
5111 *insn_idx += insn->off; 5840 opcode, is_jmp32);
5112 return 0; 5841 if (pred >= 0) {
5113 } else if (pred == 0) { 5842 err = mark_chain_precision(env, insn->dst_reg);
5114 /* only follow fall-through branch, since 5843 if (BPF_SRC(insn->code) == BPF_X && !err)
5115 * that's where the program will go 5844 err = mark_chain_precision(env, insn->src_reg);
5116 */ 5845 if (err)
5117 return 0; 5846 return err;
5118 } 5847 }
5848 if (pred == 1) {
5849 /* only follow the goto, ignore fall-through */
5850 *insn_idx += insn->off;
5851 return 0;
5852 } else if (pred == 0) {
5853 /* only follow fall-through branch, since
5854 * that's where the program will go
5855 */
5856 return 0;
5119 } 5857 }
5120 5858
5121 other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx, 5859 other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx,
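The BPF_X prediction added above means a compare between two registers whose values are both known can be resolved at verification time, so only the live arm is explored. A hedged illustration; whether clang emits a register-register compare here is an assumption, and the volatile only keeps the constant out of the immediate field.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

char _license[] SEC("license") = "GPL";

SEC("socket")
int dead_branch(struct __sk_buff *skb)
{
	volatile __u32 a = 4;
	__u32 b = 8;

	if (a > b)			/* both operands known: branch never taken */
		return skb->len;	/* not explored from this branch */
	return 0;
}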
@@ -5352,21 +6090,32 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
5352 * Already marked as written above. 6090 * Already marked as written above.
5353 */ 6091 */
5354 mark_reg_unknown(env, regs, BPF_REG_0); 6092 mark_reg_unknown(env, regs, BPF_REG_0);
6093 /* ld_abs load up to 32-bit skb data. */
6094 regs[BPF_REG_0].subreg_def = env->insn_idx + 1;
5355 return 0; 6095 return 0;
5356} 6096}
5357 6097
5358static int check_return_code(struct bpf_verifier_env *env) 6098static int check_return_code(struct bpf_verifier_env *env)
5359{ 6099{
6100 struct tnum enforce_attach_type_range = tnum_unknown;
5360 struct bpf_reg_state *reg; 6101 struct bpf_reg_state *reg;
5361 struct tnum range = tnum_range(0, 1); 6102 struct tnum range = tnum_range(0, 1);
5362 6103
5363 switch (env->prog->type) { 6104 switch (env->prog->type) {
6105 case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
6106 if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG ||
6107 env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG)
6108 range = tnum_range(1, 1);
5364 case BPF_PROG_TYPE_CGROUP_SKB: 6109 case BPF_PROG_TYPE_CGROUP_SKB:
6110 if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS) {
6111 range = tnum_range(0, 3);
6112 enforce_attach_type_range = tnum_range(2, 3);
6113 }
5365 case BPF_PROG_TYPE_CGROUP_SOCK: 6114 case BPF_PROG_TYPE_CGROUP_SOCK:
5366 case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
5367 case BPF_PROG_TYPE_SOCK_OPS: 6115 case BPF_PROG_TYPE_SOCK_OPS:
5368 case BPF_PROG_TYPE_CGROUP_DEVICE: 6116 case BPF_PROG_TYPE_CGROUP_DEVICE:
5369 case BPF_PROG_TYPE_CGROUP_SYSCTL: 6117 case BPF_PROG_TYPE_CGROUP_SYSCTL:
6118 case BPF_PROG_TYPE_CGROUP_SOCKOPT:
5370 break; 6119 break;
5371 default: 6120 default:
5372 return 0; 6121 return 0;
@@ -5380,18 +6129,23 @@ static int check_return_code(struct bpf_verifier_env *env)
5380 } 6129 }
5381 6130
5382 if (!tnum_in(range, reg->var_off)) { 6131 if (!tnum_in(range, reg->var_off)) {
6132 char tn_buf[48];
6133
5383 verbose(env, "At program exit the register R0 "); 6134 verbose(env, "At program exit the register R0 ");
5384 if (!tnum_is_unknown(reg->var_off)) { 6135 if (!tnum_is_unknown(reg->var_off)) {
5385 char tn_buf[48];
5386
5387 tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); 6136 tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
5388 verbose(env, "has value %s", tn_buf); 6137 verbose(env, "has value %s", tn_buf);
5389 } else { 6138 } else {
5390 verbose(env, "has unknown scalar value"); 6139 verbose(env, "has unknown scalar value");
5391 } 6140 }
5392 verbose(env, " should have been 0 or 1\n"); 6141 tnum_strn(tn_buf, sizeof(tn_buf), range);
6142 verbose(env, " should have been in %s\n", tn_buf);
5393 return -EINVAL; 6143 return -EINVAL;
5394 } 6144 }
6145
6146 if (!tnum_is_unknown(enforce_attach_type_range) &&
6147 tnum_in(enforce_attach_type_range, reg->var_off))
6148 env->prog->enforce_expected_attach_type = 1;
5395 return 0; 6149 return 0;
5396} 6150}
5397 6151
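A hedged example of the relaxed return-code check for BPF_CGROUP_INET_EGRESS programs: any value in [0, 3] now verifies, and returning 2 or 3 sets enforce_expected_attach_type at load time. The runtime meaning of 2 and 3 (congestion-notification-style feedback) lives in the cgroup changes elsewhere in this series and is only assumed here.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

char _license[] SEC("license") = "GPL";

SEC("cgroup_skb/egress")
int egress_rc(struct __sk_buff *skb)
{
	if (skb->len > 1400)
		return 3;	/* accepted by the wider tnum_range(0, 3) */
	return 1;		/* plain "pass" */
}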
@@ -5435,14 +6189,33 @@ enum {
5435 BRANCH = 2, 6189 BRANCH = 2,
5436}; 6190};
5437 6191
5438#define STATE_LIST_MARK ((struct bpf_verifier_state_list *) -1L) 6192static u32 state_htab_size(struct bpf_verifier_env *env)
6193{
6194 return env->prog->len;
6195}
6196
6197static struct bpf_verifier_state_list **explored_state(
6198 struct bpf_verifier_env *env,
6199 int idx)
6200{
6201 struct bpf_verifier_state *cur = env->cur_state;
6202 struct bpf_func_state *state = cur->frame[cur->curframe];
6203
6204 return &env->explored_states[(idx ^ state->callsite) % state_htab_size(env)];
6205}
6206
6207static void init_explored_state(struct bpf_verifier_env *env, int idx)
6208{
6209 env->insn_aux_data[idx].prune_point = true;
6210}
5439 6211
5440/* t, w, e - match pseudo-code above: 6212/* t, w, e - match pseudo-code above:
5441 * t - index of current instruction 6213 * t - index of current instruction
5442 * w - next instruction 6214 * w - next instruction
5443 * e - edge 6215 * e - edge
5444 */ 6216 */
5445static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) 6217static int push_insn(int t, int w, int e, struct bpf_verifier_env *env,
6218 bool loop_ok)
5446{ 6219{
5447 int *insn_stack = env->cfg.insn_stack; 6220 int *insn_stack = env->cfg.insn_stack;
5448 int *insn_state = env->cfg.insn_state; 6221 int *insn_state = env->cfg.insn_state;
@@ -5461,7 +6234,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
5461 6234
5462 if (e == BRANCH) 6235 if (e == BRANCH)
5463 /* mark branch target for state pruning */ 6236 /* mark branch target for state pruning */
5464 env->explored_states[w] = STATE_LIST_MARK; 6237 init_explored_state(env, w);
5465 6238
5466 if (insn_state[w] == 0) { 6239 if (insn_state[w] == 0) {
5467 /* tree-edge */ 6240 /* tree-edge */
@@ -5472,6 +6245,8 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
5472 insn_stack[env->cfg.cur_stack++] = w; 6245 insn_stack[env->cfg.cur_stack++] = w;
5473 return 1; 6246 return 1;
5474 } else if ((insn_state[w] & 0xF0) == DISCOVERED) { 6247 } else if ((insn_state[w] & 0xF0) == DISCOVERED) {
6248 if (loop_ok && env->allow_ptr_leaks)
6249 return 0;
5475 verbose_linfo(env, t, "%d: ", t); 6250 verbose_linfo(env, t, "%d: ", t);
5476 verbose_linfo(env, w, "%d: ", w); 6251 verbose_linfo(env, w, "%d: ", w);
5477 verbose(env, "back-edge from insn %d to %d\n", t, w); 6252 verbose(env, "back-edge from insn %d to %d\n", t, w);
@@ -5523,16 +6298,17 @@ peek_stack:
5523 if (opcode == BPF_EXIT) { 6298 if (opcode == BPF_EXIT) {
5524 goto mark_explored; 6299 goto mark_explored;
5525 } else if (opcode == BPF_CALL) { 6300 } else if (opcode == BPF_CALL) {
5526 ret = push_insn(t, t + 1, FALLTHROUGH, env); 6301 ret = push_insn(t, t + 1, FALLTHROUGH, env, false);
5527 if (ret == 1) 6302 if (ret == 1)
5528 goto peek_stack; 6303 goto peek_stack;
5529 else if (ret < 0) 6304 else if (ret < 0)
5530 goto err_free; 6305 goto err_free;
5531 if (t + 1 < insn_cnt) 6306 if (t + 1 < insn_cnt)
5532 env->explored_states[t + 1] = STATE_LIST_MARK; 6307 init_explored_state(env, t + 1);
5533 if (insns[t].src_reg == BPF_PSEUDO_CALL) { 6308 if (insns[t].src_reg == BPF_PSEUDO_CALL) {
5534 env->explored_states[t] = STATE_LIST_MARK; 6309 init_explored_state(env, t);
5535 ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env); 6310 ret = push_insn(t, t + insns[t].imm + 1, BRANCH,
6311 env, false);
5536 if (ret == 1) 6312 if (ret == 1)
5537 goto peek_stack; 6313 goto peek_stack;
5538 else if (ret < 0) 6314 else if (ret < 0)
@@ -5545,26 +6321,31 @@ peek_stack:
5545 } 6321 }
5546 /* unconditional jump with single edge */ 6322 /* unconditional jump with single edge */
5547 ret = push_insn(t, t + insns[t].off + 1, 6323 ret = push_insn(t, t + insns[t].off + 1,
5548 FALLTHROUGH, env); 6324 FALLTHROUGH, env, true);
5549 if (ret == 1) 6325 if (ret == 1)
5550 goto peek_stack; 6326 goto peek_stack;
5551 else if (ret < 0) 6327 else if (ret < 0)
5552 goto err_free; 6328 goto err_free;
6329 /* unconditional jmp is not a good pruning point,
6330 * but it's marked, since backtracking needs
6331 * to record jmp history in is_state_visited().
6332 */
6333 init_explored_state(env, t + insns[t].off + 1);
5553 /* tell verifier to check for equivalent states 6334 /* tell verifier to check for equivalent states
5554 * after every call and jump 6335 * after every call and jump
5555 */ 6336 */
5556 if (t + 1 < insn_cnt) 6337 if (t + 1 < insn_cnt)
5557 env->explored_states[t + 1] = STATE_LIST_MARK; 6338 init_explored_state(env, t + 1);
5558 } else { 6339 } else {
5559 /* conditional jump with two edges */ 6340 /* conditional jump with two edges */
5560 env->explored_states[t] = STATE_LIST_MARK; 6341 init_explored_state(env, t);
5561 ret = push_insn(t, t + 1, FALLTHROUGH, env); 6342 ret = push_insn(t, t + 1, FALLTHROUGH, env, true);
5562 if (ret == 1) 6343 if (ret == 1)
5563 goto peek_stack; 6344 goto peek_stack;
5564 else if (ret < 0) 6345 else if (ret < 0)
5565 goto err_free; 6346 goto err_free;
5566 6347
5567 ret = push_insn(t, t + insns[t].off + 1, BRANCH, env); 6348 ret = push_insn(t, t + insns[t].off + 1, BRANCH, env, true);
5568 if (ret == 1) 6349 if (ret == 1)
5569 goto peek_stack; 6350 goto peek_stack;
5570 else if (ret < 0) 6351 else if (ret < 0)
@@ -5574,7 +6355,7 @@ peek_stack:
5574 /* all other non-branch instructions with single 6355 /* all other non-branch instructions with single
5575 * fall-through edge 6356 * fall-through edge
5576 */ 6357 */
5577 ret = push_insn(t, t + 1, FALLTHROUGH, env); 6358 ret = push_insn(t, t + 1, FALLTHROUGH, env, false);
5578 if (ret == 1) 6359 if (ret == 1)
5579 goto peek_stack; 6360 goto peek_stack;
5580 else if (ret < 0) 6361 else if (ret < 0)
@@ -6005,12 +6786,12 @@ static void clean_live_states(struct bpf_verifier_env *env, int insn,
6005 struct bpf_verifier_state_list *sl; 6786 struct bpf_verifier_state_list *sl;
6006 int i; 6787 int i;
6007 6788
6008 sl = env->explored_states[insn]; 6789 sl = *explored_state(env, insn);
6009 if (!sl) 6790 while (sl) {
6010 return; 6791 if (sl->state.branches)
6011 6792 goto next;
6012 while (sl != STATE_LIST_MARK) { 6793 if (sl->state.insn_idx != insn ||
6013 if (sl->state.curframe != cur->curframe) 6794 sl->state.curframe != cur->curframe)
6014 goto next; 6795 goto next;
6015 for (i = 0; i <= cur->curframe; i++) 6796 for (i = 0; i <= cur->curframe; i++)
6016 if (sl->state.frame[i]->callsite != cur->frame[i]->callsite) 6797 if (sl->state.frame[i]->callsite != cur->frame[i]->callsite)
@@ -6050,6 +6831,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
6050 switch (rold->type) { 6831 switch (rold->type) {
6051 case SCALAR_VALUE: 6832 case SCALAR_VALUE:
6052 if (rcur->type == SCALAR_VALUE) { 6833 if (rcur->type == SCALAR_VALUE) {
6834 if (!rold->precise && !rcur->precise)
6835 return true;
6053 /* new val must satisfy old val knowledge */ 6836 /* new val must satisfy old val knowledge */
6054 return range_within(rold, rcur) && 6837 return range_within(rold, rcur) &&
6055 tnum_in(rold->var_off, rcur->var_off); 6838 tnum_in(rold->var_off, rcur->var_off);
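The early return added to the SCALAR_VALUE case above is the pruning payoff of precision tracking; in effect the scalar comparison in regsafe() now behaves like this restatement:

/* Restated from the hunk above, not new code: imprecise scalars always
 * match, precise ones still require the old range/tnum containment.
 */
static bool scalars_regsafe(struct bpf_reg_state *rold,
			    struct bpf_reg_state *rcur)
{
	if (!rold->precise && !rcur->precise)
		return true;
	/* new val must satisfy old val knowledge */
	return range_within(rold, rcur) &&
	       tnum_in(rold->var_off, rcur->var_off);
}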
@@ -6122,6 +6905,7 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
6122 case PTR_TO_SOCK_COMMON_OR_NULL: 6905 case PTR_TO_SOCK_COMMON_OR_NULL:
6123 case PTR_TO_TCP_SOCK: 6906 case PTR_TO_TCP_SOCK:
6124 case PTR_TO_TCP_SOCK_OR_NULL: 6907 case PTR_TO_TCP_SOCK_OR_NULL:
6908 case PTR_TO_XDP_SOCK:
6125 /* Only valid matches are exact, which memcmp() above 6909 /* Only valid matches are exact, which memcmp() above
6126 * would have accepted 6910 * would have accepted
6127 */ 6911 */
@@ -6292,20 +7076,33 @@ static bool states_equal(struct bpf_verifier_env *env,
6292 return true; 7076 return true;
6293} 7077}
6294 7078
7079/* Return 0 if no propagation happened. Return negative error code if error
7080 * happened. Otherwise, return the propagated bit.
7081 */
6295static int propagate_liveness_reg(struct bpf_verifier_env *env, 7082static int propagate_liveness_reg(struct bpf_verifier_env *env,
6296 struct bpf_reg_state *reg, 7083 struct bpf_reg_state *reg,
6297 struct bpf_reg_state *parent_reg) 7084 struct bpf_reg_state *parent_reg)
6298{ 7085{
7086 u8 parent_flag = parent_reg->live & REG_LIVE_READ;
7087 u8 flag = reg->live & REG_LIVE_READ;
6299 int err; 7088 int err;
6300 7089
6301 if (parent_reg->live & REG_LIVE_READ || !(reg->live & REG_LIVE_READ)) 7090 /* When comes here, read flags of PARENT_REG or REG could be any of
7091 * REG_LIVE_READ64, REG_LIVE_READ32, REG_LIVE_NONE. There is no need
7092 * of propagation if PARENT_REG has strongest REG_LIVE_READ64.
7093 */
7094 if (parent_flag == REG_LIVE_READ64 ||
7095 /* Or if there is no read flag from REG. */
7096 !flag ||
7097 /* Or if the read flag from REG is the same as PARENT_REG. */
7098 parent_flag == flag)
6302 return 0; 7099 return 0;
6303 7100
6304 err = mark_reg_read(env, reg, parent_reg); 7101 err = mark_reg_read(env, reg, parent_reg, flag);
6305 if (err) 7102 if (err)
6306 return err; 7103 return err;
6307 7104
6308 return 0; 7105 return flag;
6309} 7106}
6310 7107
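The flag comparisons above rely on the liveness read bits being ordered, with READ64 subsuming READ32. The layout assumed here is sketched below; the exact values are an assumption, the authoritative definition being the bpf_verifier.h change in this same series.

enum bpf_reg_liveness {
	REG_LIVE_NONE = 0,	/* reg hasn't been read or written this branch */
	REG_LIVE_READ32 = 0x1,	/* reg was read as a 32-bit sub-register */
	REG_LIVE_READ64 = 0x2,	/* reg was read as a full 64-bit register */
	REG_LIVE_READ = REG_LIVE_READ32 | REG_LIVE_READ64,
	REG_LIVE_WRITTEN = 0x4,	/* reg was written first, screening off later reads */
	REG_LIVE_DONE = 0x8,	/* liveness won't be updating this register anymore */
};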
6311/* A write screens off any subsequent reads; but write marks come from the 7108/* A write screens off any subsequent reads; but write marks come from the
@@ -6339,8 +7136,10 @@ static int propagate_liveness(struct bpf_verifier_env *env,
6339 for (i = frame < vstate->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) { 7136 for (i = frame < vstate->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) {
6340 err = propagate_liveness_reg(env, &state_reg[i], 7137 err = propagate_liveness_reg(env, &state_reg[i],
6341 &parent_reg[i]); 7138 &parent_reg[i]);
6342 if (err) 7139 if (err < 0)
6343 return err; 7140 return err;
7141 if (err == REG_LIVE_READ64)
7142 mark_insn_zext(env, &parent_reg[i]);
6344 } 7143 }
6345 7144
6346 /* Propagate stack slots. */ 7145 /* Propagate stack slots. */
@@ -6350,32 +7149,132 @@ static int propagate_liveness(struct bpf_verifier_env *env,
6350 state_reg = &state->stack[i].spilled_ptr; 7149 state_reg = &state->stack[i].spilled_ptr;
6351 err = propagate_liveness_reg(env, state_reg, 7150 err = propagate_liveness_reg(env, state_reg,
6352 parent_reg); 7151 parent_reg);
6353 if (err) 7152 if (err < 0)
6354 return err; 7153 return err;
6355 } 7154 }
6356 } 7155 }
6357 return err; 7156 return 0;
6358} 7157}
6359 7158
7159/* find precise scalars in the previous equivalent state and
7160 * propagate them into the current state
7161 */
7162static int propagate_precision(struct bpf_verifier_env *env,
7163 const struct bpf_verifier_state *old)
7164{
7165 struct bpf_reg_state *state_reg;
7166 struct bpf_func_state *state;
7167 int i, err = 0;
7168
7169 state = old->frame[old->curframe];
7170 state_reg = state->regs;
7171 for (i = 0; i < BPF_REG_FP; i++, state_reg++) {
7172 if (state_reg->type != SCALAR_VALUE ||
7173 !state_reg->precise)
7174 continue;
7175 if (env->log.level & BPF_LOG_LEVEL2)
7176 verbose(env, "propagating r%d\n", i);
7177 err = mark_chain_precision(env, i);
7178 if (err < 0)
7179 return err;
7180 }
7181
7182 for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
7183 if (state->stack[i].slot_type[0] != STACK_SPILL)
7184 continue;
7185 state_reg = &state->stack[i].spilled_ptr;
7186 if (state_reg->type != SCALAR_VALUE ||
7187 !state_reg->precise)
7188 continue;
7189 if (env->log.level & BPF_LOG_LEVEL2)
7190 verbose(env, "propagating fp%d\n",
7191 (-i - 1) * BPF_REG_SIZE);
7192 err = mark_chain_precision_stack(env, i);
7193 if (err < 0)
7194 return err;
7195 }
7196 return 0;
7197}
7198
7199static bool states_maybe_looping(struct bpf_verifier_state *old,
7200 struct bpf_verifier_state *cur)
7201{
7202 struct bpf_func_state *fold, *fcur;
7203 int i, fr = cur->curframe;
7204
7205 if (old->curframe != fr)
7206 return false;
7207
7208 fold = old->frame[fr];
7209 fcur = cur->frame[fr];
7210 for (i = 0; i < MAX_BPF_REG; i++)
7211 if (memcmp(&fold->regs[i], &fcur->regs[i],
7212 offsetof(struct bpf_reg_state, parent)))
7213 return false;
7214 return true;
7215}
7216
7217
6360static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) 7218static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
6361{ 7219{
6362 struct bpf_verifier_state_list *new_sl; 7220 struct bpf_verifier_state_list *new_sl;
6363 struct bpf_verifier_state_list *sl, **pprev; 7221 struct bpf_verifier_state_list *sl, **pprev;
6364 struct bpf_verifier_state *cur = env->cur_state, *new; 7222 struct bpf_verifier_state *cur = env->cur_state, *new;
6365 int i, j, err, states_cnt = 0; 7223 int i, j, err, states_cnt = 0;
7224 bool add_new_state = false;
6366 7225
6367 pprev = &env->explored_states[insn_idx]; 7226 cur->last_insn_idx = env->prev_insn_idx;
6368 sl = *pprev; 7227 if (!env->insn_aux_data[insn_idx].prune_point)
6369
6370 if (!sl)
6371 /* this 'insn_idx' instruction wasn't marked, so we will not 7228 /* this 'insn_idx' instruction wasn't marked, so we will not
6372 * be doing state search here 7229 * be doing state search here
6373 */ 7230 */
6374 return 0; 7231 return 0;
6375 7232
7233	/* bpf progs typically have a pruning point every 4 instructions
7234	 * http://vger.kernel.org/bpfconf2019.html#session-1
7235	 * Do not add a new state for future pruning if the verifier hasn't seen
7236	 * at least 2 jumps and at least 8 instructions.
7237	 * This heuristic helps decrease the 'total_states' and 'peak_states' metrics.
7238	 * In tests that amounts to up to a 50% reduction in total verifier
7239	 * memory consumption and a 20% verifier time speedup.
7240 */
7241 if (env->jmps_processed - env->prev_jmps_processed >= 2 &&
7242 env->insn_processed - env->prev_insn_processed >= 8)
7243 add_new_state = true;
7244
7245 pprev = explored_state(env, insn_idx);
7246 sl = *pprev;
7247
6376 clean_live_states(env, insn_idx, cur); 7248 clean_live_states(env, insn_idx, cur);
6377 7249
6378 while (sl != STATE_LIST_MARK) { 7250 while (sl) {
7251 states_cnt++;
7252 if (sl->state.insn_idx != insn_idx)
7253 goto next;
7254 if (sl->state.branches) {
7255 if (states_maybe_looping(&sl->state, cur) &&
7256 states_equal(env, &sl->state, cur)) {
7257 verbose_linfo(env, insn_idx, "; ");
7258 verbose(env, "infinite loop detected at insn %d\n", insn_idx);
7259 return -EINVAL;
7260 }
7261 /* if the verifier is processing a loop, avoid adding new state
7262 * too often, since different loop iterations have distinct
7263 * states and may not help future pruning.
7264 * This threshold shouldn't be too low to make sure that
7265 * a loop with large bound will be rejected quickly.
7266 * The most abusive loop will be:
7267 * r1 += 1
7268 * if r1 < 1000000 goto pc-2
7269			 * 1M insn_processed limit / 100 == 10k peak states.
7270 * This threshold shouldn't be too high either, since states
7271 * at the end of the loop are likely to be useful in pruning.
7272 */
7273 if (env->jmps_processed - env->prev_jmps_processed < 20 &&
7274 env->insn_processed - env->prev_insn_processed < 100)
7275 add_new_state = false;
7276 goto miss;
7277 }
6379 if (states_equal(env, &sl->state, cur)) { 7278 if (states_equal(env, &sl->state, cur)) {
6380 sl->hit_cnt++; 7279 sl->hit_cnt++;
6381 /* reached equivalent register/stack state, 7280 /* reached equivalent register/stack state,
@@ -6389,12 +7288,27 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
6389 * this state and will pop a new one. 7288 * this state and will pop a new one.
6390 */ 7289 */
6391 err = propagate_liveness(env, &sl->state, cur); 7290 err = propagate_liveness(env, &sl->state, cur);
7291
7292			/* if the previous state reached the exit with precision and the
7293			 * current state is equivalent to it (except for precision marks),
7294			 * the precision needs to be propagated back into
7295			 * the current state.
7296 */
7297 err = err ? : push_jmp_history(env, cur);
7298 err = err ? : propagate_precision(env, &sl->state);
6392 if (err) 7299 if (err)
6393 return err; 7300 return err;
6394 return 1; 7301 return 1;
6395 } 7302 }
6396 states_cnt++; 7303miss:
6397 sl->miss_cnt++; 7304 /* when new state is not going to be added do not increase miss count.
7305 * Otherwise several loop iterations will remove the state
7306 * recorded earlier. The goal of these heuristics is to have
7307 * states from some iterations of the loop (some in the beginning
7308 * and some at the end) to help pruning.
7309 */
7310 if (add_new_state)
7311 sl->miss_cnt++;
6398 /* heuristic to determine whether this state is beneficial 7312 /* heuristic to determine whether this state is beneficial
6399 * to keep checking from state equivalence point of view. 7313 * to keep checking from state equivalence point of view.
6400 * Higher numbers increase max_states_per_insn and verification time, 7314 * Higher numbers increase max_states_per_insn and verification time,
@@ -6406,6 +7320,11 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
6406 */ 7320 */
6407 *pprev = sl->next; 7321 *pprev = sl->next;
6408 if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE) { 7322 if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE) {
7323 u32 br = sl->state.branches;
7324
7325 WARN_ONCE(br,
7326 "BUG live_done but branches_to_explore %d\n",
7327 br);
6409 free_verifier_state(&sl->state, false); 7328 free_verifier_state(&sl->state, false);
6410 kfree(sl); 7329 kfree(sl);
6411 env->peak_states--; 7330 env->peak_states--;
@@ -6420,6 +7339,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
6420 sl = *pprev; 7339 sl = *pprev;
6421 continue; 7340 continue;
6422 } 7341 }
7342next:
6423 pprev = &sl->next; 7343 pprev = &sl->next;
6424 sl = *pprev; 7344 sl = *pprev;
6425 } 7345 }
@@ -6428,20 +7348,27 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
6428 env->max_states_per_insn = states_cnt; 7348 env->max_states_per_insn = states_cnt;
6429 7349
6430 if (!env->allow_ptr_leaks && states_cnt > BPF_COMPLEXITY_LIMIT_STATES) 7350 if (!env->allow_ptr_leaks && states_cnt > BPF_COMPLEXITY_LIMIT_STATES)
6431 return 0; 7351 return push_jmp_history(env, cur);
6432 7352
6433 /* there were no equivalent states, remember current one. 7353 if (!add_new_state)
6434 * technically the current state is not proven to be safe yet, 7354 return push_jmp_history(env, cur);
7355
7356 /* There were no equivalent states, remember the current one.
7357 * Technically the current state is not proven to be safe yet,
6435 * but it will either reach outer most bpf_exit (which means it's safe) 7358 * but it will either reach outer most bpf_exit (which means it's safe)
6436 * or it will be rejected. Since there are no loops, we won't be 7359 * or it will be rejected. When there are no loops the verifier won't be
6437 * seeing this tuple (frame[0].callsite, frame[1].callsite, .. insn_idx) 7360 * seeing this tuple (frame[0].callsite, frame[1].callsite, .. insn_idx)
6438 * again on the way to bpf_exit 7361 * again on the way to bpf_exit.
7362 * When looping the sl->state.branches will be > 0 and this state
7363 * will not be considered for equivalence until branches == 0.
6439 */ 7364 */
6440 new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL); 7365 new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL);
6441 if (!new_sl) 7366 if (!new_sl)
6442 return -ENOMEM; 7367 return -ENOMEM;
6443 env->total_states++; 7368 env->total_states++;
6444 env->peak_states++; 7369 env->peak_states++;
7370 env->prev_jmps_processed = env->jmps_processed;
7371 env->prev_insn_processed = env->insn_processed;
6445 7372
6446 /* add new state to the head of linked list */ 7373 /* add new state to the head of linked list */
6447 new = &new_sl->state; 7374 new = &new_sl->state;
@@ -6451,8 +7378,15 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
6451 kfree(new_sl); 7378 kfree(new_sl);
6452 return err; 7379 return err;
6453 } 7380 }
6454 new_sl->next = env->explored_states[insn_idx]; 7381 new->insn_idx = insn_idx;
6455 env->explored_states[insn_idx] = new_sl; 7382 WARN_ONCE(new->branches != 1,
7383 "BUG is_state_visited:branches_to_explore=%d insn %d\n", new->branches, insn_idx);
7384
7385 cur->parent = new;
7386 cur->first_insn_idx = insn_idx;
7387 clear_jmp_history(cur);
7388 new_sl->next = *explored_state(env, insn_idx);
7389 *explored_state(env, insn_idx) = new_sl;
6456 /* connect new state to parentage chain. Current frame needs all 7390 /* connect new state to parentage chain. Current frame needs all
6457 * registers connected. Only r6 - r9 of the callers are alive (pushed 7391 * registers connected. Only r6 - r9 of the callers are alive (pushed
6458 * to the stack implicitly by JITs) so in callers' frames connect just 7392 * to the stack implicitly by JITs) so in callers' frames connect just
@@ -6460,17 +7394,18 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
6460 * the state of the call instruction (with WRITTEN set), and r0 comes 7394 * the state of the call instruction (with WRITTEN set), and r0 comes
6461 * from callee with its full parentage chain, anyway. 7395 * from callee with its full parentage chain, anyway.
6462 */ 7396 */
6463 for (j = 0; j <= cur->curframe; j++)
6464 for (i = j < cur->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++)
6465 cur->frame[j]->regs[i].parent = &new->frame[j]->regs[i];
6466 /* clear write marks in current state: the writes we did are not writes 7397 /* clear write marks in current state: the writes we did are not writes
6467 * our child did, so they don't screen off its reads from us. 7398 * our child did, so they don't screen off its reads from us.
6468 * (There are no read marks in current state, because reads always mark 7399 * (There are no read marks in current state, because reads always mark
6469 * their parent and current state never has children yet. Only 7400 * their parent and current state never has children yet. Only
6470 * explored_states can get read marks.) 7401 * explored_states can get read marks.)
6471 */ 7402 */
6472 for (i = 0; i < BPF_REG_FP; i++) 7403 for (j = 0; j <= cur->curframe; j++) {
6473 cur->frame[cur->curframe]->regs[i].live = REG_LIVE_NONE; 7404 for (i = j < cur->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++)
7405 cur->frame[j]->regs[i].parent = &new->frame[j]->regs[i];
7406 for (i = 0; i < BPF_REG_FP; i++)
7407 cur->frame[j]->regs[i].live = REG_LIVE_NONE;
7408 }
6474 7409
6475 /* all stack frames are accessible from callee, clear them all */ 7410 /* all stack frames are accessible from callee, clear them all */
6476 for (j = 0; j <= cur->curframe; j++) { 7411 for (j = 0; j <= cur->curframe; j++) {
@@ -6497,6 +7432,7 @@ static bool reg_type_mismatch_ok(enum bpf_reg_type type)
6497 case PTR_TO_SOCK_COMMON_OR_NULL: 7432 case PTR_TO_SOCK_COMMON_OR_NULL:
6498 case PTR_TO_TCP_SOCK: 7433 case PTR_TO_TCP_SOCK:
6499 case PTR_TO_TCP_SOCK_OR_NULL: 7434 case PTR_TO_TCP_SOCK_OR_NULL:
7435 case PTR_TO_XDP_SOCK:
6500 return false; 7436 return false;
6501 default: 7437 default:
6502 return true; 7438 return true;
@@ -6528,6 +7464,7 @@ static int do_check(struct bpf_verifier_env *env)
6528 struct bpf_reg_state *regs; 7464 struct bpf_reg_state *regs;
6529 int insn_cnt = env->prog->len; 7465 int insn_cnt = env->prog->len;
6530 bool do_print_state = false; 7466 bool do_print_state = false;
7467 int prev_insn_idx = -1;
6531 7468
6532 env->prev_linfo = NULL; 7469 env->prev_linfo = NULL;
6533 7470
@@ -6536,6 +7473,7 @@ static int do_check(struct bpf_verifier_env *env)
6536 return -ENOMEM; 7473 return -ENOMEM;
6537 state->curframe = 0; 7474 state->curframe = 0;
6538 state->speculative = false; 7475 state->speculative = false;
7476 state->branches = 1;
6539 state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL); 7477 state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL);
6540 if (!state->frame[0]) { 7478 if (!state->frame[0]) {
6541 kfree(state); 7479 kfree(state);
@@ -6552,6 +7490,7 @@ static int do_check(struct bpf_verifier_env *env)
6552 u8 class; 7490 u8 class;
6553 int err; 7491 int err;
6554 7492
7493 env->prev_insn_idx = prev_insn_idx;
6555 if (env->insn_idx >= insn_cnt) { 7494 if (env->insn_idx >= insn_cnt) {
6556 verbose(env, "invalid insn idx %d insn_cnt %d\n", 7495 verbose(env, "invalid insn idx %d insn_cnt %d\n",
6557 env->insn_idx, insn_cnt); 7496 env->insn_idx, insn_cnt);
@@ -6624,6 +7563,7 @@ static int do_check(struct bpf_verifier_env *env)
6624 7563
6625 regs = cur_regs(env); 7564 regs = cur_regs(env);
6626 env->insn_aux_data[env->insn_idx].seen = true; 7565 env->insn_aux_data[env->insn_idx].seen = true;
7566 prev_insn_idx = env->insn_idx;
6627 7567
6628 if (class == BPF_ALU || class == BPF_ALU64) { 7568 if (class == BPF_ALU || class == BPF_ALU64) {
6629 err = check_alu_op(env, insn); 7569 err = check_alu_op(env, insn);
@@ -6742,6 +7682,7 @@ static int do_check(struct bpf_verifier_env *env)
6742 } else if (class == BPF_JMP || class == BPF_JMP32) { 7682 } else if (class == BPF_JMP || class == BPF_JMP32) {
6743 u8 opcode = BPF_OP(insn->code); 7683 u8 opcode = BPF_OP(insn->code);
6744 7684
7685 env->jmps_processed++;
6745 if (opcode == BPF_CALL) { 7686 if (opcode == BPF_CALL) {
6746 if (BPF_SRC(insn->code) != BPF_K || 7687 if (BPF_SRC(insn->code) != BPF_K ||
6747 insn->off != 0 || 7688 insn->off != 0 ||
@@ -6796,7 +7737,6 @@ static int do_check(struct bpf_verifier_env *env)
6796 7737
6797 if (state->curframe) { 7738 if (state->curframe) {
6798 /* exit from nested function */ 7739 /* exit from nested function */
6799 env->prev_insn_idx = env->insn_idx;
6800 err = prepare_func_exit(env, &env->insn_idx); 7740 err = prepare_func_exit(env, &env->insn_idx);
6801 if (err) 7741 if (err)
6802 return err; 7742 return err;
@@ -6827,7 +7767,8 @@ static int do_check(struct bpf_verifier_env *env)
6827 if (err) 7767 if (err)
6828 return err; 7768 return err;
6829process_bpf_exit: 7769process_bpf_exit:
6830 err = pop_stack(env, &env->prev_insn_idx, 7770 update_branch_counts(env, env->cur_state);
7771 err = pop_stack(env, &prev_insn_idx,
6831 &env->insn_idx); 7772 &env->insn_idx);
6832 if (err < 0) { 7773 if (err < 0) {
6833 if (err != -ENOENT) 7774 if (err != -ENOENT)
@@ -7130,14 +8071,23 @@ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env)
7130 * insni[off, off + cnt). Adjust corresponding insn_aux_data by copying 8071 * insni[off, off + cnt). Adjust corresponding insn_aux_data by copying
7131 * [0, off) and [off, end) to new locations, so the patched range stays zero 8072 * [0, off) and [off, end) to new locations, so the patched range stays zero
7132 */ 8073 */
7133static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len, 8074static int adjust_insn_aux_data(struct bpf_verifier_env *env,
7134 u32 off, u32 cnt) 8075 struct bpf_prog *new_prog, u32 off, u32 cnt)
7135{ 8076{
7136 struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data; 8077 struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data;
8078 struct bpf_insn *insn = new_prog->insnsi;
8079 u32 prog_len;
7137 int i; 8080 int i;
7138 8081
8082	/* aux info at OFF always needs adjustment, whether or not the fast
8083	 * path (cnt == 1) is taken. There is no guarantee that INSN at OFF is
8084	 * the original insn of the old prog.
8085 */
8086 old_data[off].zext_dst = insn_has_def32(env, insn + off + cnt - 1);
8087
7139 if (cnt == 1) 8088 if (cnt == 1)
7140 return 0; 8089 return 0;
8090 prog_len = new_prog->len;
7141 new_data = vzalloc(array_size(prog_len, 8091 new_data = vzalloc(array_size(prog_len,
7142 sizeof(struct bpf_insn_aux_data))); 8092 sizeof(struct bpf_insn_aux_data)));
7143 if (!new_data) 8093 if (!new_data)
@@ -7145,8 +8095,10 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len,
7145 memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off); 8095 memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off);
7146 memcpy(new_data + off + cnt - 1, old_data + off, 8096 memcpy(new_data + off + cnt - 1, old_data + off,
7147 sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1)); 8097 sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1));
7148 for (i = off; i < off + cnt - 1; i++) 8098 for (i = off; i < off + cnt - 1; i++) {
7149 new_data[i].seen = true; 8099 new_data[i].seen = true;
8100 new_data[i].zext_dst = insn_has_def32(env, insn + i);
8101 }
7150 env->insn_aux_data = new_data; 8102 env->insn_aux_data = new_data;
7151 vfree(old_data); 8103 vfree(old_data);
7152 return 0; 8104 return 0;
@@ -7179,7 +8131,7 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
7179 env->insn_aux_data[off].orig_idx); 8131 env->insn_aux_data[off].orig_idx);
7180 return NULL; 8132 return NULL;
7181 } 8133 }
7182 if (adjust_insn_aux_data(env, new_prog->len, off, len)) 8134 if (adjust_insn_aux_data(env, new_prog, off, len))
7183 return NULL; 8135 return NULL;
7184 adjust_subprog_starts(env, off, len); 8136 adjust_subprog_starts(env, off, len);
7185 return new_prog; 8137 return new_prog;
@@ -7443,6 +8395,84 @@ static int opt_remove_nops(struct bpf_verifier_env *env)
7443 return 0; 8395 return 0;
7444} 8396}
7445 8397
8398static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env,
8399 const union bpf_attr *attr)
8400{
8401 struct bpf_insn *patch, zext_patch[2], rnd_hi32_patch[4];
8402 struct bpf_insn_aux_data *aux = env->insn_aux_data;
8403 int i, patch_len, delta = 0, len = env->prog->len;
8404 struct bpf_insn *insns = env->prog->insnsi;
8405 struct bpf_prog *new_prog;
8406 bool rnd_hi32;
8407
8408 rnd_hi32 = attr->prog_flags & BPF_F_TEST_RND_HI32;
8409 zext_patch[1] = BPF_ZEXT_REG(0);
8410 rnd_hi32_patch[1] = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, 0);
8411 rnd_hi32_patch[2] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_AX, 32);
8412 rnd_hi32_patch[3] = BPF_ALU64_REG(BPF_OR, 0, BPF_REG_AX);
8413 for (i = 0; i < len; i++) {
8414 int adj_idx = i + delta;
8415 struct bpf_insn insn;
8416
8417 insn = insns[adj_idx];
8418 if (!aux[adj_idx].zext_dst) {
8419 u8 code, class;
8420 u32 imm_rnd;
8421
8422 if (!rnd_hi32)
8423 continue;
8424
8425 code = insn.code;
8426 class = BPF_CLASS(code);
8427 if (insn_no_def(&insn))
8428 continue;
8429
8430 /* NOTE: arg "reg" (the fourth one) is only used for
8431			 * BPF_STX, which has been ruled out by the check
8432			 * above, so it is safe to pass NULL here.
8433 */
8434 if (is_reg64(env, &insn, insn.dst_reg, NULL, DST_OP)) {
8435 if (class == BPF_LD &&
8436 BPF_MODE(code) == BPF_IMM)
8437 i++;
8438 continue;
8439 }
8440
8441 /* ctx load could be transformed into wider load. */
8442 if (class == BPF_LDX &&
8443 aux[adj_idx].ptr_type == PTR_TO_CTX)
8444 continue;
8445
8446 imm_rnd = get_random_int();
8447 rnd_hi32_patch[0] = insn;
8448 rnd_hi32_patch[1].imm = imm_rnd;
8449 rnd_hi32_patch[3].dst_reg = insn.dst_reg;
8450 patch = rnd_hi32_patch;
8451 patch_len = 4;
8452 goto apply_patch_buffer;
8453 }
8454
8455 if (!bpf_jit_needs_zext())
8456 continue;
8457
8458 zext_patch[0] = insn;
8459 zext_patch[1].dst_reg = insn.dst_reg;
8460 zext_patch[1].src_reg = insn.dst_reg;
8461 patch = zext_patch;
8462 patch_len = 2;
8463apply_patch_buffer:
8464 new_prog = bpf_patch_insn_data(env, adj_idx, patch, patch_len);
8465 if (!new_prog)
8466 return -ENOMEM;
8467 env->prog = new_prog;
8468 insns = new_prog->insnsi;
8469 aux = env->insn_aux_data;
8470 delta += patch_len - 1;
8471 }
8472
8473 return 0;
8474}
8475
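A hedged user-space sketch of opting into the hi32-poisoning test mode that opt_subreg_zext_lo32_rnd_hi32() implements above. Field usage follows the bpf(2) uapi this series extends, error handling is omitted, and the raw syscall is used only to keep the example self-contained.

#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

static int load_prog_rnd_hi32(const struct bpf_insn *insns, __u32 insn_cnt)
{
	union bpf_attr attr = {};

	attr.prog_type	= BPF_PROG_TYPE_SOCKET_FILTER;
	attr.insns	= (__u64)(unsigned long)insns;
	attr.insn_cnt	= insn_cnt;
	attr.license	= (__u64)(unsigned long)"GPL";
	/* Ask the verifier to randomize the high 32 bits of every register it
	 * believes is only ever read as a 32-bit sub-register.
	 */
	attr.prog_flags	= BPF_F_TEST_RND_HI32;

	return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
}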
7446/* convert load instructions that access fields of a context type into a 8476/* convert load instructions that access fields of a context type into a
7447 * sequence of instructions that access fields of the underlying structure: 8477 * sequence of instructions that access fields of the underlying structure:
7448 * struct __sk_buff -> struct sk_buff 8478 * struct __sk_buff -> struct sk_buff
@@ -7541,6 +8571,9 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
7541 case PTR_TO_TCP_SOCK: 8571 case PTR_TO_TCP_SOCK:
7542 convert_ctx_access = bpf_tcp_sock_convert_ctx_access; 8572 convert_ctx_access = bpf_tcp_sock_convert_ctx_access;
7543 break; 8573 break;
8574 case PTR_TO_XDP_SOCK:
8575 convert_ctx_access = bpf_xdp_sock_convert_ctx_access;
8576 break;
7544 default: 8577 default:
7545 continue; 8578 continue;
7546 } 8579 }
@@ -8130,16 +9163,15 @@ static void free_states(struct bpf_verifier_env *env)
8130 if (!env->explored_states) 9163 if (!env->explored_states)
8131 return; 9164 return;
8132 9165
8133 for (i = 0; i < env->prog->len; i++) { 9166 for (i = 0; i < state_htab_size(env); i++) {
8134 sl = env->explored_states[i]; 9167 sl = env->explored_states[i];
8135 9168
8136 if (sl) 9169 while (sl) {
8137 while (sl != STATE_LIST_MARK) { 9170 sln = sl->next;
8138 sln = sl->next; 9171 free_verifier_state(&sl->state, false);
8139 free_verifier_state(&sl->state, false); 9172 kfree(sl);
8140 kfree(sl); 9173 sl = sln;
8141 sl = sln; 9174 }
8142 }
8143 } 9175 }
8144 9176
8145 kvfree(env->explored_states); 9177 kvfree(env->explored_states);
@@ -8239,7 +9271,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
8239 goto skip_full_check; 9271 goto skip_full_check;
8240 } 9272 }
8241 9273
8242 env->explored_states = kvcalloc(env->prog->len, 9274 env->explored_states = kvcalloc(state_htab_size(env),
8243 sizeof(struct bpf_verifier_state_list *), 9275 sizeof(struct bpf_verifier_state_list *),
8244 GFP_USER); 9276 GFP_USER);
8245 ret = -ENOMEM; 9277 ret = -ENOMEM;
@@ -8294,6 +9326,15 @@ skip_full_check:
8294 if (ret == 0) 9326 if (ret == 0)
8295 ret = fixup_bpf_calls(env); 9327 ret = fixup_bpf_calls(env);
8296 9328
9329	/* do the 32-bit optimization after insn patching is done, so that the
9330	 * patched insns can be handled correctly.
9331 */
9332 if (ret == 0 && !bpf_prog_is_dev_bound(env->prog->aux)) {
9333 ret = opt_subreg_zext_lo32_rnd_hi32(env, attr);
9334 env->prog->aux->verifier_zext = bpf_jit_needs_zext() ? !ret
9335 : false;
9336 }
9337
8297 if (ret == 0) 9338 if (ret == 0)
8298 ret = fixup_call_args(env); 9339 ret = fixup_call_args(env);
8299 9340
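update_branch_counts(), called from process_bpf_exit above, is introduced earlier in this patch; its role, roughly, is sketched below. This is inferred from the branches/parent fields used in is_state_visited() and is not the exact hunk.

static void update_branch_counts(struct bpf_verifier_env *env,
				 struct bpf_verifier_state *st)
{
	while (st) {
		u32 br = --st->branches;

		/* A state with outstanding branches is still being explored
		 * and must not be used for pruning; only when the count hits
		 * zero does the drop propagate to the parent state.
		 */
		if (br)
			break;
		st = st->parent;
	}
}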
diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c
index 686d244e798d..9bb96ace9fa1 100644
--- a/kernel/bpf/xskmap.c
+++ b/kernel/bpf/xskmap.c
@@ -17,8 +17,8 @@ struct xsk_map {
17 17
18static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) 18static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
19{ 19{
20 int cpu, err = -EINVAL;
21 struct xsk_map *m; 20 struct xsk_map *m;
21 int cpu, err;
22 u64 cost; 22 u64 cost;
23 23
24 if (!capable(CAP_NET_ADMIN)) 24 if (!capable(CAP_NET_ADMIN))
@@ -37,13 +37,9 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
37 37
38 cost = (u64)m->map.max_entries * sizeof(struct xdp_sock *); 38 cost = (u64)m->map.max_entries * sizeof(struct xdp_sock *);
39 cost += sizeof(struct list_head) * num_possible_cpus(); 39 cost += sizeof(struct list_head) * num_possible_cpus();
40 if (cost >= U32_MAX - PAGE_SIZE)
41 goto free_m;
42
43 m->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
44 40
45	/* Notice that this returns -EPERM if the map size is larger than the memlock limit */ 41	/* Notice that this returns -EPERM if the map size is larger than the memlock limit */
46 err = bpf_map_precharge_memlock(m->map.pages); 42 err = bpf_map_charge_init(&m->map.memory, cost);
47 if (err) 43 if (err)
48 goto free_m; 44 goto free_m;
49 45
@@ -51,7 +47,7 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
51 47
52 m->flush_list = alloc_percpu(struct list_head); 48 m->flush_list = alloc_percpu(struct list_head);
53 if (!m->flush_list) 49 if (!m->flush_list)
54 goto free_m; 50 goto free_charge;
55 51
56 for_each_possible_cpu(cpu) 52 for_each_possible_cpu(cpu)
57 INIT_LIST_HEAD(per_cpu_ptr(m->flush_list, cpu)); 53 INIT_LIST_HEAD(per_cpu_ptr(m->flush_list, cpu));
@@ -65,6 +61,8 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
65 61
66free_percpu: 62free_percpu:
67 free_percpu(m->flush_list); 63 free_percpu(m->flush_list);
64free_charge:
65 bpf_map_charge_finish(&m->map.memory);
68free_m: 66free_m:
69 kfree(m); 67 kfree(m);
70 return ERR_PTR(err); 68 return ERR_PTR(err);
@@ -147,13 +145,18 @@ void __xsk_map_flush(struct bpf_map *map)
147 145
148 list_for_each_entry_safe(xs, tmp, flush_list, flush_node) { 146 list_for_each_entry_safe(xs, tmp, flush_list, flush_node) {
149 xsk_flush(xs); 147 xsk_flush(xs);
150 __list_del(xs->flush_node.prev, xs->flush_node.next); 148 __list_del_clearprev(&xs->flush_node);
151 xs->flush_node.prev = NULL;
152 } 149 }
153} 150}
154 151
155static void *xsk_map_lookup_elem(struct bpf_map *map, void *key) 152static void *xsk_map_lookup_elem(struct bpf_map *map, void *key)
156{ 153{
154 WARN_ON_ONCE(!rcu_read_lock_held());
155 return __xsk_map_lookup_elem(map, *(u32 *)key);
156}
157
158static void *xsk_map_lookup_elem_sys_only(struct bpf_map *map, void *key)
159{
157 return ERR_PTR(-EOPNOTSUPP); 160 return ERR_PTR(-EOPNOTSUPP);
158} 161}
159 162
@@ -220,6 +223,7 @@ const struct bpf_map_ops xsk_map_ops = {
220 .map_free = xsk_map_free, 223 .map_free = xsk_map_free,
221 .map_get_next_key = xsk_map_get_next_key, 224 .map_get_next_key = xsk_map_get_next_key,
222 .map_lookup_elem = xsk_map_lookup_elem, 225 .map_lookup_elem = xsk_map_lookup_elem,
226 .map_lookup_elem_sys_only = xsk_map_lookup_elem_sys_only,
223 .map_update_elem = xsk_map_update_elem, 227 .map_update_elem = xsk_map_update_elem,
224 .map_delete_elem = xsk_map_delete_elem, 228 .map_delete_elem = xsk_map_delete_elem,
225 .map_check_btf = map_check_no_btf, 229 .map_check_btf = map_check_no_btf,
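
The xskmap conversion above follows the new bpf_map_charge_init()/bpf_map_charge_finish() error-unwind pattern. A condensed, hedged skeleton of that ordering (only the two charge helpers are taken from this diff; the surrounding allocation code is illustrative):

static struct bpf_map *example_map_alloc(union bpf_attr *attr)
{
	struct xsk_map *m;
	u64 cost;
	int err;

	m = kzalloc(sizeof(*m), GFP_USER);
	if (!m)
		return ERR_PTR(-ENOMEM);

	cost = (u64)attr->max_entries * sizeof(struct xdp_sock *);

	/* charge the memlock footprint up front; no manual U32_MAX check */
	err = bpf_map_charge_init(&m->map.memory, cost);
	if (err)
		goto free_m;

	m->flush_list = alloc_percpu(struct list_head);
	if (!m->flush_list) {
		err = -ENOMEM;
		goto free_charge;	/* undo the charge on any later failure */
	}

	return &m->map;

free_charge:
	bpf_map_charge_finish(&m->map.memory);
free_m:
	kfree(m);
	return ERR_PTR(err);
}
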
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 68ca5de7ec27..88006be40ea3 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1#include "cgroup-internal.h" 2#include "cgroup-internal.h"
2 3
3#include <linux/ctype.h> 4#include <linux/ctype.h>
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 327f37c9fdfa..300b0c416341 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -101,7 +101,7 @@ static DEFINE_SPINLOCK(cgroup_idr_lock);
101 */ 101 */
102static DEFINE_SPINLOCK(cgroup_file_kn_lock); 102static DEFINE_SPINLOCK(cgroup_file_kn_lock);
103 103
104struct percpu_rw_semaphore cgroup_threadgroup_rwsem; 104DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem);
105 105
106#define cgroup_assert_mutex_or_rcu_locked() \ 106#define cgroup_assert_mutex_or_rcu_locked() \
107 RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ 107 RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
@@ -215,7 +215,8 @@ static struct cftype cgroup_base_files[];
215 215
216static int cgroup_apply_control(struct cgroup *cgrp); 216static int cgroup_apply_control(struct cgroup *cgrp);
217static void cgroup_finalize_control(struct cgroup *cgrp, int ret); 217static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
218static void css_task_iter_advance(struct css_task_iter *it); 218static void css_task_iter_skip(struct css_task_iter *it,
219 struct task_struct *task);
219static int cgroup_destroy_locked(struct cgroup *cgrp); 220static int cgroup_destroy_locked(struct cgroup *cgrp);
220static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, 221static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
221 struct cgroup_subsys *ss); 222 struct cgroup_subsys *ss);
@@ -738,6 +739,7 @@ struct css_set init_css_set = {
738 .dom_cset = &init_css_set, 739 .dom_cset = &init_css_set,
739 .tasks = LIST_HEAD_INIT(init_css_set.tasks), 740 .tasks = LIST_HEAD_INIT(init_css_set.tasks),
740 .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), 741 .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
742 .dying_tasks = LIST_HEAD_INIT(init_css_set.dying_tasks),
741 .task_iters = LIST_HEAD_INIT(init_css_set.task_iters), 743 .task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
742 .threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets), 744 .threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets),
743 .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), 745 .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
@@ -843,6 +845,21 @@ static void css_set_update_populated(struct css_set *cset, bool populated)
843 cgroup_update_populated(link->cgrp, populated); 845 cgroup_update_populated(link->cgrp, populated);
844} 846}
845 847
848/*
849 * @task is leaving, advance task iterators which are pointing to it so
850 * that they can resume at the next position. Advancing an iterator might
851 * remove it from the list, use safe walk. See css_task_iter_skip() for
852 * details.
853 */
854static void css_set_skip_task_iters(struct css_set *cset,
855 struct task_struct *task)
856{
857 struct css_task_iter *it, *pos;
858
859 list_for_each_entry_safe(it, pos, &cset->task_iters, iters_node)
860 css_task_iter_skip(it, task);
861}
862
846/** 863/**
847 * css_set_move_task - move a task from one css_set to another 864 * css_set_move_task - move a task from one css_set to another
848 * @task: task being moved 865 * @task: task being moved
@@ -868,22 +885,9 @@ static void css_set_move_task(struct task_struct *task,
868 css_set_update_populated(to_cset, true); 885 css_set_update_populated(to_cset, true);
869 886
870 if (from_cset) { 887 if (from_cset) {
871 struct css_task_iter *it, *pos;
872
873 WARN_ON_ONCE(list_empty(&task->cg_list)); 888 WARN_ON_ONCE(list_empty(&task->cg_list));
874 889
875 /* 890 css_set_skip_task_iters(from_cset, task);
876 * @task is leaving, advance task iterators which are
877 * pointing to it so that they can resume at the next
878 * position. Advancing an iterator might remove it from
879 * the list, use safe walk. See css_task_iter_advance*()
880 * for details.
881 */
882 list_for_each_entry_safe(it, pos, &from_cset->task_iters,
883 iters_node)
884 if (it->task_pos == &task->cg_list)
885 css_task_iter_advance(it);
886
887 list_del_init(&task->cg_list); 891 list_del_init(&task->cg_list);
888 if (!css_set_populated(from_cset)) 892 if (!css_set_populated(from_cset))
889 css_set_update_populated(from_cset, false); 893 css_set_update_populated(from_cset, false);
@@ -1210,6 +1214,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
1210 cset->dom_cset = cset; 1214 cset->dom_cset = cset;
1211 INIT_LIST_HEAD(&cset->tasks); 1215 INIT_LIST_HEAD(&cset->tasks);
1212 INIT_LIST_HEAD(&cset->mg_tasks); 1216 INIT_LIST_HEAD(&cset->mg_tasks);
1217 INIT_LIST_HEAD(&cset->dying_tasks);
1213 INIT_LIST_HEAD(&cset->task_iters); 1218 INIT_LIST_HEAD(&cset->task_iters);
1214 INIT_LIST_HEAD(&cset->threaded_csets); 1219 INIT_LIST_HEAD(&cset->threaded_csets);
1215 INIT_HLIST_NODE(&cset->hlist); 1220 INIT_HLIST_NODE(&cset->hlist);
@@ -1810,11 +1815,13 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
1810 1815
1811enum cgroup2_param { 1816enum cgroup2_param {
1812 Opt_nsdelegate, 1817 Opt_nsdelegate,
1818 Opt_memory_localevents,
1813 nr__cgroup2_params 1819 nr__cgroup2_params
1814}; 1820};
1815 1821
1816static const struct fs_parameter_spec cgroup2_param_specs[] = { 1822static const struct fs_parameter_spec cgroup2_param_specs[] = {
1817 fsparam_flag ("nsdelegate", Opt_nsdelegate), 1823 fsparam_flag("nsdelegate", Opt_nsdelegate),
1824 fsparam_flag("memory_localevents", Opt_memory_localevents),
1818 {} 1825 {}
1819}; 1826};
1820 1827
@@ -1837,6 +1844,9 @@ static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param
1837 case Opt_nsdelegate: 1844 case Opt_nsdelegate:
1838 ctx->flags |= CGRP_ROOT_NS_DELEGATE; 1845 ctx->flags |= CGRP_ROOT_NS_DELEGATE;
1839 return 0; 1846 return 0;
1847 case Opt_memory_localevents:
1848 ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
1849 return 0;
1840 } 1850 }
1841 return -EINVAL; 1851 return -EINVAL;
1842} 1852}
@@ -1848,6 +1858,11 @@ static void apply_cgroup_root_flags(unsigned int root_flags)
1848 cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE; 1858 cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE;
1849 else 1859 else
1850 cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE; 1860 cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE;
1861
1862 if (root_flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
1863 cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
1864 else
1865 cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_LOCAL_EVENTS;
1851 } 1866 }
1852} 1867}
1853 1868
@@ -1855,6 +1870,8 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root
1855{ 1870{
1856 if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) 1871 if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
1857 seq_puts(seq, ",nsdelegate"); 1872 seq_puts(seq, ",nsdelegate");
1873 if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
1874 seq_puts(seq, ",memory_localevents");
1858 return 0; 1875 return 0;
1859} 1876}
1860 1877
@@ -3540,17 +3557,84 @@ static int cpu_stat_show(struct seq_file *seq, void *v)
3540#ifdef CONFIG_PSI 3557#ifdef CONFIG_PSI
3541static int cgroup_io_pressure_show(struct seq_file *seq, void *v) 3558static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
3542{ 3559{
3543 return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_IO); 3560 struct cgroup *cgroup = seq_css(seq)->cgroup;
3561 struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi;
3562
3563 return psi_show(seq, psi, PSI_IO);
3544} 3564}
3545static int cgroup_memory_pressure_show(struct seq_file *seq, void *v) 3565static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
3546{ 3566{
3547 return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_MEM); 3567 struct cgroup *cgroup = seq_css(seq)->cgroup;
3568 struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi;
3569
3570 return psi_show(seq, psi, PSI_MEM);
3548} 3571}
3549static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) 3572static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
3550{ 3573{
3551 return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_CPU); 3574 struct cgroup *cgroup = seq_css(seq)->cgroup;
3575 struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi;
3576
3577 return psi_show(seq, psi, PSI_CPU);
3578}
3579
3580static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
3581 size_t nbytes, enum psi_res res)
3582{
3583 struct psi_trigger *new;
3584 struct cgroup *cgrp;
3585
3586 cgrp = cgroup_kn_lock_live(of->kn, false);
3587 if (!cgrp)
3588 return -ENODEV;
3589
3590 cgroup_get(cgrp);
3591 cgroup_kn_unlock(of->kn);
3592
3593 new = psi_trigger_create(&cgrp->psi, buf, nbytes, res);
3594 if (IS_ERR(new)) {
3595 cgroup_put(cgrp);
3596 return PTR_ERR(new);
3597 }
3598
3599 psi_trigger_replace(&of->priv, new);
3600
3601 cgroup_put(cgrp);
3602
3603 return nbytes;
3604}
3605
3606static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of,
3607 char *buf, size_t nbytes,
3608 loff_t off)
3609{
3610 return cgroup_pressure_write(of, buf, nbytes, PSI_IO);
3552} 3611}
3553#endif 3612
3613static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of,
3614 char *buf, size_t nbytes,
3615 loff_t off)
3616{
3617 return cgroup_pressure_write(of, buf, nbytes, PSI_MEM);
3618}
3619
3620static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
3621 char *buf, size_t nbytes,
3622 loff_t off)
3623{
3624 return cgroup_pressure_write(of, buf, nbytes, PSI_CPU);
3625}
3626
3627static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
3628 poll_table *pt)
3629{
3630 return psi_trigger_poll(&of->priv, of->file, pt);
3631}
3632
3633static void cgroup_pressure_release(struct kernfs_open_file *of)
3634{
3635 psi_trigger_replace(&of->priv, NULL);
3636}
3637#endif /* CONFIG_PSI */
3554 3638
3555static int cgroup_freeze_show(struct seq_file *seq, void *v) 3639static int cgroup_freeze_show(struct seq_file *seq, void *v)
3556{ 3640{
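
With .write, .poll and .release wired up above, the cgroup2 pressure files double as PSI trigger endpoints. A minimal userspace sketch of arming and waiting on such a trigger (the "some <stall-us> <window-us>" trigger string and the mount path follow the PSI documentation and are assumptions here, not part of this diff):

#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* assumed path; depends on where cgroup2 is mounted */
	int fd = open("/sys/fs/cgroup/mygroup/memory.pressure",
		      O_RDWR | O_NONBLOCK);
	struct pollfd pfd = { .fd = fd, .events = POLLPRI };
	/* trigger: 150ms of "some" memory stall within a 1s window */
	const char trig[] = "some 150000 1000000";

	if (fd < 0 || write(fd, trig, strlen(trig) + 1) < 0) {
		perror("psi trigger");
		return 1;
	}
	while (poll(&pfd, 1, -1) > 0) {
		if (pfd.revents & POLLERR)
			break;			/* trigger file went away */
		if (pfd.revents & POLLPRI)
			puts("memory pressure event");
	}
	close(fd);
	return 0;
}
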
@@ -4142,6 +4226,7 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos,
4142 4226
4143 return NULL; 4227 return NULL;
4144} 4228}
4229EXPORT_SYMBOL_GPL(css_next_descendant_pre);
4145 4230
4146/** 4231/**
4147 * css_rightmost_descendant - return the rightmost descendant of a css 4232 * css_rightmost_descendant - return the rightmost descendant of a css
@@ -4329,15 +4414,18 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it)
4329 it->task_pos = NULL; 4414 it->task_pos = NULL;
4330 return; 4415 return;
4331 } 4416 }
4332 } while (!css_set_populated(cset)); 4417 } while (!css_set_populated(cset) && list_empty(&cset->dying_tasks));
4333 4418
4334 if (!list_empty(&cset->tasks)) 4419 if (!list_empty(&cset->tasks))
4335 it->task_pos = cset->tasks.next; 4420 it->task_pos = cset->tasks.next;
4336 else 4421 else if (!list_empty(&cset->mg_tasks))
4337 it->task_pos = cset->mg_tasks.next; 4422 it->task_pos = cset->mg_tasks.next;
4423 else
4424 it->task_pos = cset->dying_tasks.next;
4338 4425
4339 it->tasks_head = &cset->tasks; 4426 it->tasks_head = &cset->tasks;
4340 it->mg_tasks_head = &cset->mg_tasks; 4427 it->mg_tasks_head = &cset->mg_tasks;
4428 it->dying_tasks_head = &cset->dying_tasks;
4341 4429
4342 /* 4430 /*
4343 * We don't keep css_sets locked across iteration steps and thus 4431 * We don't keep css_sets locked across iteration steps and thus
@@ -4363,9 +4451,20 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it)
4363 list_add(&it->iters_node, &cset->task_iters); 4451 list_add(&it->iters_node, &cset->task_iters);
4364} 4452}
4365 4453
4454static void css_task_iter_skip(struct css_task_iter *it,
4455 struct task_struct *task)
4456{
4457 lockdep_assert_held(&css_set_lock);
4458
4459 if (it->task_pos == &task->cg_list) {
4460 it->task_pos = it->task_pos->next;
4461 it->flags |= CSS_TASK_ITER_SKIPPED;
4462 }
4463}
4464
4366static void css_task_iter_advance(struct css_task_iter *it) 4465static void css_task_iter_advance(struct css_task_iter *it)
4367{ 4466{
4368 struct list_head *next; 4467 struct task_struct *task;
4369 4468
4370 lockdep_assert_held(&css_set_lock); 4469 lockdep_assert_held(&css_set_lock);
4371repeat: 4470repeat:
@@ -4375,25 +4474,40 @@ repeat:
4375 * consumed first and then ->mg_tasks. After ->mg_tasks, 4474 * consumed first and then ->mg_tasks. After ->mg_tasks,
4376 * we move onto the next cset. 4475 * we move onto the next cset.
4377 */ 4476 */
4378 next = it->task_pos->next; 4477 if (it->flags & CSS_TASK_ITER_SKIPPED)
4379 4478 it->flags &= ~CSS_TASK_ITER_SKIPPED;
4380 if (next == it->tasks_head) 4479 else
4381 next = it->mg_tasks_head->next; 4480 it->task_pos = it->task_pos->next;
4382 4481
4383 if (next == it->mg_tasks_head) 4482 if (it->task_pos == it->tasks_head)
4483 it->task_pos = it->mg_tasks_head->next;
4484 if (it->task_pos == it->mg_tasks_head)
4485 it->task_pos = it->dying_tasks_head->next;
4486 if (it->task_pos == it->dying_tasks_head)
4384 css_task_iter_advance_css_set(it); 4487 css_task_iter_advance_css_set(it);
4385 else
4386 it->task_pos = next;
4387 } else { 4488 } else {
4388 /* called from start, proceed to the first cset */ 4489 /* called from start, proceed to the first cset */
4389 css_task_iter_advance_css_set(it); 4490 css_task_iter_advance_css_set(it);
4390 } 4491 }
4391 4492
4392 /* if PROCS, skip over tasks which aren't group leaders */ 4493 if (!it->task_pos)
4393 if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos && 4494 return;
4394 !thread_group_leader(list_entry(it->task_pos, struct task_struct, 4495
4395 cg_list))) 4496 task = list_entry(it->task_pos, struct task_struct, cg_list);
4396 goto repeat; 4497
4498 if (it->flags & CSS_TASK_ITER_PROCS) {
4499 /* if PROCS, skip over tasks which aren't group leaders */
4500 if (!thread_group_leader(task))
4501 goto repeat;
4502
4503 /* and dying leaders w/o live member threads */
4504 if (!atomic_read(&task->signal->live))
4505 goto repeat;
4506 } else {
4507 /* skip all dying ones */
4508 if (task->flags & PF_EXITING)
4509 goto repeat;
4510 }
4397} 4511}
4398 4512
4399/** 4513/**
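
The CSS_TASK_ITER_SKIPPED handling above is easy to misread: a task removal may half-advance an iterator, and the next advance must consume the flag instead of stepping again, otherwise an entry is either visited twice or skipped. A self-contained sketch of the same pattern with illustrative names (not kernel API):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct node { const char *name; struct node *next; };

struct iter {
	struct node *pos;	/* node the next call will return */
	bool skipped;		/* ->pos was already moved by a removal */
};

/* @n is being removed: move any iterator parked on it and flag the move */
static void iter_skip(struct iter *it, struct node *n)
{
	if (it->pos == n) {
		it->pos = n->next;
		it->skipped = true;
	}
}

static void iter_advance(struct iter *it)
{
	if (it->skipped)
		it->skipped = false;	/* removal already moved ->pos */
	else if (it->pos)
		it->pos = it->pos->next;
	/* the kernel version also filters dying tasks / non-leaders here */
}

static struct node *iter_next(struct iter *it)
{
	struct node *n;

	if (it->skipped)	/* half-advanced by a skip: finish it first */
		iter_advance(it);

	n = it->pos;
	if (n)
		iter_advance(it);
	return n;
}

int main(void)
{
	struct node c = { "c", NULL }, b = { "b", &c }, a = { "a", &b };
	struct iter it = { .pos = &a, .skipped = false };
	struct node *n;

	n = iter_next(&it);		/* "a"; iterator now parked on "b" */
	printf("%s\n", n->name);

	iter_skip(&it, &b);		/* "b" goes away mid-iteration */
	a.next = &c;

	while ((n = iter_next(&it)))	/* prints "c" exactly once */
		printf("%s\n", n->name);
	return 0;
}
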
@@ -4449,6 +4563,10 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
4449 4563
4450 spin_lock_irq(&css_set_lock); 4564 spin_lock_irq(&css_set_lock);
4451 4565
4566 /* @it may be half-advanced by skips, finish advancing */
4567 if (it->flags & CSS_TASK_ITER_SKIPPED)
4568 css_task_iter_advance(it);
4569
4452 if (it->task_pos) { 4570 if (it->task_pos) {
4453 it->cur_task = list_entry(it->task_pos, struct task_struct, 4571 it->cur_task = list_entry(it->task_pos, struct task_struct,
4454 cg_list); 4572 cg_list);
@@ -4743,20 +4861,26 @@ static struct cftype cgroup_base_files[] = {
4743#ifdef CONFIG_PSI 4861#ifdef CONFIG_PSI
4744 { 4862 {
4745 .name = "io.pressure", 4863 .name = "io.pressure",
4746 .flags = CFTYPE_NOT_ON_ROOT,
4747 .seq_show = cgroup_io_pressure_show, 4864 .seq_show = cgroup_io_pressure_show,
4865 .write = cgroup_io_pressure_write,
4866 .poll = cgroup_pressure_poll,
4867 .release = cgroup_pressure_release,
4748 }, 4868 },
4749 { 4869 {
4750 .name = "memory.pressure", 4870 .name = "memory.pressure",
4751 .flags = CFTYPE_NOT_ON_ROOT,
4752 .seq_show = cgroup_memory_pressure_show, 4871 .seq_show = cgroup_memory_pressure_show,
4872 .write = cgroup_memory_pressure_write,
4873 .poll = cgroup_pressure_poll,
4874 .release = cgroup_pressure_release,
4753 }, 4875 },
4754 { 4876 {
4755 .name = "cpu.pressure", 4877 .name = "cpu.pressure",
4756 .flags = CFTYPE_NOT_ON_ROOT,
4757 .seq_show = cgroup_cpu_pressure_show, 4878 .seq_show = cgroup_cpu_pressure_show,
4879 .write = cgroup_cpu_pressure_write,
4880 .poll = cgroup_pressure_poll,
4881 .release = cgroup_pressure_release,
4758 }, 4882 },
4759#endif 4883#endif /* CONFIG_PSI */
4760 { } /* terminate */ 4884 { } /* terminate */
4761}; 4885};
4762 4886
@@ -4882,8 +5006,6 @@ static void css_release_work_fn(struct work_struct *work)
4882 if (cgrp->kn) 5006 if (cgrp->kn)
4883 RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, 5007 RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv,
4884 NULL); 5008 NULL);
4885
4886 cgroup_bpf_put(cgrp);
4887 } 5009 }
4888 5010
4889 mutex_unlock(&cgroup_mutex); 5011 mutex_unlock(&cgroup_mutex);
@@ -5409,6 +5531,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
5409 5531
5410 cgroup1_check_for_release(parent); 5532 cgroup1_check_for_release(parent);
5411 5533
5534 cgroup_bpf_offline(cgrp);
5535
5412 /* put the base reference */ 5536 /* put the base reference */
5413 percpu_ref_kill(&cgrp->self.refcnt); 5537 percpu_ref_kill(&cgrp->self.refcnt);
5414 5538
@@ -5543,7 +5667,6 @@ int __init cgroup_init(void)
5543 int ssid; 5667 int ssid;
5544 5668
5545 BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16); 5669 BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
5546 BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
5547 BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); 5670 BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
5548 BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files)); 5671 BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
5549 5672
@@ -5924,6 +6047,7 @@ void cgroup_exit(struct task_struct *tsk)
5924 if (!list_empty(&tsk->cg_list)) { 6047 if (!list_empty(&tsk->cg_list)) {
5925 spin_lock_irq(&css_set_lock); 6048 spin_lock_irq(&css_set_lock);
5926 css_set_move_task(tsk, cset, NULL, false); 6049 css_set_move_task(tsk, cset, NULL, false);
6050 list_add_tail(&tsk->cg_list, &cset->dying_tasks);
5927 cset->nr_tasks--; 6051 cset->nr_tasks--;
5928 6052
5929 WARN_ON_ONCE(cgroup_task_frozen(tsk)); 6053 WARN_ON_ONCE(cgroup_task_frozen(tsk));
@@ -5949,6 +6073,13 @@ void cgroup_release(struct task_struct *task)
5949 do_each_subsys_mask(ss, ssid, have_release_callback) { 6073 do_each_subsys_mask(ss, ssid, have_release_callback) {
5950 ss->release(task); 6074 ss->release(task);
5951 } while_each_subsys_mask(); 6075 } while_each_subsys_mask();
6076
6077 if (use_task_css_set_links) {
6078 spin_lock_irq(&css_set_lock);
6079 css_set_skip_task_iters(task_css_set(task), task);
6080 list_del_init(&task->cg_list);
6081 spin_unlock_irq(&css_set_lock);
6082 }
5952} 6083}
5953 6084
5954void cgroup_free(struct task_struct *task) 6085void cgroup_free(struct task_struct *task)
@@ -6110,6 +6241,48 @@ struct cgroup *cgroup_get_from_fd(int fd)
6110} 6241}
6111EXPORT_SYMBOL_GPL(cgroup_get_from_fd); 6242EXPORT_SYMBOL_GPL(cgroup_get_from_fd);
6112 6243
6244static u64 power_of_ten(int power)
6245{
6246 u64 v = 1;
6247 while (power--)
6248 v *= 10;
6249 return v;
6250}
6251
6252/**
6253 * cgroup_parse_float - parse a floating number
6254 * @input: input string
6255 * @dec_shift: number of decimal digits to shift
6256 * @v: output
6257 *
6258 * Parse a decimal floating point number in @input and store the result in
6259 * @v with decimal point right shifted @dec_shift times. For example, if
6260 * @input is "12.3456" and @dec_shift is 3, *@v will be set to 12345.
6261 * Returns 0 on success, -errno otherwise.
6262 *
6263 * There's nothing cgroup specific about this function except that it's
6264 * currently the only user.
6265 */
6266int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v)
6267{
6268 s64 whole, frac = 0;
6269 int fstart = 0, fend = 0, flen;
6270
6271 if (!sscanf(input, "%lld.%n%lld%n", &whole, &fstart, &frac, &fend))
6272 return -EINVAL;
6273 if (frac < 0)
6274 return -EINVAL;
6275
6276 flen = fend > fstart ? fend - fstart : 0;
6277 if (flen < dec_shift)
6278 frac *= power_of_ten(dec_shift - flen);
6279 else
6280 frac = DIV_ROUND_CLOSEST_ULL(frac, power_of_ten(flen - dec_shift));
6281
6282 *v = whole * power_of_ten(dec_shift) + frac;
6283 return 0;
6284}
6285
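
The arithmetic in cgroup_parse_float() is easier to see with a concrete value; the userspace replica below (helper names are local to the example, not kernel symbols) parses "37.5" with a two-digit shift into 3750.

#include <stdint.h>
#include <stdio.h>

static uint64_t pow10u(int p) { uint64_t v = 1; while (p--) v *= 10; return v; }

static int parse_float(const char *input, unsigned dec_shift, long long *v)
{
	long long whole, frac = 0;
	int fstart = 0, fend = 0, flen;

	if (!sscanf(input, "%lld.%n%lld%n", &whole, &fstart, &frac, &fend))
		return -1;
	if (frac < 0)
		return -1;

	flen = fend > fstart ? fend - fstart : 0;
	if (flen < (int)dec_shift)
		frac *= pow10u(dec_shift - flen);		/* pad short fractions */
	else
		frac = (frac + pow10u(flen - dec_shift) / 2) /
		       pow10u(flen - dec_shift);		/* round long ones */

	*v = whole * pow10u(dec_shift) + frac;
	return 0;
}

int main(void)
{
	long long v;

	parse_float("37.5", 2, &v);	/* 37.5 shifted by two digits -> 3750 */
	printf("%lld\n", v);
	return 0;
}
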
6113/* 6286/*
6114 * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data 6287 * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data
6115 * definition in cgroup-defs.h. 6288 * definition in cgroup-defs.h.
@@ -6148,6 +6321,7 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
6148 * Don't use cgroup_get_live(). 6321 * Don't use cgroup_get_live().
6149 */ 6322 */
6150 cgroup_get(sock_cgroup_ptr(skcd)); 6323 cgroup_get(sock_cgroup_ptr(skcd));
6324 cgroup_bpf_get(sock_cgroup_ptr(skcd));
6151 return; 6325 return;
6152 } 6326 }
6153 6327
@@ -6159,6 +6333,7 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
6159 cset = task_css_set(current); 6333 cset = task_css_set(current);
6160 if (likely(cgroup_tryget(cset->dfl_cgrp))) { 6334 if (likely(cgroup_tryget(cset->dfl_cgrp))) {
6161 skcd->val = (unsigned long)cset->dfl_cgrp; 6335 skcd->val = (unsigned long)cset->dfl_cgrp;
6336 cgroup_bpf_get(cset->dfl_cgrp);
6162 break; 6337 break;
6163 } 6338 }
6164 cpu_relax(); 6339 cpu_relax();
@@ -6169,7 +6344,10 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
6169 6344
6170void cgroup_sk_free(struct sock_cgroup_data *skcd) 6345void cgroup_sk_free(struct sock_cgroup_data *skcd)
6171{ 6346{
6172 cgroup_put(sock_cgroup_ptr(skcd)); 6347 struct cgroup *cgrp = sock_cgroup_ptr(skcd);
6348
6349 cgroup_bpf_put(cgrp);
6350 cgroup_put(cgrp);
6173} 6351}
6174 6352
6175#endif /* CONFIG_SOCK_CGROUP_DATA */ 6353#endif /* CONFIG_SOCK_CGROUP_DATA */
@@ -6252,7 +6430,7 @@ static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate);
6252static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr, 6430static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
6253 char *buf) 6431 char *buf)
6254{ 6432{
6255 return snprintf(buf, PAGE_SIZE, "nsdelegate\n"); 6433 return snprintf(buf, PAGE_SIZE, "nsdelegate\nmemory_localevents\n");
6256} 6434}
6257static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features); 6435static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);
6258 6436
@@ -6272,4 +6450,5 @@ static int __init cgroup_sysfs_init(void)
6272 return sysfs_create_group(kernel_kobj, &cgroup_sysfs_attr_group); 6450 return sysfs_create_group(kernel_kobj, &cgroup_sysfs_attr_group);
6273} 6451}
6274subsys_initcall(cgroup_sysfs_init); 6452subsys_initcall(cgroup_sysfs_init);
6453
6275#endif /* CONFIG_SYSFS */ 6454#endif /* CONFIG_SYSFS */
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 6a1942ed781c..b3b02b9c4405 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -729,7 +729,7 @@ static inline int nr_cpusets(void)
729 * load balancing domains (sched domains) as specified by that partial 729 * load balancing domains (sched domains) as specified by that partial
730 * partition. 730 * partition.
731 * 731 *
732 * See "What is sched_load_balance" in Documentation/cgroup-v1/cpusets.txt 732 * See "What is sched_load_balance" in Documentation/cgroup-v1/cpusets.rst
733 * for a background explanation of this. 733 * for a background explanation of this.
734 * 734 *
735 * Does not return errors, on the theory that the callers of this 735 * Does not return errors, on the theory that the callers of this
@@ -2829,7 +2829,7 @@ static void cpuset_fork(struct task_struct *task)
2829 if (task_css_is_root(task, cpuset_cgrp_id)) 2829 if (task_css_is_root(task, cpuset_cgrp_id))
2830 return; 2830 return;
2831 2831
2832 set_cpus_allowed_ptr(task, &current->cpus_allowed); 2832 set_cpus_allowed_ptr(task, current->cpus_ptr);
2833 task->mems_allowed = current->mems_allowed; 2833 task->mems_allowed = current->mems_allowed;
2834} 2834}
2835 2835
@@ -3254,10 +3254,23 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
3254 spin_unlock_irqrestore(&callback_lock, flags); 3254 spin_unlock_irqrestore(&callback_lock, flags);
3255} 3255}
3256 3256
3257/**
3258 * cpuset_cpus_allowed_fallback - final fallback before complete catastrophe.
3259 * @tsk: pointer to task_struct with which the scheduler is struggling
3260 *
3261 * Description: In the case that the scheduler cannot find an allowed cpu in
3262 * tsk->cpus_allowed, we fall back to task_cs(tsk)->cpus_allowed. In legacy
3263 * mode however, this value is the same as task_cs(tsk)->effective_cpus,
3264 * which will not contain a sane cpumask during cases such as cpu hotplugging.
3265 * This is the absolute last resort for the scheduler and it is only used if
3266 * _every_ other avenue has been traveled.
3267 **/
3268
3257void cpuset_cpus_allowed_fallback(struct task_struct *tsk) 3269void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
3258{ 3270{
3259 rcu_read_lock(); 3271 rcu_read_lock();
3260 do_set_cpus_allowed(tsk, task_cs(tsk)->effective_cpus); 3272 do_set_cpus_allowed(tsk, is_in_v2_mode() ?
3273 task_cs(tsk)->cpus_allowed : cpu_possible_mask);
3261 rcu_read_unlock(); 3274 rcu_read_unlock();
3262 3275
3263 /* 3276 /*
diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c
index c9960baaa14f..8e513a573fe9 100644
--- a/kernel/cgroup/pids.c
+++ b/kernel/cgroup/pids.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * Process number limiting controller for cgroups. 3 * Process number limiting controller for cgroups.
3 * 4 *
@@ -25,10 +26,6 @@
25 * a superset of parent/child/pids.current. 26 * a superset of parent/child/pids.current.
26 * 27 *
27 * Copyright (C) 2015 Aleksa Sarai <cyphar@cyphar.com> 28 * Copyright (C) 2015 Aleksa Sarai <cyphar@cyphar.com>
28 *
29 * This file is subject to the terms and conditions of version 2 of the GNU
30 * General Public License. See the file COPYING in the main directory of the
31 * Linux distribution for more details.
32 */ 29 */
33 30
34#include <linux/kernel.h> 31#include <linux/kernel.h>
diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c
index 1d75ae7f1cb7..ae042c347c64 100644
--- a/kernel/cgroup/rdma.c
+++ b/kernel/cgroup/rdma.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * RDMA resource limiting controller for cgroups. 3 * RDMA resource limiting controller for cgroups.
3 * 4 *
@@ -5,10 +6,6 @@
5 * additional RDMA resources after a certain limit is reached. 6 * additional RDMA resources after a certain limit is reached.
6 * 7 *
7 * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com> 8 * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
8 *
9 * This file is subject to the terms and conditions of version 2 of the GNU
10 * General Public License. See the file COPYING in the main directory of the
11 * Linux distribution for more details.
12 */ 9 */
13 10
14#include <linux/bitops.h> 11#include <linux/bitops.h>
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index bb95a35e8c2d..ca19b4c8acf5 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1#include "cgroup-internal.h" 2#include "cgroup-internal.h"
2 3
3#include <linux/sched/cputime.h> 4#include <linux/sched/cputime.h>
diff --git a/kernel/compat.c b/kernel/compat.c
index d8a36c6ad7c9..a2bc1d6ceb57 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * linux/kernel/compat.c 3 * linux/kernel/compat.c
3 * 4 *
@@ -5,10 +6,6 @@
5 * on 64 bit kernels. 6 * on 64 bit kernels.
6 * 7 *
7 * Copyright (C) 2002-2003 Stephen Rothwell, IBM Corporation 8 * Copyright (C) 2002-2003 Stephen Rothwell, IBM Corporation
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2 as
11 * published by the Free Software Foundation.
12 */ 9 */
13 10
14#include <linux/linkage.h> 11#include <linux/linkage.h>
@@ -346,8 +343,11 @@ get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat)
346 return -EFAULT; 343 return -EFAULT;
347 switch (_NSIG_WORDS) { 344 switch (_NSIG_WORDS) {
348 case 4: set->sig[3] = v.sig[6] | (((long)v.sig[7]) << 32 ); 345 case 4: set->sig[3] = v.sig[6] | (((long)v.sig[7]) << 32 );
346 /* fall through */
349 case 3: set->sig[2] = v.sig[4] | (((long)v.sig[5]) << 32 ); 347 case 3: set->sig[2] = v.sig[4] | (((long)v.sig[5]) << 32 );
348 /* fall through */
350 case 2: set->sig[1] = v.sig[2] | (((long)v.sig[3]) << 32 ); 349 case 2: set->sig[1] = v.sig[2] | (((long)v.sig[3]) << 32 );
350 /* fall through */
351 case 1: set->sig[0] = v.sig[0] | (((long)v.sig[1]) << 32 ); 351 case 1: set->sig[0] = v.sig[0] | (((long)v.sig[1]) << 32 );
352 } 352 }
353#else 353#else
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 9ad37b9e44a7..be01a4d627c9 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * Context tracking: Probe on high level context boundaries such as kernel 3 * Context tracking: Probe on high level context boundaries such as kernel
3 * and userspace. This includes syscalls and exceptions entry/exit. 4 * and userspace. This includes syscalls and exceptions entry/exit.
diff --git a/kernel/cpu.c b/kernel/cpu.c
index f2ef10460698..e84c0873559e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -522,7 +522,7 @@ static int bringup_wait_for_ap(unsigned int cpu)
522 /* 522 /*
523 * SMT soft disabling on X86 requires to bring the CPU out of the 523 * SMT soft disabling on X86 requires to bring the CPU out of the
524 * BIOS 'wait for SIPI' state in order to set the CR4.MCE bit. The 524 * BIOS 'wait for SIPI' state in order to set the CR4.MCE bit. The
525 * CPU marked itself as booted_once in cpu_notify_starting() so the 525 * CPU marked itself as booted_once in notify_cpu_starting() so the
526 * cpu_smt_allowed() check will now return false if this is not the 526 * cpu_smt_allowed() check will now return false if this is not the
527 * primary sibling. 527 * primary sibling.
528 */ 528 */
@@ -1221,6 +1221,13 @@ int freeze_secondary_cpus(int primary)
1221 for_each_online_cpu(cpu) { 1221 for_each_online_cpu(cpu) {
1222 if (cpu == primary) 1222 if (cpu == primary)
1223 continue; 1223 continue;
1224
1225 if (pm_wakeup_pending()) {
1226 pr_info("Wakeup pending. Abort CPU freeze\n");
1227 error = -EBUSY;
1228 break;
1229 }
1230
1224 trace_suspend_resume(TPS("CPU_OFF"), cpu, true); 1231 trace_suspend_resume(TPS("CPU_OFF"), cpu, true);
1225 error = _cpu_down(cpu, 1, CPUHP_OFFLINE); 1232 error = _cpu_down(cpu, 1, CPUHP_OFFLINE);
1226 trace_suspend_resume(TPS("CPU_OFF"), cpu, false); 1233 trace_suspend_resume(TPS("CPU_OFF"), cpu, false);
@@ -1964,6 +1971,9 @@ static ssize_t write_cpuhp_fail(struct device *dev,
1964 if (ret) 1971 if (ret)
1965 return ret; 1972 return ret;
1966 1973
1974 if (fail < CPUHP_OFFLINE || fail > CPUHP_ONLINE)
1975 return -EINVAL;
1976
1967 /* 1977 /*
1968 * Cannot fail STARTING/DYING callbacks. 1978 * Cannot fail STARTING/DYING callbacks.
1969 */ 1979 */
@@ -2061,7 +2071,7 @@ static void cpuhp_online_cpu_device(unsigned int cpu)
2061 kobject_uevent(&dev->kobj, KOBJ_ONLINE); 2071 kobject_uevent(&dev->kobj, KOBJ_ONLINE);
2062} 2072}
2063 2073
2064static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) 2074int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)
2065{ 2075{
2066 int cpu, ret = 0; 2076 int cpu, ret = 0;
2067 2077
@@ -2093,7 +2103,7 @@ static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)
2093 return ret; 2103 return ret;
2094} 2104}
2095 2105
2096static int cpuhp_smt_enable(void) 2106int cpuhp_smt_enable(void)
2097{ 2107{
2098 int cpu, ret = 0; 2108 int cpu, ret = 0;
2099 2109
@@ -2339,6 +2349,9 @@ static int __init mitigations_parse_cmdline(char *arg)
2339 cpu_mitigations = CPU_MITIGATIONS_AUTO; 2349 cpu_mitigations = CPU_MITIGATIONS_AUTO;
2340 else if (!strcmp(arg, "auto,nosmt")) 2350 else if (!strcmp(arg, "auto,nosmt"))
2341 cpu_mitigations = CPU_MITIGATIONS_AUTO_NOSMT; 2351 cpu_mitigations = CPU_MITIGATIONS_AUTO_NOSMT;
2352 else
2353 pr_crit("Unsupported mitigations=%s, system may still be vulnerable\n",
2354 arg);
2342 2355
2343 return 0; 2356 return 0;
2344} 2357}
diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c
index 67b02e138a47..cbca6879ab7d 100644
--- a/kernel/cpu_pm.c
+++ b/kernel/cpu_pm.c
@@ -1,18 +1,9 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * Copyright (C) 2011 Google, Inc. 3 * Copyright (C) 2011 Google, Inc.
3 * 4 *
4 * Author: 5 * Author:
5 * Colin Cross <ccross@android.com> 6 * Colin Cross <ccross@android.com>
6 *
7 * This software is licensed under the terms of the GNU General Public
8 * License version 2, as published by the Free Software Foundation, and
9 * may be copied, distributed, and modified under those terms.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 */ 7 */
17 8
18#include <linux/kernel.h> 9#include <linux/kernel.h>
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 093c9f917ed0..9f1557b98468 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -1,9 +1,7 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * crash.c - kernel crash support code. 3 * crash.c - kernel crash support code.
3 * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> 4 * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
4 *
5 * This source code is licensed under the GNU General Public License,
6 * Version 2. See the file COPYING for more details.
7 */ 5 */
8 6
9#include <linux/crash_core.h> 7#include <linux/crash_core.h>
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c
index b64e238b553b..9c23ae074b40 100644
--- a/kernel/crash_dump.c
+++ b/kernel/crash_dump.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1#include <linux/kernel.h> 2#include <linux/kernel.h>
2#include <linux/crash_dump.h> 3#include <linux/crash_dump.h>
3#include <linux/init.h> 4#include <linux/init.h>
diff --git a/kernel/cred.c b/kernel/cred.c
index 45d77284aed0..f9a0ce66c9c3 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -1,12 +1,8 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
1/* Task credentials management - see Documentation/security/credentials.rst 2/* Task credentials management - see Documentation/security/credentials.rst
2 * 3 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. 4 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com) 5 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */ 6 */
11#include <linux/export.h> 7#include <linux/export.h>
12#include <linux/cred.h> 8#include <linux/cred.h>
@@ -174,6 +170,11 @@ void exit_creds(struct task_struct *tsk)
174 validate_creds(cred); 170 validate_creds(cred);
175 alter_cred_subscribers(cred, -1); 171 alter_cred_subscribers(cred, -1);
176 put_cred(cred); 172 put_cred(cred);
173
174#ifdef CONFIG_KEYS_REQUEST_CACHE
175 key_put(current->cached_requested_key);
176 current->cached_requested_key = NULL;
177#endif
177} 178}
178 179
179/** 180/**
@@ -327,6 +328,10 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
327 struct cred *new; 328 struct cred *new;
328 int ret; 329 int ret;
329 330
331#ifdef CONFIG_KEYS_REQUEST_CACHE
332 p->cached_requested_key = NULL;
333#endif
334
330 if ( 335 if (
331#ifdef CONFIG_KEYS 336#ifdef CONFIG_KEYS
332 !p->cred->thread_keyring && 337 !p->cred->thread_keyring &&
@@ -450,14 +455,23 @@ int commit_creds(struct cred *new)
450 if (task->mm) 455 if (task->mm)
451 set_dumpable(task->mm, suid_dumpable); 456 set_dumpable(task->mm, suid_dumpable);
452 task->pdeath_signal = 0; 457 task->pdeath_signal = 0;
458 /*
459 * If a task drops privileges and becomes nondumpable,
460 * the dumpability change must become visible before
461 * the credential change; otherwise, a __ptrace_may_access()
462 * racing with this change may be able to attach to a task it
463 * shouldn't be able to attach to (as if the task had dropped
464 * privileges without becoming nondumpable).
465 * Pairs with a read barrier in __ptrace_may_access().
466 */
453 smp_wmb(); 467 smp_wmb();
454 } 468 }
455 469
456 /* alter the thread keyring */ 470 /* alter the thread keyring */
457 if (!uid_eq(new->fsuid, old->fsuid)) 471 if (!uid_eq(new->fsuid, old->fsuid))
458 key_fsuid_changed(task); 472 key_fsuid_changed(new);
459 if (!gid_eq(new->fsgid, old->fsgid)) 473 if (!gid_eq(new->fsgid, old->fsgid))
460 key_fsgid_changed(task); 474 key_fsgid_changed(new);
461 475
462 /* do it 476 /* do it
463 * RLIMIT_NPROC limits on user->processes have already been checked 477 * RLIMIT_NPROC limits on user->processes have already been checked
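
The ordering the new comment describes is the usual publish/observe pairing; a hedged C11 sketch of the idea (the reader side stands in for the barrier in __ptrace_may_access() that the comment references; none of this is the actual ptrace code):

#include <stdatomic.h>

static _Atomic int dumpable = 1;
static _Atomic(const void *) creds;

/* writer (commit_creds-like): publish the dumpability drop first */
static void writer(const void *new_cred)
{
	atomic_store_explicit(&dumpable, 0, memory_order_relaxed);
	atomic_thread_fence(memory_order_release);	/* like smp_wmb() */
	atomic_store_explicit(&creds, new_cred, memory_order_relaxed);
}

/* reader (__ptrace_may_access-like): check in the opposite order */
static int reader(void)
{
	const void *c = atomic_load_explicit(&creds, memory_order_relaxed);

	atomic_thread_fence(memory_order_acquire);	/* like smp_rmb() */
	/* seeing the new creds guarantees also seeing dumpable == 0 */
	return c != NULL && atomic_load_explicit(&dumpable, memory_order_relaxed);
}
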
diff --git a/kernel/debug/Makefile b/kernel/debug/Makefile
index a85edc339985..332ee6c6ec2c 100644
--- a/kernel/debug/Makefile
+++ b/kernel/debug/Makefile
@@ -1,3 +1,4 @@
1# SPDX-License-Identifier: GPL-2.0-only
1# 2#
2# Makefile for the linux kernel debugger 3# Makefile for the linux kernel debugger
3# 4#
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index 7510dc687c0d..4b280fc7dd67 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -1033,13 +1033,14 @@ int gdb_serial_stub(struct kgdb_state *ks)
1033 return DBG_PASS_EVENT; 1033 return DBG_PASS_EVENT;
1034 } 1034 }
1035#endif 1035#endif
1036 /* Fall through */
1036 case 'C': /* Exception passing */ 1037 case 'C': /* Exception passing */
1037 tmp = gdb_cmd_exception_pass(ks); 1038 tmp = gdb_cmd_exception_pass(ks);
1038 if (tmp > 0) 1039 if (tmp > 0)
1039 goto default_handle; 1040 goto default_handle;
1040 if (tmp == 0) 1041 if (tmp == 0)
1041 break; 1042 break;
1042 /* Fall through on tmp < 0 */ 1043 /* Fall through - on tmp < 0 */
1043 case 'c': /* Continue packet */ 1044 case 'c': /* Continue packet */
1044 case 's': /* Single step packet */ 1045 case 's': /* Single step packet */
1045 if (kgdb_contthread && kgdb_contthread != current) { 1046 if (kgdb_contthread && kgdb_contthread != current) {
@@ -1048,7 +1049,7 @@ int gdb_serial_stub(struct kgdb_state *ks)
1048 break; 1049 break;
1049 } 1050 }
1050 dbg_activate_sw_breakpoints(); 1051 dbg_activate_sw_breakpoints();
1051 /* Fall through to default processing */ 1052 /* Fall through - to default processing */
1052 default: 1053 default:
1053default_handle: 1054default_handle:
1054 error = kgdb_arch_handle_exception(ks->ex_vector, 1055 error = kgdb_arch_handle_exception(ks->ex_vector,
@@ -1094,10 +1095,10 @@ int gdbstub_state(struct kgdb_state *ks, char *cmd)
1094 return error; 1095 return error;
1095 case 's': 1096 case 's':
1096 case 'c': 1097 case 'c':
1097 strcpy(remcom_in_buffer, cmd); 1098 strscpy(remcom_in_buffer, cmd, sizeof(remcom_in_buffer));
1098 return 0; 1099 return 0;
1099 case '$': 1100 case '$':
1100 strcpy(remcom_in_buffer, cmd); 1101 strscpy(remcom_in_buffer, cmd, sizeof(remcom_in_buffer));
1101 gdbstub_use_prev_in_buf = strlen(remcom_in_buffer); 1102 gdbstub_use_prev_in_buf = strlen(remcom_in_buffer);
1102 gdbstub_prev_in_buf_pos = 0; 1103 gdbstub_prev_in_buf_pos = 0;
1103 return 0; 1104 return 0;
diff --git a/kernel/debug/kdb/Makefile b/kernel/debug/kdb/Makefile
index d4fc58f4b88d..efac857c5511 100644
--- a/kernel/debug/kdb/Makefile
+++ b/kernel/debug/kdb/Makefile
@@ -6,7 +6,6 @@
6# Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved. 6# Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
7# 7#
8 8
9CCVERSION := $(shell $(CC) -v 2>&1 | sed -ne '$$p')
10obj-y := kdb_io.o kdb_main.o kdb_support.o kdb_bt.o gen-kdb_cmds.o kdb_bp.o kdb_debugger.o 9obj-y := kdb_io.o kdb_main.o kdb_support.o kdb_bt.o gen-kdb_cmds.o kdb_bp.o kdb_debugger.o
11obj-$(CONFIG_KDB_KEYBOARD) += kdb_keyboard.o 10obj-$(CONFIG_KDB_KEYBOARD) += kdb_keyboard.o
12 11
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 6a4b41484afe..3a5184eb6977 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -446,7 +446,7 @@ poll_again:
446char *kdb_getstr(char *buffer, size_t bufsize, const char *prompt) 446char *kdb_getstr(char *buffer, size_t bufsize, const char *prompt)
447{ 447{
448 if (prompt && kdb_prompt_str != prompt) 448 if (prompt && kdb_prompt_str != prompt)
449 strncpy(kdb_prompt_str, prompt, CMD_BUFLEN); 449 strscpy(kdb_prompt_str, prompt, CMD_BUFLEN);
450 kdb_printf(kdb_prompt_str); 450 kdb_printf(kdb_prompt_str);
451 kdb_nextline = 1; /* Prompt and input resets line number */ 451 kdb_nextline = 1; /* Prompt and input resets line number */
452 return kdb_read(buffer, bufsize); 452 return kdb_read(buffer, bufsize);
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 82a3b32a7cfc..9ecfa37c7fbf 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -2522,7 +2522,6 @@ static int kdb_summary(int argc, const char **argv)
2522 kdb_printf("machine %s\n", init_uts_ns.name.machine); 2522 kdb_printf("machine %s\n", init_uts_ns.name.machine);
2523 kdb_printf("nodename %s\n", init_uts_ns.name.nodename); 2523 kdb_printf("nodename %s\n", init_uts_ns.name.nodename);
2524 kdb_printf("domainname %s\n", init_uts_ns.name.domainname); 2524 kdb_printf("domainname %s\n", init_uts_ns.name.domainname);
2525 kdb_printf("ccversion %s\n", __stringify(CCVERSION));
2526 2525
2527 now = __ktime_get_real_seconds(); 2526 now = __ktime_get_real_seconds();
2528 time64_to_tm(now, 0, &tm); 2527 time64_to_tm(now, 0, &tm);
@@ -2584,7 +2583,7 @@ static int kdb_per_cpu(int argc, const char **argv)
2584 diag = kdbgetularg(argv[3], &whichcpu); 2583 diag = kdbgetularg(argv[3], &whichcpu);
2585 if (diag) 2584 if (diag)
2586 return diag; 2585 return diag;
2587 if (!cpu_online(whichcpu)) { 2586 if (whichcpu >= nr_cpu_ids || !cpu_online(whichcpu)) {
2588 kdb_printf("cpu %ld is not online\n", whichcpu); 2587 kdb_printf("cpu %ld is not online\n", whichcpu);
2589 return KDB_BADCPUNUM; 2588 return KDB_BADCPUNUM;
2590 } 2589 }
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index 50bf9b119bad..b8e6306e7e13 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -192,7 +192,7 @@ int kallsyms_symbol_complete(char *prefix_name, int max_len)
192 192
193 while ((name = kdb_walk_kallsyms(&pos))) { 193 while ((name = kdb_walk_kallsyms(&pos))) {
194 if (strncmp(name, prefix_name, prefix_len) == 0) { 194 if (strncmp(name, prefix_name, prefix_len) == 0) {
195 strcpy(ks_namebuf, name); 195 strscpy(ks_namebuf, name, sizeof(ks_namebuf));
196 /* Work out the longest name that matches the prefix */ 196 /* Work out the longest name that matches the prefix */
197 if (++number == 1) { 197 if (++number == 1) {
198 prev_len = min_t(int, max_len-1, 198 prev_len = min_t(int, max_len-1,
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 2a12b988c717..27725754ac99 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -1,16 +1,7 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
1/* delayacct.c - per-task delay accounting 2/* delayacct.c - per-task delay accounting
2 * 3 *
3 * Copyright (C) Shailabh Nagar, IBM Corp. 2006 4 * Copyright (C) Shailabh Nagar, IBM Corp. 2006
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it would be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
13 * the GNU General Public License for more details.
14 */ 5 */
15 6
16#include <linux/sched.h> 7#include <linux/sched.h>
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index 83d711f8d665..70f8f8d9200e 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -1,3 +1,4 @@
1# SPDX-License-Identifier: GPL-2.0-only
1 2
2config HAS_DMA 3config HAS_DMA
3 bool 4 bool
diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index b2a87905846d..bfc0c17f2a3d 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -214,6 +214,62 @@ bool dma_release_from_contiguous(struct device *dev, struct page *pages,
214 return cma_release(dev_get_cma_area(dev), pages, count); 214 return cma_release(dev_get_cma_area(dev), pages, count);
215} 215}
216 216
217/**
218 * dma_alloc_contiguous() - allocate contiguous pages
219 * @dev: Pointer to device for which the allocation is performed.
220 * @size: Requested allocation size.
221 * @gfp: Allocation flags.
222 *
 223 * This function allocates a contiguous memory buffer for the specified
 224 * device. It first tries to use the device-specific contiguous memory area
 225 * if available, or the default global one, and then falls back to normal pages.
 226 *
 227 * Note that it bypasses one-page allocations from the global area, as the
 228 * addresses within one page are always contiguous, so there is no need to
 229 * waste CMA pages on those; this also helps reduce fragmentation.
230 */
231struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp)
232{
233 int node = dev ? dev_to_node(dev) : NUMA_NO_NODE;
234 size_t count = PAGE_ALIGN(size) >> PAGE_SHIFT;
235 size_t align = get_order(PAGE_ALIGN(size));
236 struct page *page = NULL;
237 struct cma *cma = NULL;
238
239 if (dev && dev->cma_area)
240 cma = dev->cma_area;
241 else if (count > 1)
242 cma = dma_contiguous_default_area;
243
244 /* CMA can be used only in the context which permits sleeping */
245 if (cma && gfpflags_allow_blocking(gfp)) {
246 align = min_t(size_t, align, CONFIG_CMA_ALIGNMENT);
247 page = cma_alloc(cma, count, align, gfp & __GFP_NOWARN);
248 }
249
250 /* Fallback allocation of normal pages */
251 if (!page)
252 page = alloc_pages_node(node, gfp, align);
253 return page;
254}
255
256/**
257 * dma_free_contiguous() - release allocated pages
258 * @dev: Pointer to device for which the pages were allocated.
259 * @page: Pointer to the allocated pages.
260 * @size: Size of allocated pages.
261 *
 262 * This function releases memory allocated by dma_alloc_contiguous(). As
 263 * cma_release() returns false when the provided pages do not belong to the
 264 * contiguous area and true otherwise, this function falls back to
 265 * __free_pages() on a false return.
266 */
267void dma_free_contiguous(struct device *dev, struct page *page, size_t size)
268{
269 if (!cma_release(dev_get_cma_area(dev), page, size >> PAGE_SHIFT))
270 __free_pages(page, get_order(size));
271}
272
217/* 273/*
218 * Support for reserved memory regions defined in device tree 274 * Support for reserved memory regions defined in device tree
219 */ 275 */
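
As quick orientation for how the new pair is consumed, the sketch below condenses the dma-direct usage that appears later in this diff; the wrapper function itself is illustrative only:

/* Illustrative caller: allocate, validate, and release via the new helpers. */
static struct page *alloc_dma_pages(struct device *dev, size_t size, gfp_t gfp)
{
	struct page *page;

	page = dma_alloc_contiguous(dev, size, gfp);	/* CMA or normal pages */
	if (!page)
		return NULL;

	if (!dma_coherent_ok(dev, page_to_phys(page), size)) {
		/* frees correctly whether the pages came from CMA or not */
		dma_free_contiguous(dev, page, size);
		return NULL;
	}
	return page;
}
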
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index badd77670d00..099002d84f46 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -1,20 +1,8 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * Copyright (C) 2008 Advanced Micro Devices, Inc. 3 * Copyright (C) 2008 Advanced Micro Devices, Inc.
3 * 4 *
4 * Author: Joerg Roedel <joerg.roedel@amd.com> 5 * Author: Joerg Roedel <joerg.roedel@amd.com>
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published
8 * by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */ 6 */
19 7
20#define pr_fmt(fmt) "DMA-API: " fmt 8#define pr_fmt(fmt) "DMA-API: " fmt
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 2c2772e9702a..b90e1aede743 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -96,8 +96,6 @@ static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size)
96struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, 96struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
97 dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) 97 dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
98{ 98{
99 unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
100 int page_order = get_order(size);
101 struct page *page = NULL; 99 struct page *page = NULL;
102 u64 phys_mask; 100 u64 phys_mask;
103 101
@@ -109,20 +107,9 @@ struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
109 gfp |= __dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, 107 gfp |= __dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask,
110 &phys_mask); 108 &phys_mask);
111again: 109again:
112 /* CMA can be used only in the context which permits sleeping */ 110 page = dma_alloc_contiguous(dev, size, gfp);
113 if (gfpflags_allow_blocking(gfp)) {
114 page = dma_alloc_from_contiguous(dev, count, page_order,
115 gfp & __GFP_NOWARN);
116 if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
117 dma_release_from_contiguous(dev, page, count);
118 page = NULL;
119 }
120 }
121 if (!page)
122 page = alloc_pages_node(dev_to_node(dev), gfp, page_order);
123
124 if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { 111 if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
125 __free_pages(page, page_order); 112 dma_free_contiguous(dev, page, size);
126 page = NULL; 113 page = NULL;
127 114
128 if (IS_ENABLED(CONFIG_ZONE_DMA32) && 115 if (IS_ENABLED(CONFIG_ZONE_DMA32) &&
@@ -151,10 +138,18 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size,
151 if (!page) 138 if (!page)
152 return NULL; 139 return NULL;
153 140
141 if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) {
142 /* remove any dirty cache lines on the kernel alias */
143 if (!PageHighMem(page))
144 arch_dma_prep_coherent(page, size);
145 /* return the page pointer as the opaque cookie */
146 return page;
147 }
148
154 if (PageHighMem(page)) { 149 if (PageHighMem(page)) {
155 /* 150 /*
156 * Depending on the cma= arguments and per-arch setup 151 * Depending on the cma= arguments and per-arch setup
157 * dma_alloc_from_contiguous could return highmem pages. 152 * dma_alloc_contiguous could return highmem pages.
158 * Without remapping there is no way to return them here, 153 * Without remapping there is no way to return them here,
159 * so log an error and fail. 154 * so log an error and fail.
160 */ 155 */
@@ -171,15 +166,19 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size,
171 *dma_handle = phys_to_dma(dev, page_to_phys(page)); 166 *dma_handle = phys_to_dma(dev, page_to_phys(page));
172 } 167 }
173 memset(ret, 0, size); 168 memset(ret, 0, size);
169
170 if (IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) &&
171 dma_alloc_need_uncached(dev, attrs)) {
172 arch_dma_prep_coherent(page, size);
173 ret = uncached_kernel_address(ret);
174 }
175
174 return ret; 176 return ret;
175} 177}
176 178
177void __dma_direct_free_pages(struct device *dev, size_t size, struct page *page) 179void __dma_direct_free_pages(struct device *dev, size_t size, struct page *page)
178{ 180{
179 unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; 181 dma_free_contiguous(dev, page, size);
180
181 if (!dma_release_from_contiguous(dev, page, count))
182 __free_pages(page, get_order(size));
183} 182}
184 183
185void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, 184void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr,
@@ -187,15 +186,26 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr,
187{ 186{
188 unsigned int page_order = get_order(size); 187 unsigned int page_order = get_order(size);
189 188
189 if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) {
190 /* cpu_addr is a struct page cookie, not a kernel address */
191 __dma_direct_free_pages(dev, size, cpu_addr);
192 return;
193 }
194
190 if (force_dma_unencrypted()) 195 if (force_dma_unencrypted())
191 set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order); 196 set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order);
197
198 if (IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) &&
199 dma_alloc_need_uncached(dev, attrs))
200 cpu_addr = cached_kernel_address(cpu_addr);
192 __dma_direct_free_pages(dev, size, virt_to_page(cpu_addr)); 201 __dma_direct_free_pages(dev, size, virt_to_page(cpu_addr));
193} 202}
194 203
195void *dma_direct_alloc(struct device *dev, size_t size, 204void *dma_direct_alloc(struct device *dev, size_t size,
196 dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) 205 dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
197{ 206{
198 if (!dev_is_dma_coherent(dev)) 207 if (!IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) &&
208 dma_alloc_need_uncached(dev, attrs))
199 return arch_dma_alloc(dev, size, dma_handle, gfp, attrs); 209 return arch_dma_alloc(dev, size, dma_handle, gfp, attrs);
200 return dma_direct_alloc_pages(dev, size, dma_handle, gfp, attrs); 210 return dma_direct_alloc_pages(dev, size, dma_handle, gfp, attrs);
201} 211}
@@ -203,7 +213,8 @@ void *dma_direct_alloc(struct device *dev, size_t size,
203void dma_direct_free(struct device *dev, size_t size, 213void dma_direct_free(struct device *dev, size_t size,
204 void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs) 214 void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs)
205{ 215{
206 if (!dev_is_dma_coherent(dev)) 216 if (!IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) &&
217 dma_alloc_need_uncached(dev, attrs))
207 arch_dma_free(dev, size, cpu_addr, dma_addr, attrs); 218 arch_dma_free(dev, size, cpu_addr, dma_addr, attrs);
208 else 219 else
209 dma_direct_free_pages(dev, size, cpu_addr, dma_addr, attrs); 220 dma_direct_free_pages(dev, size, cpu_addr, dma_addr, attrs);
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index f7afdadb6770..1f628e7ac709 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -317,6 +317,12 @@ void arch_dma_set_mask(struct device *dev, u64 mask);
317 317
318int dma_set_mask(struct device *dev, u64 mask) 318int dma_set_mask(struct device *dev, u64 mask)
319{ 319{
320 /*
321 * Truncate the mask to the actually supported dma_addr_t width to
322 * avoid generating unsupportable addresses.
323 */
324 mask = (dma_addr_t)mask;
325
320 if (!dev->dma_mask || !dma_supported(dev, mask)) 326 if (!dev->dma_mask || !dma_supported(dev, mask))
321 return -EIO; 327 return -EIO;
322 328
@@ -330,6 +336,12 @@ EXPORT_SYMBOL(dma_set_mask);
330#ifndef CONFIG_ARCH_HAS_DMA_SET_COHERENT_MASK 336#ifndef CONFIG_ARCH_HAS_DMA_SET_COHERENT_MASK
331int dma_set_coherent_mask(struct device *dev, u64 mask) 337int dma_set_coherent_mask(struct device *dev, u64 mask)
332{ 338{
339 /*
340 * Truncate the mask to the actually supported dma_addr_t width to
341 * avoid generating unsupportable addresses.
342 */
343 mask = (dma_addr_t)mask;
344
333 if (!dma_supported(dev, mask)) 345 if (!dma_supported(dev, mask))
334 return -EIO; 346 return -EIO;
335 347
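
The truncation added above relies on a plain conversion to dma_addr_t; a standalone illustration of its effect when dma_addr_t is 32 bits wide (the typedef is an assumption for the example):

#include <stdint.h>
#include <stdio.h>

typedef uint32_t dma_addr_t;	/* assume a 32-bit DMA address type */

int main(void)
{
	uint64_t mask = ~0ULL;		/* driver asks for DMA_BIT_MASK(64) */

	mask = (dma_addr_t)mask;	/* clamped to what dma_addr_t can express */
	printf("%#llx\n", (unsigned long long)mask);	/* prints 0xffffffff */
	return 0;
}
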
diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c
index 7a723194ecbe..a594aec07882 100644
--- a/kernel/dma/remap.c
+++ b/kernel/dma/remap.c
@@ -158,6 +158,9 @@ out:
158 158
159bool dma_in_atomic_pool(void *start, size_t size) 159bool dma_in_atomic_pool(void *start, size_t size)
160{ 160{
161 if (unlikely(!atomic_pool))
162 return false;
163
161 return addr_in_gen_pool(atomic_pool, (unsigned long)start, size); 164 return addr_in_gen_pool(atomic_pool, (unsigned long)start, size);
162} 165}
163 166
@@ -199,8 +202,7 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
199 202
200 size = PAGE_ALIGN(size); 203 size = PAGE_ALIGN(size);
201 204
202 if (!gfpflags_allow_blocking(flags) && 205 if (!gfpflags_allow_blocking(flags)) {
203 !(attrs & DMA_ATTR_NO_KERNEL_MAPPING)) {
204 ret = dma_alloc_from_pool(size, &page, flags); 206 ret = dma_alloc_from_pool(size, &page, flags);
205 if (!ret) 207 if (!ret)
206 return NULL; 208 return NULL;
@@ -214,11 +216,6 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
214 /* remove any dirty cache lines on the kernel alias */ 216 /* remove any dirty cache lines on the kernel alias */
215 arch_dma_prep_coherent(page, size); 217 arch_dma_prep_coherent(page, size);
216 218
217 if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) {
218 ret = page; /* opaque cookie */
219 goto done;
220 }
221
222 /* create a coherent mapping */ 219 /* create a coherent mapping */
223 ret = dma_common_contiguous_remap(page, size, VM_USERMAP, 220 ret = dma_common_contiguous_remap(page, size, VM_USERMAP,
224 arch_dma_mmap_pgprot(dev, PAGE_KERNEL, attrs), 221 arch_dma_mmap_pgprot(dev, PAGE_KERNEL, attrs),
@@ -237,10 +234,7 @@ done:
237void arch_dma_free(struct device *dev, size_t size, void *vaddr, 234void arch_dma_free(struct device *dev, size_t size, void *vaddr,
238 dma_addr_t dma_handle, unsigned long attrs) 235 dma_addr_t dma_handle, unsigned long attrs)
239{ 236{
240 if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) { 237 if (!dma_free_from_pool(vaddr, PAGE_ALIGN(size))) {
241 /* vaddr is a struct page cookie, not a kernel address */
242 __dma_direct_free_pages(dev, size, vaddr);
243 } else if (!dma_free_from_pool(vaddr, PAGE_ALIGN(size))) {
244 phys_addr_t phys = dma_to_phys(dev, dma_handle); 238 phys_addr_t phys = dma_to_phys(dev, dma_handle);
245 struct page *page = pfn_to_page(__phys_to_pfn(phys)); 239 struct page *page = pfn_to_page(__phys_to_pfn(phys));
246 240
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 6f7619c1f877..62fa5a82a065 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * Dynamic DMA mapping support. 3 * Dynamic DMA mapping support.
3 * 4 *
@@ -695,29 +696,12 @@ bool is_swiotlb_active(void)
695 696
696static int __init swiotlb_create_debugfs(void) 697static int __init swiotlb_create_debugfs(void)
697{ 698{
698 struct dentry *d_swiotlb_usage; 699 struct dentry *root;
699 struct dentry *ent;
700
701 d_swiotlb_usage = debugfs_create_dir("swiotlb", NULL);
702
703 if (!d_swiotlb_usage)
704 return -ENOMEM;
705
706 ent = debugfs_create_ulong("io_tlb_nslabs", 0400,
707 d_swiotlb_usage, &io_tlb_nslabs);
708 if (!ent)
709 goto fail;
710
711 ent = debugfs_create_ulong("io_tlb_used", 0400,
712 d_swiotlb_usage, &io_tlb_used);
713 if (!ent)
714 goto fail;
715 700
701 root = debugfs_create_dir("swiotlb", NULL);
702 debugfs_create_ulong("io_tlb_nslabs", 0400, root, &io_tlb_nslabs);
703 debugfs_create_ulong("io_tlb_used", 0400, root, &io_tlb_used);
716 return 0; 704 return 0;
717
718fail:
719 debugfs_remove_recursive(d_swiotlb_usage);
720 return -ENOMEM;
721} 705}
722 706
723late_initcall(swiotlb_create_debugfs); 707late_initcall(swiotlb_create_debugfs);
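
The rewrite follows the convention that debugfs creation helpers are not error-checked: a failure leaves an error pointer that later debugfs calls accept and ignore, and debugfs problems must never affect normal operation. The same pattern in a self-contained initcall (names here are illustrative, not from the tree):

	#include <linux/debugfs.h>
	#include <linux/init.h>

	static unsigned long example_counter;

	static int __init example_debugfs_init(void)
	{
		struct dentry *dir;

		/* No NULL/IS_ERR checks needed; debugfs copes on its own. */
		dir = debugfs_create_dir("example", NULL);
		debugfs_create_ulong("counter", 0400, dir, &example_counter);
		return 0;
	}
	late_initcall(example_debugfs_init);
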
diff --git a/kernel/events/core.c b/kernel/events/core.c
index abbd4b3b96c2..785d708f8553 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2952,6 +2952,12 @@ static void ctx_sched_out(struct perf_event_context *ctx,
2952 if (!ctx->nr_active || !(is_active & EVENT_ALL)) 2952 if (!ctx->nr_active || !(is_active & EVENT_ALL))
2953 return; 2953 return;
2954 2954
2955 /*
2956 * If we had been multiplexing, no rotations are necessary, now no events
2957 * are active.
2958 */
2959 ctx->rotate_necessary = 0;
2960
2955 perf_pmu_disable(ctx->pmu); 2961 perf_pmu_disable(ctx->pmu);
2956 if (is_active & EVENT_PINNED) { 2962 if (is_active & EVENT_PINNED) {
2957 list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list) 2963 list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list)
@@ -3319,10 +3325,13 @@ static int flexible_sched_in(struct perf_event *event, void *data)
3319 return 0; 3325 return 0;
3320 3326
3321 if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) { 3327 if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
3322 if (!group_sched_in(event, sid->cpuctx, sid->ctx)) 3328 int ret = group_sched_in(event, sid->cpuctx, sid->ctx);
3323 list_add_tail(&event->active_list, &sid->ctx->flexible_active); 3329 if (ret) {
3324 else
3325 sid->can_add_hw = 0; 3330 sid->can_add_hw = 0;
3331 sid->ctx->rotate_necessary = 1;
3332 return 0;
3333 }
3334 list_add_tail(&event->active_list, &sid->ctx->flexible_active);
3326 } 3335 }
3327 3336
3328 return 0; 3337 return 0;
@@ -3690,24 +3699,17 @@ ctx_first_active(struct perf_event_context *ctx)
3690static bool perf_rotate_context(struct perf_cpu_context *cpuctx) 3699static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
3691{ 3700{
3692 struct perf_event *cpu_event = NULL, *task_event = NULL; 3701 struct perf_event *cpu_event = NULL, *task_event = NULL;
3693 bool cpu_rotate = false, task_rotate = false; 3702 struct perf_event_context *task_ctx = NULL;
3694 struct perf_event_context *ctx = NULL; 3703 int cpu_rotate, task_rotate;
3695 3704
3696 /* 3705 /*
3697 * Since we run this from IRQ context, nobody can install new 3706 * Since we run this from IRQ context, nobody can install new
3698 * events, thus the event count values are stable. 3707 * events, thus the event count values are stable.
3699 */ 3708 */
3700 3709
3701 if (cpuctx->ctx.nr_events) { 3710 cpu_rotate = cpuctx->ctx.rotate_necessary;
3702 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) 3711 task_ctx = cpuctx->task_ctx;
3703 cpu_rotate = true; 3712 task_rotate = task_ctx ? task_ctx->rotate_necessary : 0;
3704 }
3705
3706 ctx = cpuctx->task_ctx;
3707 if (ctx && ctx->nr_events) {
3708 if (ctx->nr_events != ctx->nr_active)
3709 task_rotate = true;
3710 }
3711 3713
3712 if (!(cpu_rotate || task_rotate)) 3714 if (!(cpu_rotate || task_rotate))
3713 return false; 3715 return false;
@@ -3716,7 +3718,7 @@ static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
3716 perf_pmu_disable(cpuctx->ctx.pmu); 3718 perf_pmu_disable(cpuctx->ctx.pmu);
3717 3719
3718 if (task_rotate) 3720 if (task_rotate)
3719 task_event = ctx_first_active(ctx); 3721 task_event = ctx_first_active(task_ctx);
3720 if (cpu_rotate) 3722 if (cpu_rotate)
3721 cpu_event = ctx_first_active(&cpuctx->ctx); 3723 cpu_event = ctx_first_active(&cpuctx->ctx);
3722 3724
@@ -3724,17 +3726,17 @@ static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
3724 * As per the order given at ctx_resched() first 'pop' task flexible 3726 * As per the order given at ctx_resched() first 'pop' task flexible
3725 * and then, if needed CPU flexible. 3727 * and then, if needed CPU flexible.
3726 */ 3728 */
3727 if (task_event || (ctx && cpu_event)) 3729 if (task_event || (task_ctx && cpu_event))
3728 ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE); 3730 ctx_sched_out(task_ctx, cpuctx, EVENT_FLEXIBLE);
3729 if (cpu_event) 3731 if (cpu_event)
3730 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 3732 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
3731 3733
3732 if (task_event) 3734 if (task_event)
3733 rotate_ctx(ctx, task_event); 3735 rotate_ctx(task_ctx, task_event);
3734 if (cpu_event) 3736 if (cpu_event)
3735 rotate_ctx(&cpuctx->ctx, cpu_event); 3737 rotate_ctx(&cpuctx->ctx, cpu_event);
3736 3738
3737 perf_event_sched_in(cpuctx, ctx, current); 3739 perf_event_sched_in(cpuctx, task_ctx, current);
3738 3740
3739 perf_pmu_enable(cpuctx->ctx.pmu); 3741 perf_pmu_enable(cpuctx->ctx.pmu);
3740 perf_ctx_unlock(cpuctx, cpuctx->task_ctx); 3742 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
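
The rotation decision now keys off a per-context rotate_necessary hint instead of re-deriving it from nr_events vs. nr_active on every tick. A toy model of the bookkeeping (illustrative C, not kernel code): the flag is set the moment a flexible group fails to fit on the PMU and cleared when the context is scheduled out, so only genuinely overcommitted contexts pay for rotation.

	struct toy_ctx {
		int rotate_necessary;
	};

	static int toy_group_sched_in(struct toy_ctx *ctx, int fits_on_pmu)
	{
		if (!fits_on_pmu) {
			ctx->rotate_necessary = 1;	/* multiplexing is now required */
			return -1;
		}
		return 0;
	}

	static void toy_ctx_sched_out(struct toy_ctx *ctx)
	{
		ctx->rotate_necessary = 0;	/* nothing active, nothing to rotate */
	}

	static int toy_needs_rotation(const struct toy_ctx *ctx)
	{
		return ctx->rotate_necessary;
	}
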
@@ -5005,6 +5007,9 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
5005 if (perf_event_check_period(event, value)) 5007 if (perf_event_check_period(event, value))
5006 return -EINVAL; 5008 return -EINVAL;
5007 5009
5010 if (!event->attr.freq && (value & (1ULL << 63)))
5011 return -EINVAL;
5012
5008 event_function_call(event, __perf_event_period, &value); 5013 event_function_call(event, __perf_event_period, &value);
5009 5014
5010 return 0; 5015 return 0;
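
The new check appears to guard against periods with the top bit set, which turn negative once the value reaches the signed 64-bit arithmetic used for period accounting. A quick userspace demonstration of the problematic interpretation:

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint64_t period = 1ULL << 63;	/* smallest value the ioctl now rejects */

		/* Viewed as a signed 64-bit quantity this is hugely negative. */
		printf("as s64: %lld\n", (long long)period);
		return 0;
	}
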
@@ -5923,7 +5928,7 @@ static void perf_sample_regs_user(struct perf_regs *regs_user,
5923 if (user_mode(regs)) { 5928 if (user_mode(regs)) {
5924 regs_user->abi = perf_reg_abi(current); 5929 regs_user->abi = perf_reg_abi(current);
5925 regs_user->regs = regs; 5930 regs_user->regs = regs;
5926 } else if (current->mm) { 5931 } else if (!(current->flags & PF_KTHREAD)) {
5927 perf_get_regs_user(regs_user, regs, regs_user_copy); 5932 perf_get_regs_user(regs_user, regs, regs_user_copy);
5928 } else { 5933 } else {
5929 regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE; 5934 regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
@@ -8532,9 +8537,9 @@ static int perf_tp_event_match(struct perf_event *event,
8532 if (event->hw.state & PERF_HES_STOPPED) 8537 if (event->hw.state & PERF_HES_STOPPED)
8533 return 0; 8538 return 0;
8534 /* 8539 /*
8535 * All tracepoints are from kernel-space. 8540 * If exclude_kernel, only trace user-space tracepoints (uprobes)
8536 */ 8541 */
8537 if (event->attr.exclude_kernel) 8542 if (event->attr.exclude_kernel && !user_mode(regs))
8538 return 0; 8543 return 0;
8539 8544
8540 if (!perf_tp_filter_match(event, data)) 8545 if (!perf_tp_filter_match(event, data))
@@ -9874,6 +9879,12 @@ static int pmu_dev_alloc(struct pmu *pmu)
9874 if (ret) 9879 if (ret)
9875 goto del_dev; 9880 goto del_dev;
9876 9881
9882 if (pmu->attr_update)
9883 ret = sysfs_update_groups(&pmu->dev->kobj, pmu->attr_update);
9884
9885 if (ret)
9886 goto del_dev;
9887
9877out: 9888out:
9878 return ret; 9889 return ret;
9879 9890
@@ -10033,6 +10044,12 @@ void perf_pmu_unregister(struct pmu *pmu)
10033} 10044}
10034EXPORT_SYMBOL_GPL(perf_pmu_unregister); 10045EXPORT_SYMBOL_GPL(perf_pmu_unregister);
10035 10046
10047static inline bool has_extended_regs(struct perf_event *event)
10048{
10049 return (event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK) ||
10050 (event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK);
10051}
10052
10036static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) 10053static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
10037{ 10054{
10038 struct perf_event_context *ctx = NULL; 10055 struct perf_event_context *ctx = NULL;
@@ -10064,12 +10081,16 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
10064 perf_event_ctx_unlock(event->group_leader, ctx); 10081 perf_event_ctx_unlock(event->group_leader, ctx);
10065 10082
10066 if (!ret) { 10083 if (!ret) {
10084 if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) &&
10085 has_extended_regs(event))
10086 ret = -EOPNOTSUPP;
10087
10067 if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE && 10088 if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE &&
10068 event_has_any_exclude_flag(event)) { 10089 event_has_any_exclude_flag(event))
10069 if (event->destroy)
10070 event->destroy(event);
10071 ret = -EINVAL; 10090 ret = -EINVAL;
10072 } 10091
10092 if (ret && event->destroy)
10093 event->destroy(event);
10073 } 10094 }
10074 10095
10075 if (ret) 10096 if (ret)
@@ -10680,11 +10701,11 @@ static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
10680 break; 10701 break;
10681 10702
10682 case CLOCK_BOOTTIME: 10703 case CLOCK_BOOTTIME:
10683 event->clock = &ktime_get_boot_ns; 10704 event->clock = &ktime_get_boottime_ns;
10684 break; 10705 break;
10685 10706
10686 case CLOCK_TAI: 10707 case CLOCK_TAI:
10687 event->clock = &ktime_get_tai_ns; 10708 event->clock = &ktime_get_clocktai_ns;
10688 break; 10709 break;
10689 10710
10690 default: 10711 default:
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 79c47076700a..3aef4191798c 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -24,7 +24,7 @@ struct ring_buffer {
24 atomic_t poll; /* POLL_ for wakeups */ 24 atomic_t poll; /* POLL_ for wakeups */
25 25
26 local_t head; /* write position */ 26 local_t head; /* write position */
27 local_t nest; /* nested writers */ 27 unsigned int nest; /* nested writers */
28 local_t events; /* event limit */ 28 local_t events; /* event limit */
29 local_t wakeup; /* wakeup stamp */ 29 local_t wakeup; /* wakeup stamp */
30 local_t lost; /* nr records lost */ 30 local_t lost; /* nr records lost */
@@ -41,7 +41,7 @@ struct ring_buffer {
41 41
42 /* AUX area */ 42 /* AUX area */
43 long aux_head; 43 long aux_head;
44 local_t aux_nest; 44 unsigned int aux_nest;
45 long aux_wakeup; /* last aux_watermark boundary crossed by aux_head */ 45 long aux_wakeup; /* last aux_watermark boundary crossed by aux_head */
46 unsigned long aux_pgoff; 46 unsigned long aux_pgoff;
47 int aux_nr_pages; 47 int aux_nr_pages;
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 674b35383491..ffb59a4ef4ff 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -38,7 +38,12 @@ static void perf_output_get_handle(struct perf_output_handle *handle)
38 struct ring_buffer *rb = handle->rb; 38 struct ring_buffer *rb = handle->rb;
39 39
40 preempt_disable(); 40 preempt_disable();
41 local_inc(&rb->nest); 41
42 /*
43 * Avoid an explicit LOAD/STORE such that architectures with memops
44 * can use them.
45 */
46 (*(volatile unsigned int *)&rb->nest)++;
42 handle->wakeup = local_read(&rb->wakeup); 47 handle->wakeup = local_read(&rb->wakeup);
43} 48}
44 49
@@ -46,17 +51,35 @@ static void perf_output_put_handle(struct perf_output_handle *handle)
46{ 51{
47 struct ring_buffer *rb = handle->rb; 52 struct ring_buffer *rb = handle->rb;
48 unsigned long head; 53 unsigned long head;
54 unsigned int nest;
55
56 /*
57 * If this isn't the outermost nesting, we don't have to update
58 * @rb->user_page->data_head.
59 */
60 nest = READ_ONCE(rb->nest);
61 if (nest > 1) {
62 WRITE_ONCE(rb->nest, nest - 1);
63 goto out;
64 }
49 65
50again: 66again:
67 /*
68 * In order to avoid publishing a head value that goes backwards,
69 * we must ensure the load of @rb->head happens after we've
70 * incremented @rb->nest.
71 *
72 * Otherwise we can observe a @rb->head value before one published
73 * by an IRQ/NMI happening between the load and the increment.
74 */
75 barrier();
51 head = local_read(&rb->head); 76 head = local_read(&rb->head);
52 77
53 /* 78 /*
54 * IRQ/NMI can happen here, which means we can miss a head update. 79 * IRQ/NMI can happen here and advance @rb->head, causing our
80 * load above to be stale.
55 */ 81 */
56 82
57 if (!local_dec_and_test(&rb->nest))
58 goto out;
59
60 /* 83 /*
61 * Since the mmap() consumer (userspace) can run on a different CPU: 84 * Since the mmap() consumer (userspace) can run on a different CPU:
62 * 85 *
@@ -84,14 +107,23 @@ again:
84 * See perf_output_begin(). 107 * See perf_output_begin().
85 */ 108 */
86 smp_wmb(); /* B, matches C */ 109 smp_wmb(); /* B, matches C */
87 rb->user_page->data_head = head; 110 WRITE_ONCE(rb->user_page->data_head, head);
88 111
89 /* 112 /*
90 * Now check if we missed an update -- rely on previous implied 113 * We must publish the head before decrementing the nest count,
91 * compiler barriers to force a re-read. 114 * otherwise an IRQ/NMI can publish a more recent head value and our
115 * write will (temporarily) publish a stale value.
92 */ 116 */
117 barrier();
118 WRITE_ONCE(rb->nest, 0);
119
120 /*
121 * Ensure we decrement @rb->nest before we validate the @rb->head.
122 * Otherwise we cannot be sure we caught the 'last' nested update.
123 */
124 barrier();
93 if (unlikely(head != local_read(&rb->head))) { 125 if (unlikely(head != local_read(&rb->head))) {
94 local_inc(&rb->nest); 126 WRITE_ONCE(rb->nest, 1);
95 goto again; 127 goto again;
96 } 128 }
97 129
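
Because nesting only happens on the local CPU (an IRQ/NMI interrupting the outer writer), the protocol above gets by with compiler barriers and ONCE accesses rather than atomics. A simplified, self-contained model of the publish sequence (head is a plain unsigned long here; the kernel keeps it in a local_t):

	#define barrier()	__asm__ __volatile__("" ::: "memory")
	#define WRITE_ONCE(x, v)	(*(volatile __typeof__(x) *)&(x) = (v))
	#define READ_ONCE(x)	(*(volatile __typeof__(x) *)&(x))

	struct buf {
		unsigned int nest;		/* nested writers, same CPU only */
		unsigned long head;		/* private write position */
		unsigned long data_head;	/* value published to userspace */
	};

	void put_handle(struct buf *rb)
	{
		unsigned int nest = READ_ONCE(rb->nest);
		unsigned long head;

		if (nest > 1) {			/* inner writer: outermost publishes */
			WRITE_ONCE(rb->nest, nest - 1);
			return;
		}
	again:
		barrier();			/* order the head load after the nest update */
		head = READ_ONCE(rb->head);
		WRITE_ONCE(rb->data_head, head);	/* kernel issues smp_wmb() first */
		barrier();			/* publish head before dropping nest */
		WRITE_ONCE(rb->nest, 0);
		barrier();			/* re-check only after nest is visible */
		if (head != READ_ONCE(rb->head)) {	/* an IRQ advanced head meanwhile */
			WRITE_ONCE(rb->nest, 1);
			goto again;
		}
	}
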
@@ -330,6 +362,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
330 struct perf_event *output_event = event; 362 struct perf_event *output_event = event;
331 unsigned long aux_head, aux_tail; 363 unsigned long aux_head, aux_tail;
332 struct ring_buffer *rb; 364 struct ring_buffer *rb;
365 unsigned int nest;
333 366
334 if (output_event->parent) 367 if (output_event->parent)
335 output_event = output_event->parent; 368 output_event = output_event->parent;
@@ -360,13 +393,16 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
360 if (!refcount_inc_not_zero(&rb->aux_refcount)) 393 if (!refcount_inc_not_zero(&rb->aux_refcount))
361 goto err; 394 goto err;
362 395
396 nest = READ_ONCE(rb->aux_nest);
363 /* 397 /*
364 * Nesting is not supported for AUX area, make sure nested 398 * Nesting is not supported for AUX area, make sure nested
365 * writers are caught early 399 * writers are caught early
366 */ 400 */
367 if (WARN_ON_ONCE(local_xchg(&rb->aux_nest, 1))) 401 if (WARN_ON_ONCE(nest))
368 goto err_put; 402 goto err_put;
369 403
404 WRITE_ONCE(rb->aux_nest, nest + 1);
405
370 aux_head = rb->aux_head; 406 aux_head = rb->aux_head;
371 407
372 handle->rb = rb; 408 handle->rb = rb;
@@ -394,7 +430,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
394 if (!handle->size) { /* A, matches D */ 430 if (!handle->size) { /* A, matches D */
395 event->pending_disable = smp_processor_id(); 431 event->pending_disable = smp_processor_id();
396 perf_output_wakeup(handle); 432 perf_output_wakeup(handle);
397 local_set(&rb->aux_nest, 0); 433 WRITE_ONCE(rb->aux_nest, 0);
398 goto err_put; 434 goto err_put;
399 } 435 }
400 } 436 }
@@ -471,7 +507,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
471 perf_event_aux_event(handle->event, aux_head, size, 507 perf_event_aux_event(handle->event, aux_head, size,
472 handle->aux_flags); 508 handle->aux_flags);
473 509
474 rb->user_page->aux_head = rb->aux_head; 510 WRITE_ONCE(rb->user_page->aux_head, rb->aux_head);
475 if (rb_need_aux_wakeup(rb)) 511 if (rb_need_aux_wakeup(rb))
476 wakeup = true; 512 wakeup = true;
477 513
@@ -483,7 +519,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
483 519
484 handle->event = NULL; 520 handle->event = NULL;
485 521
486 local_set(&rb->aux_nest, 0); 522 WRITE_ONCE(rb->aux_nest, 0);
487 /* can't be last */ 523 /* can't be last */
488 rb_free_aux(rb); 524 rb_free_aux(rb);
489 ring_buffer_put(rb); 525 ring_buffer_put(rb);
@@ -503,7 +539,7 @@ int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size)
503 539
504 rb->aux_head += size; 540 rb->aux_head += size;
505 541
506 rb->user_page->aux_head = rb->aux_head; 542 WRITE_ONCE(rb->user_page->aux_head, rb->aux_head);
507 if (rb_need_aux_wakeup(rb)) { 543 if (rb_need_aux_wakeup(rb)) {
508 perf_output_wakeup(handle); 544 perf_output_wakeup(handle);
509 handle->wakeup = rb->aux_wakeup + rb->aux_watermark; 545 handle->wakeup = rb->aux_wakeup + rb->aux_watermark;
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 78f61bfc6b79..84fa00497c49 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -46,7 +46,7 @@ static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */
46static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; 46static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
47#define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ]) 47#define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
48 48
49static struct percpu_rw_semaphore dup_mmap_sem; 49DEFINE_STATIC_PERCPU_RWSEM(dup_mmap_sem);
50 50
51/* Have a copy of original instruction */ 51/* Have a copy of original instruction */
52#define UPROBE_COPY_INSN 0 52#define UPROBE_COPY_INSN 0
@@ -2112,7 +2112,7 @@ static void handle_trampoline(struct pt_regs *regs)
2112 2112
2113 sigill: 2113 sigill:
2114 uprobe_warn(current, "handle uretprobe, sending SIGILL."); 2114 uprobe_warn(current, "handle uretprobe, sending SIGILL.");
2115 force_sig(SIGILL, current); 2115 force_sig(SIGILL);
2116 2116
2117} 2117}
2118 2118
@@ -2228,7 +2228,7 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
2228 2228
2229 if (unlikely(err)) { 2229 if (unlikely(err)) {
2230 uprobe_warn(current, "execute the probed insn, sending SIGILL."); 2230 uprobe_warn(current, "execute the probed insn, sending SIGILL.");
2231 force_sig(SIGILL, current); 2231 force_sig(SIGILL);
2232 } 2232 }
2233} 2233}
2234 2234
@@ -2302,7 +2302,5 @@ void __init uprobes_init(void)
2302 for (i = 0; i < UPROBES_HASH_SZ; i++) 2302 for (i = 0; i < UPROBES_HASH_SZ; i++)
2303 mutex_init(&uprobes_mmap_mutex[i]); 2303 mutex_init(&uprobes_mmap_mutex[i]);
2304 2304
2305 BUG_ON(percpu_init_rwsem(&dup_mmap_sem));
2306
2307 BUG_ON(register_die_notifier(&uprobe_exception_nb)); 2305 BUG_ON(register_die_notifier(&uprobe_exception_nb));
2308} 2306}
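
DEFINE_STATIC_PERCPU_RWSEM() gives a compile-time initialized semaphore, which is why the percpu_init_rwsem()/BUG_ON() call in uprobes_init() can go away. The usual shape of the pattern (example_sem and the two functions are illustrative):

	#include <linux/percpu-rwsem.h>

	DEFINE_STATIC_PERCPU_RWSEM(example_sem);

	static void example_read_side(void)
	{
		percpu_down_read(&example_sem);
		/* ... fast, frequently-taken read section ... */
		percpu_up_read(&example_sem);
	}

	static void example_write_side(void)
	{
		percpu_down_write(&example_sem);
		/* ... rare, exclusive section ... */
		percpu_up_write(&example_sem);
	}
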
diff --git a/kernel/exit.c b/kernel/exit.c
index 2166c2d92ddc..a75b6a7f458a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * linux/kernel/exit.c 3 * linux/kernel/exit.c
3 * 4 *
@@ -194,6 +195,7 @@ repeat:
194 rcu_read_unlock(); 195 rcu_read_unlock();
195 196
196 proc_flush_task(p); 197 proc_flush_task(p);
198 cgroup_release(p);
197 199
198 write_lock_irq(&tasklist_lock); 200 write_lock_irq(&tasklist_lock);
199 ptrace_release_task(p); 201 ptrace_release_task(p);
@@ -219,7 +221,6 @@ repeat:
219 } 221 }
220 222
221 write_unlock_irq(&tasklist_lock); 223 write_unlock_irq(&tasklist_lock);
222 cgroup_release(p);
223 release_thread(p); 224 release_thread(p);
224 call_rcu(&p->rcu, delayed_put_task_struct); 225 call_rcu(&p->rcu, delayed_put_task_struct);
225 226
@@ -422,7 +423,7 @@ retry:
422 * freed task structure. 423 * freed task structure.
423 */ 424 */
424 if (atomic_read(&mm->mm_users) <= 1) { 425 if (atomic_read(&mm->mm_users) <= 1) {
425 mm->owner = NULL; 426 WRITE_ONCE(mm->owner, NULL);
426 return; 427 return;
427 } 428 }
428 429
@@ -462,7 +463,7 @@ retry:
462 * most likely racing with swapoff (try_to_unuse()) or /proc or 463 * most likely racing with swapoff (try_to_unuse()) or /proc or
463 * ptrace or page migration (get_task_mm()). Mark owner as NULL. 464 * ptrace or page migration (get_task_mm()). Mark owner as NULL.
464 */ 465 */
465 mm->owner = NULL; 466 WRITE_ONCE(mm->owner, NULL);
466 return; 467 return;
467 468
468assign_new_owner: 469assign_new_owner:
@@ -483,7 +484,7 @@ assign_new_owner:
483 put_task_struct(c); 484 put_task_struct(c);
484 goto retry; 485 goto retry;
485 } 486 }
486 mm->owner = c; 487 WRITE_ONCE(mm->owner, c);
487 task_unlock(c); 488 task_unlock(c);
488 put_task_struct(c); 489 put_task_struct(c);
489} 490}
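
Switching the mm->owner stores to WRITE_ONCE() documents that the field is read locklessly elsewhere; such readers are expected to use READ_ONCE() (or rcu_dereference() where RCU protects the pointer) so the compiler cannot tear or re-load the access. A generic sketch of the pairing, with illustrative names:

	#include <linux/compiler.h>
	#include <linux/sched.h>

	struct owner_holder {
		struct task_struct *owner;
	};

	static void owner_clear(struct owner_holder *h)
	{
		WRITE_ONCE(h->owner, NULL);		/* single, untorn store */
	}

	static bool owner_is(struct owner_holder *h, struct task_struct *p)
	{
		return READ_ONCE(h->owner) == p;	/* single, untorn load */
	}
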
diff --git a/kernel/extable.c b/kernel/extable.c
index 6a5b61ebc66c..e23cce6e6092 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -1,19 +1,7 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
1/* Rewritten by Rusty Russell, on the backs of many others... 2/* Rewritten by Rusty Russell, on the backs of many others...
2 Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM. 3 Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM.
3 4
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17*/ 5*/
18#include <linux/ftrace.h> 6#include <linux/ftrace.h>
19#include <linux/memory.h> 7#include <linux/memory.h>
diff --git a/kernel/fail_function.c b/kernel/fail_function.c
index feb80712b913..63b349168da7 100644
--- a/kernel/fail_function.c
+++ b/kernel/fail_function.c
@@ -152,20 +152,13 @@ static int fei_retval_get(void *data, u64 *val)
152DEFINE_DEBUGFS_ATTRIBUTE(fei_retval_ops, fei_retval_get, fei_retval_set, 152DEFINE_DEBUGFS_ATTRIBUTE(fei_retval_ops, fei_retval_get, fei_retval_set,
153 "%llx\n"); 153 "%llx\n");
154 154
155static int fei_debugfs_add_attr(struct fei_attr *attr) 155static void fei_debugfs_add_attr(struct fei_attr *attr)
156{ 156{
157 struct dentry *dir; 157 struct dentry *dir;
158 158
159 dir = debugfs_create_dir(attr->kp.symbol_name, fei_debugfs_dir); 159 dir = debugfs_create_dir(attr->kp.symbol_name, fei_debugfs_dir);
160 if (!dir)
161 return -ENOMEM;
162
163 if (!debugfs_create_file("retval", 0600, dir, attr, &fei_retval_ops)) {
164 debugfs_remove_recursive(dir);
165 return -ENOMEM;
166 }
167 160
168 return 0; 161 debugfs_create_file("retval", 0600, dir, attr, &fei_retval_ops);
169} 162}
170 163
171static void fei_debugfs_remove_attr(struct fei_attr *attr) 164static void fei_debugfs_remove_attr(struct fei_attr *attr)
@@ -306,7 +299,7 @@ static ssize_t fei_write(struct file *file, const char __user *buffer,
306 299
307 ret = register_kprobe(&attr->kp); 300 ret = register_kprobe(&attr->kp);
308 if (!ret) 301 if (!ret)
309 ret = fei_debugfs_add_attr(attr); 302 fei_debugfs_add_attr(attr);
310 if (ret < 0) 303 if (ret < 0)
311 fei_attr_remove(attr); 304 fei_attr_remove(attr);
312 else { 305 else {
@@ -337,19 +330,13 @@ static int __init fei_debugfs_init(void)
337 return PTR_ERR(dir); 330 return PTR_ERR(dir);
338 331
339 /* injectable attribute is just a symlink of error_inject/list */ 332 /* injectable attribute is just a symlink of error_inject/list */
340 if (!debugfs_create_symlink("injectable", dir, 333 debugfs_create_symlink("injectable", dir, "../error_injection/list");
341 "../error_injection/list"))
342 goto error;
343 334
344 if (!debugfs_create_file("inject", 0600, dir, NULL, &fei_ops)) 335 debugfs_create_file("inject", 0600, dir, NULL, &fei_ops);
345 goto error;
346 336
347 fei_debugfs_dir = dir; 337 fei_debugfs_dir = dir;
348 338
349 return 0; 339 return 0;
350error:
351 debugfs_remove_recursive(dir);
352 return -ENOMEM;
353} 340}
354 341
355late_initcall(fei_debugfs_init); 342late_initcall(fei_debugfs_init);
diff --git a/kernel/fork.c b/kernel/fork.c
index 737db1828437..8f3e2d97d771 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * linux/kernel/fork.c 3 * linux/kernel/fork.c
3 * 4 *
@@ -122,7 +123,7 @@
122unsigned long total_forks; /* Handle normal Linux uptimes. */ 123unsigned long total_forks; /* Handle normal Linux uptimes. */
123int nr_threads; /* The idle threads do not count.. */ 124int nr_threads; /* The idle threads do not count.. */
124 125
125int max_threads; /* tunable limit on nr_threads */ 126static int max_threads; /* tunable limit on nr_threads */
126 127
127DEFINE_PER_CPU(unsigned long, process_counts) = 0; 128DEFINE_PER_CPU(unsigned long, process_counts) = 0;
128 129
@@ -247,7 +248,11 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
247 struct page *page = alloc_pages_node(node, THREADINFO_GFP, 248 struct page *page = alloc_pages_node(node, THREADINFO_GFP,
248 THREAD_SIZE_ORDER); 249 THREAD_SIZE_ORDER);
249 250
250 return page ? page_address(page) : NULL; 251 if (likely(page)) {
252 tsk->stack = page_address(page);
253 return tsk->stack;
254 }
255 return NULL;
251#endif 256#endif
252} 257}
253 258
@@ -893,6 +898,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
893#ifdef CONFIG_STACKPROTECTOR 898#ifdef CONFIG_STACKPROTECTOR
894 tsk->stack_canary = get_random_canary(); 899 tsk->stack_canary = get_random_canary();
895#endif 900#endif
901 if (orig->cpus_ptr == &orig->cpus_mask)
902 tsk->cpus_ptr = &tsk->cpus_mask;
896 903
897 /* 904 /*
898 * One for us, one for whoever does the "release_task()" (usually 905 * One for us, one for whoever does the "release_task()" (usually
@@ -955,6 +962,15 @@ static void mm_init_aio(struct mm_struct *mm)
955#endif 962#endif
956} 963}
957 964
965static __always_inline void mm_clear_owner(struct mm_struct *mm,
966 struct task_struct *p)
967{
968#ifdef CONFIG_MEMCG
969 if (mm->owner == p)
970 WRITE_ONCE(mm->owner, NULL);
971#endif
972}
973
958static void mm_init_owner(struct mm_struct *mm, struct task_struct *p) 974static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
959{ 975{
960#ifdef CONFIG_MEMCG 976#ifdef CONFIG_MEMCG
@@ -1343,6 +1359,7 @@ static struct mm_struct *dup_mm(struct task_struct *tsk,
1343free_pt: 1359free_pt:
1344 /* don't put binfmt in mmput, we haven't got module yet */ 1360 /* don't put binfmt in mmput, we haven't got module yet */
1345 mm->binfmt = NULL; 1361 mm->binfmt = NULL;
1362 mm_init_owner(mm, NULL);
1346 mmput(mm); 1363 mmput(mm);
1347 1364
1348fail_nomem: 1365fail_nomem:
@@ -1694,36 +1711,52 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
1694} 1711}
1695#endif 1712#endif
1696 1713
1714/*
1715 * Poll support for process exit notification.
1716 */
1717static unsigned int pidfd_poll(struct file *file, struct poll_table_struct *pts)
1718{
1719 struct task_struct *task;
1720 struct pid *pid = file->private_data;
1721 int poll_flags = 0;
1722
1723 poll_wait(file, &pid->wait_pidfd, pts);
1724
1725 rcu_read_lock();
1726 task = pid_task(pid, PIDTYPE_PID);
1727 /*
1728 * Inform pollers only when the whole thread group exits.
1729 * If the thread group leader exits before all other threads in the
1730 * group, then poll(2) should block, similar to the wait(2) family.
1731 */
1732 if (!task || (task->exit_state && thread_group_empty(task)))
1733 poll_flags = POLLIN | POLLRDNORM;
1734 rcu_read_unlock();
1735
1736 return poll_flags;
1737}
1738
1697const struct file_operations pidfd_fops = { 1739const struct file_operations pidfd_fops = {
1698 .release = pidfd_release, 1740 .release = pidfd_release,
1741 .poll = pidfd_poll,
1699#ifdef CONFIG_PROC_FS 1742#ifdef CONFIG_PROC_FS
1700 .show_fdinfo = pidfd_show_fdinfo, 1743 .show_fdinfo = pidfd_show_fdinfo,
1701#endif 1744#endif
1702}; 1745};
1703 1746
1704/** 1747static void __delayed_free_task(struct rcu_head *rhp)
1705 * pidfd_create() - Create a new pid file descriptor.
1706 *
1707 * @pid: struct pid that the pidfd will reference
1708 *
1709 * This creates a new pid file descriptor with the O_CLOEXEC flag set.
1710 *
1711 * Note, that this function can only be called after the fd table has
1712 * been unshared to avoid leaking the pidfd to the new process.
1713 *
1714 * Return: On success, a cloexec pidfd is returned.
1715 * On error, a negative errno number will be returned.
1716 */
1717static int pidfd_create(struct pid *pid)
1718{ 1748{
1719 int fd; 1749 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
1720 1750
1721 fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid), 1751 free_task(tsk);
1722 O_RDWR | O_CLOEXEC); 1752}
1723 if (fd < 0)
1724 put_pid(pid);
1725 1753
1726 return fd; 1754static __always_inline void delayed_free_task(struct task_struct *tsk)
1755{
1756 if (IS_ENABLED(CONFIG_MEMCG))
1757 call_rcu(&tsk->rcu, __delayed_free_task);
1758 else
1759 free_task(tsk);
1727} 1760}
1728 1761
1729/* 1762/*
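
From userspace, a pidfd returned by clone(CLONE_PIDFD) becomes pollable: POLLIN is reported once the whole thread group has exited. A hedged consumer-side sketch, assuming a kernel with this series applied:

	#include <poll.h>
	#include <stdio.h>

	/* Block until the thread group behind @pidfd has exited. */
	int pidfd_wait_for_exit(int pidfd)
	{
		struct pollfd pfd = {
			.fd = pidfd,
			.events = POLLIN,
		};

		if (poll(&pfd, 1, -1) < 0) {
			perror("poll");
			return -1;
		}
		return (pfd.revents & POLLIN) ? 0 : -1;
	}
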
@@ -1735,19 +1768,16 @@ static int pidfd_create(struct pid *pid)
1735 * flags). The actual kick-off is left to the caller. 1768 * flags). The actual kick-off is left to the caller.
1736 */ 1769 */
1737static __latent_entropy struct task_struct *copy_process( 1770static __latent_entropy struct task_struct *copy_process(
1738 unsigned long clone_flags,
1739 unsigned long stack_start,
1740 unsigned long stack_size,
1741 int __user *parent_tidptr,
1742 int __user *child_tidptr,
1743 struct pid *pid, 1771 struct pid *pid,
1744 int trace, 1772 int trace,
1745 unsigned long tls, 1773 int node,
1746 int node) 1774 struct kernel_clone_args *args)
1747{ 1775{
1748 int pidfd = -1, retval; 1776 int pidfd = -1, retval;
1749 struct task_struct *p; 1777 struct task_struct *p;
1750 struct multiprocess_signals delayed; 1778 struct multiprocess_signals delayed;
1779 struct file *pidfile = NULL;
1780 u64 clone_flags = args->flags;
1751 1781
1752 /* 1782 /*
1753 * Don't allow sharing the root directory with processes in a different 1783 * Don't allow sharing the root directory with processes in a different
@@ -1796,27 +1826,12 @@ static __latent_entropy struct task_struct *copy_process(
1796 } 1826 }
1797 1827
1798 if (clone_flags & CLONE_PIDFD) { 1828 if (clone_flags & CLONE_PIDFD) {
1799 int reserved;
1800
1801 /* 1829 /*
1802 * - CLONE_PARENT_SETTID is useless for pidfds and also
1803 * parent_tidptr is used to return pidfds.
1804 * - CLONE_DETACHED is blocked so that we can potentially 1830 * - CLONE_DETACHED is blocked so that we can potentially
1805 * reuse it later for CLONE_PIDFD. 1831 * reuse it later for CLONE_PIDFD.
1806 * - CLONE_THREAD is blocked until someone really needs it. 1832 * - CLONE_THREAD is blocked until someone really needs it.
1807 */ 1833 */
1808 if (clone_flags & 1834 if (clone_flags & (CLONE_DETACHED | CLONE_THREAD))
1809 (CLONE_DETACHED | CLONE_PARENT_SETTID | CLONE_THREAD))
1810 return ERR_PTR(-EINVAL);
1811
1812 /*
1813 * Verify that parent_tidptr is sane so we can potentially
1814 * reuse it later.
1815 */
1816 if (get_user(reserved, parent_tidptr))
1817 return ERR_PTR(-EFAULT);
1818
1819 if (reserved != 0)
1820 return ERR_PTR(-EINVAL); 1835 return ERR_PTR(-EINVAL);
1821 } 1836 }
1822 1837
@@ -1849,11 +1864,11 @@ static __latent_entropy struct task_struct *copy_process(
1849 * p->set_child_tid which is (ab)used as a kthread's data pointer for 1864 * p->set_child_tid which is (ab)used as a kthread's data pointer for
1850 * kernel threads (PF_KTHREAD). 1865 * kernel threads (PF_KTHREAD).
1851 */ 1866 */
1852 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; 1867 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL;
1853 /* 1868 /*
1854 * Clear TID on mm_release()? 1869 * Clear TID on mm_release()?
1855 */ 1870 */
1856 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL; 1871 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? args->child_tid : NULL;
1857 1872
1858 ftrace_graph_init_task(p); 1873 ftrace_graph_init_task(p);
1859 1874
@@ -1958,9 +1973,6 @@ static __latent_entropy struct task_struct *copy_process(
1958 p->pagefault_disabled = 0; 1973 p->pagefault_disabled = 0;
1959 1974
1960#ifdef CONFIG_LOCKDEP 1975#ifdef CONFIG_LOCKDEP
1961 p->lockdep_depth = 0; /* no locks held yet */
1962 p->curr_chain_key = 0;
1963 p->lockdep_recursion = 0;
1964 lockdep_init_task(p); 1976 lockdep_init_task(p);
1965#endif 1977#endif
1966 1978
@@ -2012,7 +2024,8 @@ static __latent_entropy struct task_struct *copy_process(
2012 retval = copy_io(clone_flags, p); 2024 retval = copy_io(clone_flags, p);
2013 if (retval) 2025 if (retval)
2014 goto bad_fork_cleanup_namespaces; 2026 goto bad_fork_cleanup_namespaces;
2015 retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls); 2027 retval = copy_thread_tls(clone_flags, args->stack, args->stack_size, p,
2028 args->tls);
2016 if (retval) 2029 if (retval)
2017 goto bad_fork_cleanup_io; 2030 goto bad_fork_cleanup_io;
2018 2031
@@ -2032,12 +2045,22 @@ static __latent_entropy struct task_struct *copy_process(
2032 * if the fd table isn't shared). 2045 * if the fd table isn't shared).
2033 */ 2046 */
2034 if (clone_flags & CLONE_PIDFD) { 2047 if (clone_flags & CLONE_PIDFD) {
2035 retval = pidfd_create(pid); 2048 retval = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
2036 if (retval < 0) 2049 if (retval < 0)
2037 goto bad_fork_free_pid; 2050 goto bad_fork_free_pid;
2038 2051
2039 pidfd = retval; 2052 pidfd = retval;
2040 retval = put_user(pidfd, parent_tidptr); 2053
2054 pidfile = anon_inode_getfile("[pidfd]", &pidfd_fops, pid,
2055 O_RDWR | O_CLOEXEC);
2056 if (IS_ERR(pidfile)) {
2057 put_unused_fd(pidfd);
2058 retval = PTR_ERR(pidfile);
2059 goto bad_fork_free_pid;
2060 }
2061 get_pid(pid); /* held by pidfile now */
2062
2063 retval = put_user(pidfd, args->pidfd);
2041 if (retval) 2064 if (retval)
2042 goto bad_fork_put_pidfd; 2065 goto bad_fork_put_pidfd;
2043 } 2066 }
@@ -2068,7 +2091,7 @@ static __latent_entropy struct task_struct *copy_process(
2068#ifdef TIF_SYSCALL_EMU 2091#ifdef TIF_SYSCALL_EMU
2069 clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); 2092 clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
2070#endif 2093#endif
2071 clear_all_latency_tracing(p); 2094 clear_tsk_latency_tracing(p);
2072 2095
2073 /* ok, now we should be set up.. */ 2096 /* ok, now we should be set up.. */
2074 p->pid = pid_nr(pid); 2097 p->pid = pid_nr(pid);
@@ -2080,7 +2103,7 @@ static __latent_entropy struct task_struct *copy_process(
2080 if (clone_flags & CLONE_PARENT) 2103 if (clone_flags & CLONE_PARENT)
2081 p->exit_signal = current->group_leader->exit_signal; 2104 p->exit_signal = current->group_leader->exit_signal;
2082 else 2105 else
2083 p->exit_signal = (clone_flags & CSIGNAL); 2106 p->exit_signal = args->exit_signal;
2084 p->group_leader = p; 2107 p->group_leader = p;
2085 p->tgid = p->pid; 2108 p->tgid = p->pid;
2086 } 2109 }
@@ -2113,7 +2136,7 @@ static __latent_entropy struct task_struct *copy_process(
2113 */ 2136 */
2114 2137
2115 p->start_time = ktime_get_ns(); 2138 p->start_time = ktime_get_ns();
2116 p->real_start_time = ktime_get_boot_ns(); 2139 p->real_start_time = ktime_get_boottime_ns();
2117 2140
2118 /* 2141 /*
2119 * Make it visible to the rest of the system, but dont wake it up yet. 2142 * Make it visible to the rest of the system, but dont wake it up yet.
@@ -2154,6 +2177,9 @@ static __latent_entropy struct task_struct *copy_process(
2154 goto bad_fork_cancel_cgroup; 2177 goto bad_fork_cancel_cgroup;
2155 } 2178 }
2156 2179
2180 /* past the last point of failure */
2181 if (pidfile)
2182 fd_install(pidfd, pidfile);
2157 2183
2158 init_task_pid_links(p); 2184 init_task_pid_links(p);
2159 if (likely(p->pid)) { 2185 if (likely(p->pid)) {
@@ -2220,8 +2246,10 @@ bad_fork_cancel_cgroup:
2220bad_fork_cgroup_threadgroup_change_end: 2246bad_fork_cgroup_threadgroup_change_end:
2221 cgroup_threadgroup_change_end(current); 2247 cgroup_threadgroup_change_end(current);
2222bad_fork_put_pidfd: 2248bad_fork_put_pidfd:
2223 if (clone_flags & CLONE_PIDFD) 2249 if (clone_flags & CLONE_PIDFD) {
2224 ksys_close(pidfd); 2250 fput(pidfile);
2251 put_unused_fd(pidfd);
2252 }
2225bad_fork_free_pid: 2253bad_fork_free_pid:
2226 if (pid != &init_struct_pid) 2254 if (pid != &init_struct_pid)
2227 free_pid(pid); 2255 free_pid(pid);
@@ -2233,8 +2261,10 @@ bad_fork_cleanup_io:
2233bad_fork_cleanup_namespaces: 2261bad_fork_cleanup_namespaces:
2234 exit_task_namespaces(p); 2262 exit_task_namespaces(p);
2235bad_fork_cleanup_mm: 2263bad_fork_cleanup_mm:
2236 if (p->mm) 2264 if (p->mm) {
2265 mm_clear_owner(p->mm, p);
2237 mmput(p->mm); 2266 mmput(p->mm);
2267 }
2238bad_fork_cleanup_signal: 2268bad_fork_cleanup_signal:
2239 if (!(clone_flags & CLONE_THREAD)) 2269 if (!(clone_flags & CLONE_THREAD))
2240 free_signal_struct(p->signal); 2270 free_signal_struct(p->signal);
@@ -2265,7 +2295,7 @@ bad_fork_cleanup_count:
2265bad_fork_free: 2295bad_fork_free:
2266 p->state = TASK_DEAD; 2296 p->state = TASK_DEAD;
2267 put_task_stack(p); 2297 put_task_stack(p);
2268 free_task(p); 2298 delayed_free_task(p);
2269fork_out: 2299fork_out:
2270 spin_lock_irq(&current->sighand->siglock); 2300 spin_lock_irq(&current->sighand->siglock);
2271 hlist_del_init(&delayed.node); 2301 hlist_del_init(&delayed.node);
@@ -2286,8 +2316,11 @@ static inline void init_idle_pids(struct task_struct *idle)
2286struct task_struct *fork_idle(int cpu) 2316struct task_struct *fork_idle(int cpu)
2287{ 2317{
2288 struct task_struct *task; 2318 struct task_struct *task;
2289 task = copy_process(CLONE_VM, 0, 0, NULL, NULL, &init_struct_pid, 0, 0, 2319 struct kernel_clone_args args = {
2290 cpu_to_node(cpu)); 2320 .flags = CLONE_VM,
2321 };
2322
2323 task = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &args);
2291 if (!IS_ERR(task)) { 2324 if (!IS_ERR(task)) {
2292 init_idle_pids(task); 2325 init_idle_pids(task);
2293 init_idle(task, cpu); 2326 init_idle(task, cpu);
@@ -2307,13 +2340,9 @@ struct mm_struct *copy_init_mm(void)
2307 * It copies the process, and if successful kick-starts 2340 * It copies the process, and if successful kick-starts
2308 * it and waits for it to finish using the VM if required. 2341 * it and waits for it to finish using the VM if required.
2309 */ 2342 */
2310long _do_fork(unsigned long clone_flags, 2343long _do_fork(struct kernel_clone_args *args)
2311 unsigned long stack_start,
2312 unsigned long stack_size,
2313 int __user *parent_tidptr,
2314 int __user *child_tidptr,
2315 unsigned long tls)
2316{ 2344{
2345 u64 clone_flags = args->flags;
2317 struct completion vfork; 2346 struct completion vfork;
2318 struct pid *pid; 2347 struct pid *pid;
2319 struct task_struct *p; 2348 struct task_struct *p;
@@ -2329,7 +2358,7 @@ long _do_fork(unsigned long clone_flags,
2329 if (!(clone_flags & CLONE_UNTRACED)) { 2358 if (!(clone_flags & CLONE_UNTRACED)) {
2330 if (clone_flags & CLONE_VFORK) 2359 if (clone_flags & CLONE_VFORK)
2331 trace = PTRACE_EVENT_VFORK; 2360 trace = PTRACE_EVENT_VFORK;
2332 else if ((clone_flags & CSIGNAL) != SIGCHLD) 2361 else if (args->exit_signal != SIGCHLD)
2333 trace = PTRACE_EVENT_CLONE; 2362 trace = PTRACE_EVENT_CLONE;
2334 else 2363 else
2335 trace = PTRACE_EVENT_FORK; 2364 trace = PTRACE_EVENT_FORK;
@@ -2338,8 +2367,7 @@ long _do_fork(unsigned long clone_flags,
2338 trace = 0; 2367 trace = 0;
2339 } 2368 }
2340 2369
2341 p = copy_process(clone_flags, stack_start, stack_size, parent_tidptr, 2370 p = copy_process(NULL, trace, NUMA_NO_NODE, args);
2342 child_tidptr, NULL, trace, tls, NUMA_NO_NODE);
2343 add_latent_entropy(); 2371 add_latent_entropy();
2344 2372
2345 if (IS_ERR(p)) 2373 if (IS_ERR(p))
@@ -2355,7 +2383,7 @@ long _do_fork(unsigned long clone_flags,
2355 nr = pid_vnr(pid); 2383 nr = pid_vnr(pid);
2356 2384
2357 if (clone_flags & CLONE_PARENT_SETTID) 2385 if (clone_flags & CLONE_PARENT_SETTID)
2358 put_user(nr, parent_tidptr); 2386 put_user(nr, args->parent_tid);
2359 2387
2360 if (clone_flags & CLONE_VFORK) { 2388 if (clone_flags & CLONE_VFORK) {
2361 p->vfork_done = &vfork; 2389 p->vfork_done = &vfork;
@@ -2387,8 +2415,16 @@ long do_fork(unsigned long clone_flags,
2387 int __user *parent_tidptr, 2415 int __user *parent_tidptr,
2388 int __user *child_tidptr) 2416 int __user *child_tidptr)
2389{ 2417{
2390 return _do_fork(clone_flags, stack_start, stack_size, 2418 struct kernel_clone_args args = {
2391 parent_tidptr, child_tidptr, 0); 2419 .flags = (clone_flags & ~CSIGNAL),
2420 .child_tid = child_tidptr,
2421 .parent_tid = parent_tidptr,
2422 .exit_signal = (clone_flags & CSIGNAL),
2423 .stack = stack_start,
2424 .stack_size = stack_size,
2425 };
2426
2427 return _do_fork(&args);
2392} 2428}
2393#endif 2429#endif
2394 2430
@@ -2397,15 +2433,25 @@ long do_fork(unsigned long clone_flags,
2397 */ 2433 */
2398pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) 2434pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
2399{ 2435{
2400 return _do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn, 2436 struct kernel_clone_args args = {
2401 (unsigned long)arg, NULL, NULL, 0); 2437 .flags = ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL),
2438 .exit_signal = (flags & CSIGNAL),
2439 .stack = (unsigned long)fn,
2440 .stack_size = (unsigned long)arg,
2441 };
2442
2443 return _do_fork(&args);
2402} 2444}
2403 2445
2404#ifdef __ARCH_WANT_SYS_FORK 2446#ifdef __ARCH_WANT_SYS_FORK
2405SYSCALL_DEFINE0(fork) 2447SYSCALL_DEFINE0(fork)
2406{ 2448{
2407#ifdef CONFIG_MMU 2449#ifdef CONFIG_MMU
2408 return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0); 2450 struct kernel_clone_args args = {
2451 .exit_signal = SIGCHLD,
2452 };
2453
2454 return _do_fork(&args);
2409#else 2455#else
2410 /* can not support in nommu mode */ 2456 /* can not support in nommu mode */
2411 return -EINVAL; 2457 return -EINVAL;
@@ -2416,8 +2462,12 @@ SYSCALL_DEFINE0(fork)
2416#ifdef __ARCH_WANT_SYS_VFORK 2462#ifdef __ARCH_WANT_SYS_VFORK
2417SYSCALL_DEFINE0(vfork) 2463SYSCALL_DEFINE0(vfork)
2418{ 2464{
2419 return _do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, 2465 struct kernel_clone_args args = {
2420 0, NULL, NULL, 0); 2466 .flags = CLONE_VFORK | CLONE_VM,
2467 .exit_signal = SIGCHLD,
2468 };
2469
2470 return _do_fork(&args);
2421} 2471}
2422#endif 2472#endif
2423 2473
@@ -2445,7 +2495,112 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
2445 unsigned long, tls) 2495 unsigned long, tls)
2446#endif 2496#endif
2447{ 2497{
2448 return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls); 2498 struct kernel_clone_args args = {
2499 .flags = (clone_flags & ~CSIGNAL),
2500 .pidfd = parent_tidptr,
2501 .child_tid = child_tidptr,
2502 .parent_tid = parent_tidptr,
2503 .exit_signal = (clone_flags & CSIGNAL),
2504 .stack = newsp,
2505 .tls = tls,
2506 };
2507
2508 /* clone(CLONE_PIDFD) uses parent_tidptr to return a pidfd */
2509 if ((clone_flags & CLONE_PIDFD) && (clone_flags & CLONE_PARENT_SETTID))
2510 return -EINVAL;
2511
2512 return _do_fork(&args);
2513}
2514#endif
2515
2516#ifdef __ARCH_WANT_SYS_CLONE3
2517noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
2518 struct clone_args __user *uargs,
2519 size_t size)
2520{
2521 struct clone_args args;
2522
2523 if (unlikely(size > PAGE_SIZE))
2524 return -E2BIG;
2525
2526 if (unlikely(size < sizeof(struct clone_args)))
2527 return -EINVAL;
2528
2529 if (unlikely(!access_ok(uargs, size)))
2530 return -EFAULT;
2531
2532 if (size > sizeof(struct clone_args)) {
2533 unsigned char __user *addr;
2534 unsigned char __user *end;
2535 unsigned char val;
2536
2537 addr = (void __user *)uargs + sizeof(struct clone_args);
2538 end = (void __user *)uargs + size;
2539
2540 for (; addr < end; addr++) {
2541 if (get_user(val, addr))
2542 return -EFAULT;
2543 if (val)
2544 return -E2BIG;
2545 }
2546
2547 size = sizeof(struct clone_args);
2548 }
2549
2550 if (copy_from_user(&args, uargs, size))
2551 return -EFAULT;
2552
2553 *kargs = (struct kernel_clone_args){
2554 .flags = args.flags,
2555 .pidfd = u64_to_user_ptr(args.pidfd),
2556 .child_tid = u64_to_user_ptr(args.child_tid),
2557 .parent_tid = u64_to_user_ptr(args.parent_tid),
2558 .exit_signal = args.exit_signal,
2559 .stack = args.stack,
2560 .stack_size = args.stack_size,
2561 .tls = args.tls,
2562 };
2563
2564 return 0;
2565}
2566
2567static bool clone3_args_valid(const struct kernel_clone_args *kargs)
2568{
2569 /*
2570 * All lower bits of the flag word are taken.
2571 * Verify that no other unknown flags are passed along.
2572 */
2573 if (kargs->flags & ~CLONE_LEGACY_FLAGS)
2574 return false;
2575
2576 /*
2577 * - make the CLONE_DETACHED bit reuseable for clone3
2578 * - make the CSIGNAL bits reuseable for clone3
2579 */
2580 if (kargs->flags & (CLONE_DETACHED | CSIGNAL))
2581 return false;
2582
2583 if ((kargs->flags & (CLONE_THREAD | CLONE_PARENT)) &&
2584 kargs->exit_signal)
2585 return false;
2586
2587 return true;
2588}
2589
2590SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size)
2591{
2592 int err;
2593
2594 struct kernel_clone_args kargs;
2595
2596 err = copy_clone_args_from_user(&kargs, uargs, size);
2597 if (err)
2598 return err;
2599
2600 if (!clone3_args_valid(&kargs))
2601 return -EINVAL;
2602
2603 return _do_fork(&kargs);
2449} 2604}
2450#endif 2605#endif
2451 2606
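
For reference, a userspace sketch of the new syscall. struct clone_args is declared locally because contemporary libc headers do not ship it yet, and __NR_clone3 (435 on most architectures) is assumed to be available; the layout mirrors the 64-byte structure copied in by copy_clone_args_from_user() above.

	#define _GNU_SOURCE
	#include <signal.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <sys/syscall.h>
	#include <sys/wait.h>
	#include <unistd.h>

	#ifndef __NR_clone3
	#define __NR_clone3 435
	#endif

	struct clone_args {
		uint64_t flags;
		uint64_t pidfd;
		uint64_t child_tid;
		uint64_t parent_tid;
		uint64_t exit_signal;
		uint64_t stack;
		uint64_t stack_size;
		uint64_t tls;
	};

	int main(void)
	{
		struct clone_args args = {
			.exit_signal = SIGCHLD,	/* fork()-like child, no extra flags */
		};
		long pid = syscall(__NR_clone3, &args, sizeof(args));

		if (pid < 0) {
			perror("clone3");
			return 1;
		}
		if (pid == 0)		/* child */
			_exit(0);

		waitpid(pid, NULL, 0);
		printf("clone3 created pid %ld\n", pid);
		return 0;
	}
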
diff --git a/kernel/freezer.c b/kernel/freezer.c
index b162b74611e4..c0738424bb43 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * kernel/freezer.c - Function to freeze a process 3 * kernel/freezer.c - Function to freeze a process
3 * 4 *
diff --git a/kernel/futex.c b/kernel/futex.c
index 2268b97d5439..6d50728ef2e7 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
1/* 2/*
2 * Fast Userspace Mutexes (which I call "Futexes!"). 3 * Fast Userspace Mutexes (which I call "Futexes!").
3 * (C) Rusty Russell, IBM 2002 4 * (C) Rusty Russell, IBM 2002
@@ -29,20 +30,6 @@
29 * 30 *
30 * "The futexes are also cursed." 31 * "The futexes are also cursed."
31 * "But they come in a choice of three flavours!" 32 * "But they come in a choice of three flavours!"
32 *
33 * This program is free software; you can redistribute it and/or modify
34 * it under the terms of the GNU General Public License as published by
35 * the Free Software Foundation; either version 2 of the License, or
36 * (at your option) any later version.
37 *
38 * This program is distributed in the hope that it will be useful,
39 * but WITHOUT ANY WARRANTY; without even the implied warranty of
40 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
41 * GNU General Public License for more details.
42 *
43 * You should have received a copy of the GNU General Public License
44 * along with this program; if not, write to the Free Software
45 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
46 */ 33 */
47#include <linux/compat.h> 34#include <linux/compat.h>
48#include <linux/slab.h> 35#include <linux/slab.h>
@@ -484,6 +471,37 @@ enum futex_access {
484}; 471};
485 472
486/** 473/**
474 * futex_setup_timer - set up the sleeping hrtimer.
475 * @time: ptr to the given timeout value
476 * @timeout: the hrtimer_sleeper structure to be set up
477 * @flags: futex flags
478 * @range_ns: optional range in ns
479 *
480 * Return: Initialized hrtimer_sleeper structure or NULL if no timeout
481 * value given
482 */
483static inline struct hrtimer_sleeper *
484futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
485 int flags, u64 range_ns)
486{
487 if (!time)
488 return NULL;
489
490 hrtimer_init_on_stack(&timeout->timer, (flags & FLAGS_CLOCKRT) ?
491 CLOCK_REALTIME : CLOCK_MONOTONIC,
492 HRTIMER_MODE_ABS);
493 hrtimer_init_sleeper(timeout, current);
494
495 /*
496 * If range_ns is 0, calling hrtimer_set_expires_range_ns() is
497 * effectively the same as calling hrtimer_set_expires().
498 */
499 hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns);
500
501 return timeout;
502}
503
504/**
487 * get_futex_key() - Get parameters which are the keys for a futex 505 * get_futex_key() - Get parameters which are the keys for a futex
488 * @uaddr: virtual address of the futex 506 * @uaddr: virtual address of the futex
489 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED 507 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
@@ -2692,7 +2710,7 @@ out:
2692static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, 2710static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
2693 ktime_t *abs_time, u32 bitset) 2711 ktime_t *abs_time, u32 bitset)
2694{ 2712{
2695 struct hrtimer_sleeper timeout, *to = NULL; 2713 struct hrtimer_sleeper timeout, *to;
2696 struct restart_block *restart; 2714 struct restart_block *restart;
2697 struct futex_hash_bucket *hb; 2715 struct futex_hash_bucket *hb;
2698 struct futex_q q = futex_q_init; 2716 struct futex_q q = futex_q_init;
@@ -2702,17 +2720,8 @@ static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
2702 return -EINVAL; 2720 return -EINVAL;
2703 q.bitset = bitset; 2721 q.bitset = bitset;
2704 2722
2705 if (abs_time) { 2723 to = futex_setup_timer(abs_time, &timeout, flags,
2706 to = &timeout; 2724 current->timer_slack_ns);
2707
2708 hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
2709 CLOCK_REALTIME : CLOCK_MONOTONIC,
2710 HRTIMER_MODE_ABS);
2711 hrtimer_init_sleeper(to, current);
2712 hrtimer_set_expires_range_ns(&to->timer, *abs_time,
2713 current->timer_slack_ns);
2714 }
2715
2716retry: 2725retry:
2717 /* 2726 /*
2718 * Prepare to wait on uaddr. On success, holds hb lock and increments 2727 * Prepare to wait on uaddr. On success, holds hb lock and increments
@@ -2792,7 +2801,7 @@ static long futex_wait_restart(struct restart_block *restart)
2792static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, 2801static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
2793 ktime_t *time, int trylock) 2802 ktime_t *time, int trylock)
2794{ 2803{
2795 struct hrtimer_sleeper timeout, *to = NULL; 2804 struct hrtimer_sleeper timeout, *to;
2796 struct futex_pi_state *pi_state = NULL; 2805 struct futex_pi_state *pi_state = NULL;
2797 struct rt_mutex_waiter rt_waiter; 2806 struct rt_mutex_waiter rt_waiter;
2798 struct futex_hash_bucket *hb; 2807 struct futex_hash_bucket *hb;
@@ -2805,13 +2814,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
2805 if (refill_pi_state_cache()) 2814 if (refill_pi_state_cache())
2806 return -ENOMEM; 2815 return -ENOMEM;
2807 2816
2808 if (time) { 2817 to = futex_setup_timer(time, &timeout, FLAGS_CLOCKRT, 0);
2809 to = &timeout;
2810 hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME,
2811 HRTIMER_MODE_ABS);
2812 hrtimer_init_sleeper(to, current);
2813 hrtimer_set_expires(&to->timer, *time);
2814 }
2815 2818
2816retry: 2819retry:
2817 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE); 2820 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE);
@@ -3208,7 +3211,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
3208 u32 val, ktime_t *abs_time, u32 bitset, 3211 u32 val, ktime_t *abs_time, u32 bitset,
3209 u32 __user *uaddr2) 3212 u32 __user *uaddr2)
3210{ 3213{
3211 struct hrtimer_sleeper timeout, *to = NULL; 3214 struct hrtimer_sleeper timeout, *to;
3212 struct futex_pi_state *pi_state = NULL; 3215 struct futex_pi_state *pi_state = NULL;
3213 struct rt_mutex_waiter rt_waiter; 3216 struct rt_mutex_waiter rt_waiter;
3214 struct futex_hash_bucket *hb; 3217 struct futex_hash_bucket *hb;
@@ -3225,15 +3228,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
3225 if (!bitset) 3228 if (!bitset)
3226 return -EINVAL; 3229 return -EINVAL;
3227 3230
3228 if (abs_time) { 3231 to = futex_setup_timer(abs_time, &timeout, flags,
3229 to = &timeout; 3232 current->timer_slack_ns);
3230 hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
3231 CLOCK_REALTIME : CLOCK_MONOTONIC,
3232 HRTIMER_MODE_ABS);
3233 hrtimer_init_sleeper(to, current);
3234 hrtimer_set_expires_range_ns(&to->timer, *abs_time,
3235 current->timer_slack_ns);
3236 }
3237 3233
3238 /* 3234 /*
3239 * The waiter is allocated on our stack, manipulated by the requeue 3235 * The waiter is allocated on our stack, manipulated by the requeue
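
futex_setup_timer() only factors out the setup half; each call site still cancels and destroys the on-stack timer on its way out. A sketch of how a converted wait path pairs the two (paraphrasing the existing futex_wait() exit code; example_wait itself is illustrative):

	static int example_wait(ktime_t *abs_time, unsigned int flags)
	{
		struct hrtimer_sleeper timeout, *to;
		int ret = 0;

		to = futex_setup_timer(abs_time, &timeout, flags,
				       current->timer_slack_ns);

		/* ... queue the waiter, start the timer, schedule() ... */

		if (to) {
			hrtimer_cancel(&to->timer);
			destroy_hrtimer_on_stack(&to->timer);
		}
		return ret;
	}
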
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index 1e3823fa799b..3941a9c48f83 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -1,3 +1,4 @@
1# SPDX-License-Identifier: GPL-2.0-only
1menu "GCOV-based kernel profiling" 2menu "GCOV-based kernel profiling"
2 3
3config GCOV_KERNEL 4config GCOV_KERNEL
@@ -53,6 +54,7 @@ config GCOV_PROFILE_ALL
53choice 54choice
54 prompt "Specify GCOV format" 55 prompt "Specify GCOV format"
55 depends on GCOV_KERNEL 56 depends on GCOV_KERNEL
57 depends on CC_IS_GCC
56 ---help--- 58 ---help---
57 The gcov format is usually determined by the GCC version, and the 59 The gcov format is usually determined by the GCC version, and the
58 default is chosen according to your GCC version. However, there are 60 default is chosen according to your GCC version. However, there are
@@ -62,7 +64,7 @@ choice
62 64
63config GCOV_FORMAT_3_4 65config GCOV_FORMAT_3_4
64 bool "GCC 3.4 format" 66 bool "GCC 3.4 format"
65 depends on CC_IS_GCC && GCC_VERSION < 40700 67 depends on GCC_VERSION < 40700
66 ---help--- 68 ---help---
67 Select this option to use the format defined by GCC 3.4. 69 Select this option to use the format defined by GCC 3.4.
68 70
diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile
index ff06d64df397..d66a74b0f100 100644
--- a/kernel/gcov/Makefile
+++ b/kernel/gcov/Makefile
@@ -2,5 +2,6 @@
2ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"' 2ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'
3 3
4obj-y := base.o fs.o 4obj-y := base.o fs.o
5obj-$(CONFIG_GCOV_FORMAT_3_4) += gcc_3_4.o 5obj-$(CONFIG_GCOV_FORMAT_3_4) += gcc_base.o gcc_3_4.o
6obj-$(CONFIG_GCOV_FORMAT_4_7) += gcc_4_7.o 6obj-$(CONFIG_GCOV_FORMAT_4_7) += gcc_base.o gcc_4_7.o
7obj-$(CONFIG_CC_IS_CLANG) += clang.o
diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c
index 9c7c8d5c18f2..0ffe9f194080 100644
--- a/kernel/gcov/base.c
+++ b/kernel/gcov/base.c
@@ -22,88 +22,8 @@
22#include <linux/sched.h> 22#include <linux/sched.h>
23#include "gcov.h" 23#include "gcov.h"
24 24
25static int gcov_events_enabled; 25int gcov_events_enabled;
26static DEFINE_MUTEX(gcov_lock); 26DEFINE_MUTEX(gcov_lock);
27
28/*
29 * __gcov_init is called by gcc-generated constructor code for each object
30 * file compiled with -fprofile-arcs.
31 */
32void __gcov_init(struct gcov_info *info)
33{
34 static unsigned int gcov_version;
35
36 mutex_lock(&gcov_lock);
37 if (gcov_version == 0) {
38 gcov_version = gcov_info_version(info);
39 /*
40 * Printing gcc's version magic may prove useful for debugging
41 * incompatibility reports.
42 */
43 pr_info("version magic: 0x%x\n", gcov_version);
44 }
45 /*
46 * Add new profiling data structure to list and inform event
47 * listener.
48 */
49 gcov_info_link(info);
50 if (gcov_events_enabled)
51 gcov_event(GCOV_ADD, info);
52 mutex_unlock(&gcov_lock);
53}
54EXPORT_SYMBOL(__gcov_init);
55
56/*
57 * These functions may be referenced by gcc-generated profiling code but serve
58 * no function for kernel profiling.
59 */
60void __gcov_flush(void)
61{
62 /* Unused. */
63}
64EXPORT_SYMBOL(__gcov_flush);
65
66void __gcov_merge_add(gcov_type *counters, unsigned int n_counters)
67{
68 /* Unused. */
69}
70EXPORT_SYMBOL(__gcov_merge_add);
71
72void __gcov_merge_single(gcov_type *counters, unsigned int n_counters)
73{
74 /* Unused. */
75}
76EXPORT_SYMBOL(__gcov_merge_single);
77
78void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters)
79{
80 /* Unused. */
81}
82EXPORT_SYMBOL(__gcov_merge_delta);
83
84void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters)
85{
86 /* Unused. */
87}
88EXPORT_SYMBOL(__gcov_merge_ior);
89
90void __gcov_merge_time_profile(gcov_type *counters, unsigned int n_counters)
91{
92 /* Unused. */
93}
94EXPORT_SYMBOL(__gcov_merge_time_profile);
95
96void __gcov_merge_icall_topn(gcov_type *counters, unsigned int n_counters)
97{
98 /* Unused. */
99}
100EXPORT_SYMBOL(__gcov_merge_icall_topn);
101
102void __gcov_exit(void)
103{
104 /* Unused. */
105}
106EXPORT_SYMBOL(__gcov_exit);
107 27
108/** 28/**
109 * gcov_enable_events - enable event reporting through gcov_event() 29 * gcov_enable_events - enable event reporting through gcov_event()
@@ -144,7 +64,7 @@ static int gcov_module_notifier(struct notifier_block *nb, unsigned long event,
144 64
145 /* Remove entries located in module from linked list. */ 65 /* Remove entries located in module from linked list. */
146 while ((info = gcov_info_next(info))) { 66 while ((info = gcov_info_next(info))) {
147 if (within_module((unsigned long)info, mod)) { 67 if (gcov_info_within_module(info, mod)) {
148 gcov_info_unlink(prev, info); 68 gcov_info_unlink(prev, info);
149 if (gcov_events_enabled) 69 if (gcov_events_enabled)
150 gcov_event(GCOV_REMOVE, info); 70 gcov_event(GCOV_REMOVE, info);
diff --git a/kernel/gcov/clang.c b/kernel/gcov/clang.c
new file mode 100644
index 000000000000..c94b820a1b62
--- /dev/null
+++ b/kernel/gcov/clang.c
@@ -0,0 +1,581 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright (C) 2019 Google, Inc.
4 * modified from kernel/gcov/gcc_4_7.c
5 *
6 * This software is licensed under the terms of the GNU General Public
7 * License version 2, as published by the Free Software Foundation, and
8 * may be copied, distributed, and modified under those terms.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 *
16 * LLVM uses profiling data that's deliberately similar to GCC, but has a
17 * very different way of exporting that data. LLVM calls llvm_gcov_init() once
18 * per module, and provides a couple of callbacks that we can use to ask for
19 * more data.
20 *
21 * We care about the "writeout" callback, which in turn calls back into
22 * compiler-rt/this module to dump all the gathered coverage data to disk:
23 *
24 * llvm_gcda_start_file()
25 * llvm_gcda_emit_function()
26 * llvm_gcda_emit_arcs()
27 * llvm_gcda_emit_function()
28 * llvm_gcda_emit_arcs()
29 * [... repeats for each function ...]
30 * llvm_gcda_summary_info()
31 * llvm_gcda_end_file()
32 *
33 * This design is much more stateless and unstructured than gcc's, and is
34 * intended to run at process exit. This forces us to keep some local state
35 * about which module we're dealing with at the moment. On the other hand, it
36 * also means we don't depend as much on how LLVM represents profiling data
37 * internally.
38 *
39 * See LLVM's lib/Transforms/Instrumentation/GCOVProfiling.cpp for more
40 * details on how this works, particularly GCOVProfiler::emitProfileArcs(),
41 * GCOVProfiler::insertCounterWriteout(), and
42 * GCOVProfiler::insertFlush().
43 */
44
45#define pr_fmt(fmt) "gcov: " fmt
46
47#include <linux/kernel.h>
48#include <linux/list.h>
49#include <linux/printk.h>
50#include <linux/ratelimit.h>
51#include <linux/seq_file.h>
52#include <linux/slab.h>
53#include <linux/vmalloc.h>
54#include "gcov.h"
55
56typedef void (*llvm_gcov_callback)(void);
57
58struct gcov_info {
59 struct list_head head;
60
61 const char *filename;
62 unsigned int version;
63 u32 checksum;
64
65 struct list_head functions;
66};
67
68struct gcov_fn_info {
69 struct list_head head;
70
71 u32 ident;
72 u32 checksum;
73 u8 use_extra_checksum;
74 u32 cfg_checksum;
75
76 u32 num_counters;
77 u64 *counters;
78 const char *function_name;
79};
80
81static struct gcov_info *current_info;
82
83static LIST_HEAD(clang_gcov_list);
84
85void llvm_gcov_init(llvm_gcov_callback writeout, llvm_gcov_callback flush)
86{
87 struct gcov_info *info = kzalloc(sizeof(*info), GFP_KERNEL);
88
89 if (!info)
90 return;
91
92 INIT_LIST_HEAD(&info->head);
93 INIT_LIST_HEAD(&info->functions);
94
95 mutex_lock(&gcov_lock);
96
97 list_add_tail(&info->head, &clang_gcov_list);
98 current_info = info;
99 writeout();
100 current_info = NULL;
101 if (gcov_events_enabled)
102 gcov_event(GCOV_ADD, info);
103
104 mutex_unlock(&gcov_lock);
105}
106EXPORT_SYMBOL(llvm_gcov_init);
107
108void llvm_gcda_start_file(const char *orig_filename, const char version[4],
109 u32 checksum)
110{
111 current_info->filename = orig_filename;
112 memcpy(&current_info->version, version, sizeof(current_info->version));
113 current_info->checksum = checksum;
114}
115EXPORT_SYMBOL(llvm_gcda_start_file);
116
117void llvm_gcda_emit_function(u32 ident, const char *function_name,
118 u32 func_checksum, u8 use_extra_checksum, u32 cfg_checksum)
119{
120 struct gcov_fn_info *info = kzalloc(sizeof(*info), GFP_KERNEL);
121
122 if (!info)
123 return;
124
125 INIT_LIST_HEAD(&info->head);
126 info->ident = ident;
127 info->checksum = func_checksum;
128 info->use_extra_checksum = use_extra_checksum;
129 info->cfg_checksum = cfg_checksum;
130 if (function_name)
131 info->function_name = kstrdup(function_name, GFP_KERNEL);
132
133 list_add_tail(&info->head, &current_info->functions);
134}
135EXPORT_SYMBOL(llvm_gcda_emit_function);
136
137void llvm_gcda_emit_arcs(u32 num_counters, u64 *counters)
138{
139 struct gcov_fn_info *info = list_last_entry(&current_info->functions,
140 struct gcov_fn_info, head);
141
142 info->num_counters = num_counters;
143 info->counters = counters;
144}
145EXPORT_SYMBOL(llvm_gcda_emit_arcs);
146
147void llvm_gcda_summary_info(void)
148{
149}
150EXPORT_SYMBOL(llvm_gcda_summary_info);
151
152void llvm_gcda_end_file(void)
153{
154}
155EXPORT_SYMBOL(llvm_gcda_end_file);
156
157/**
158 * gcov_info_filename - return info filename
159 * @info: profiling data set
160 */
161const char *gcov_info_filename(struct gcov_info *info)
162{
163 return info->filename;
164}
165
166/**
167 * gcov_info_version - return info version
168 * @info: profiling data set
169 */
170unsigned int gcov_info_version(struct gcov_info *info)
171{
172 return info->version;
173}
174
175/**
176 * gcov_info_next - return next profiling data set
177 * @info: profiling data set
178 *
179 * Returns next gcov_info following @info or first gcov_info in the chain if
180 * @info is %NULL.
181 */
182struct gcov_info *gcov_info_next(struct gcov_info *info)
183{
184 if (!info)
185 return list_first_entry_or_null(&clang_gcov_list,
186 struct gcov_info, head);
187 if (list_is_last(&info->head, &clang_gcov_list))
188 return NULL;
189 return list_next_entry(info, head);
190}
191
192/**
193 * gcov_info_link - link/add profiling data set to the list
194 * @info: profiling data set
195 */
196void gcov_info_link(struct gcov_info *info)
197{
198 list_add_tail(&info->head, &clang_gcov_list);
199}
200
201/**
202 * gcov_info_unlink - unlink/remove profiling data set from the list
203 * @prev: previous profiling data set
204 * @info: profiling data set
205 */
206void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info)
207{
208 /* Generic code unlinks while iterating. */
209 __list_del_entry(&info->head);
210}
211
212/**
213 * gcov_info_within_module - check if a profiling data set belongs to a module
214 * @info: profiling data set
215 * @mod: module
216 *
 217 * Returns true if the profiling data belongs to a module, false otherwise.
218 */
219bool gcov_info_within_module(struct gcov_info *info, struct module *mod)
220{
221 return within_module((unsigned long)info->filename, mod);
222}
223
224/* Symbolic links to be created for each profiling data file. */
225const struct gcov_link gcov_link[] = {
226 { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */
227 { 0, NULL},
228};
229
230/**
231 * gcov_info_reset - reset profiling data to zero
232 * @info: profiling data set
233 */
234void gcov_info_reset(struct gcov_info *info)
235{
236 struct gcov_fn_info *fn;
237
238 list_for_each_entry(fn, &info->functions, head)
239 memset(fn->counters, 0,
240 sizeof(fn->counters[0]) * fn->num_counters);
241}
242
243/**
244 * gcov_info_is_compatible - check if profiling data can be added
245 * @info1: first profiling data set
246 * @info2: second profiling data set
247 *
248 * Returns non-zero if profiling data can be added, zero otherwise.
249 */
250int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2)
251{
252 struct gcov_fn_info *fn_ptr1 = list_first_entry_or_null(
253 &info1->functions, struct gcov_fn_info, head);
254 struct gcov_fn_info *fn_ptr2 = list_first_entry_or_null(
255 &info2->functions, struct gcov_fn_info, head);
256
257 if (info1->checksum != info2->checksum)
258 return false;
259 if (!fn_ptr1)
260 return fn_ptr1 == fn_ptr2;
261 while (!list_is_last(&fn_ptr1->head, &info1->functions) &&
262 !list_is_last(&fn_ptr2->head, &info2->functions)) {
263 if (fn_ptr1->checksum != fn_ptr2->checksum)
264 return false;
265 if (fn_ptr1->use_extra_checksum != fn_ptr2->use_extra_checksum)
266 return false;
267 if (fn_ptr1->use_extra_checksum &&
268 fn_ptr1->cfg_checksum != fn_ptr2->cfg_checksum)
269 return false;
270 fn_ptr1 = list_next_entry(fn_ptr1, head);
271 fn_ptr2 = list_next_entry(fn_ptr2, head);
272 }
273 return list_is_last(&fn_ptr1->head, &info1->functions) &&
274 list_is_last(&fn_ptr2->head, &info2->functions);
275}
276
277/**
278 * gcov_info_add - add up profiling data
 279 * @dst: profiling data set to which data is added
 280 * @src: profiling data set which is added
 281 *
 282 * Adds profiling counts of @src to @dst.
283 */
284void gcov_info_add(struct gcov_info *dst, struct gcov_info *src)
285{
286 struct gcov_fn_info *dfn_ptr;
287 struct gcov_fn_info *sfn_ptr = list_first_entry_or_null(&src->functions,
288 struct gcov_fn_info, head);
289
290 list_for_each_entry(dfn_ptr, &dst->functions, head) {
291 u32 i;
292
293 for (i = 0; i < sfn_ptr->num_counters; i++)
294 dfn_ptr->counters[i] += sfn_ptr->counters[i];
295 }
296}
297
298static struct gcov_fn_info *gcov_fn_info_dup(struct gcov_fn_info *fn)
299{
300 size_t cv_size; /* counter values size */
301 struct gcov_fn_info *fn_dup = kmemdup(fn, sizeof(*fn),
302 GFP_KERNEL);
303 if (!fn_dup)
304 return NULL;
305 INIT_LIST_HEAD(&fn_dup->head);
306
307 fn_dup->function_name = kstrdup(fn->function_name, GFP_KERNEL);
308 if (!fn_dup->function_name)
309 goto err_name;
310
311 cv_size = fn->num_counters * sizeof(fn->counters[0]);
312 fn_dup->counters = vmalloc(cv_size);
313 if (!fn_dup->counters)
314 goto err_counters;
315 memcpy(fn_dup->counters, fn->counters, cv_size);
316
317 return fn_dup;
318
319err_counters:
320 kfree(fn_dup->function_name);
321err_name:
322 kfree(fn_dup);
323 return NULL;
324}
325
326/**
327 * gcov_info_dup - duplicate profiling data set
328 * @info: profiling data set to duplicate
329 *
330 * Return newly allocated duplicate on success, %NULL on error.
331 */
332struct gcov_info *gcov_info_dup(struct gcov_info *info)
333{
334 struct gcov_info *dup;
335 struct gcov_fn_info *fn;
336
337 dup = kmemdup(info, sizeof(*dup), GFP_KERNEL);
338 if (!dup)
339 return NULL;
340 INIT_LIST_HEAD(&dup->head);
341 INIT_LIST_HEAD(&dup->functions);
342 dup->filename = kstrdup(info->filename, GFP_KERNEL);
343 if (!dup->filename)
344 goto err;
345
346 list_for_each_entry(fn, &info->functions, head) {
347 struct gcov_fn_info *fn_dup = gcov_fn_info_dup(fn);
348
349 if (!fn_dup)
350 goto err;
351 list_add_tail(&fn_dup->head, &dup->functions);
352 }
353
354 return dup;
355
356err:
357 gcov_info_free(dup);
358 return NULL;
359}
360
361/**
362 * gcov_info_free - release memory for profiling data set duplicate
363 * @info: profiling data set duplicate to free
364 */
365void gcov_info_free(struct gcov_info *info)
366{
367 struct gcov_fn_info *fn, *tmp;
368
369 list_for_each_entry_safe(fn, tmp, &info->functions, head) {
370 kfree(fn->function_name);
371 vfree(fn->counters);
372 list_del(&fn->head);
373 kfree(fn);
374 }
375 kfree(info->filename);
376 kfree(info);
377}
378
379#define ITER_STRIDE PAGE_SIZE
380
381/**
382 * struct gcov_iterator - specifies current file position in logical records
383 * @info: associated profiling data
384 * @buffer: buffer containing file data
385 * @size: size of buffer
386 * @pos: current position in file
387 */
388struct gcov_iterator {
389 struct gcov_info *info;
390 void *buffer;
391 size_t size;
392 loff_t pos;
393};
394
395/**
396 * store_gcov_u32 - store 32 bit number in gcov format to buffer
397 * @buffer: target buffer or NULL
398 * @off: offset into the buffer
399 * @v: value to be stored
400 *
401 * Number format defined by gcc: numbers are recorded in the 32 bit
402 * unsigned binary form of the endianness of the machine generating the
403 * file. Returns the number of bytes stored. If @buffer is %NULL, doesn't
404 * store anything.
405 */
406static size_t store_gcov_u32(void *buffer, size_t off, u32 v)
407{
408 u32 *data;
409
410 if (buffer) {
411 data = buffer + off;
412 *data = v;
413 }
414
415 return sizeof(*data);
416}
417
418/**
419 * store_gcov_u64 - store 64 bit number in gcov format to buffer
420 * @buffer: target buffer or NULL
421 * @off: offset into the buffer
422 * @v: value to be stored
423 *
424 * Number format defined by gcc: numbers are recorded in the 32 bit
425 * unsigned binary form of the endianness of the machine generating the
426 * file. 64 bit numbers are stored as two 32 bit numbers, the low part
427 * first. Returns the number of bytes stored. If @buffer is %NULL, doesn't store
428 * anything.
429 */
430static size_t store_gcov_u64(void *buffer, size_t off, u64 v)
431{
432 u32 *data;
433
434 if (buffer) {
435 data = buffer + off;
436
437 data[0] = (v & 0xffffffffUL);
438 data[1] = (v >> 32);
439 }
440
441 return sizeof(*data) * 2;
442}
443
444/**
445 * convert_to_gcda - convert profiling data set to gcda file format
446 * @buffer: the buffer to store file data or %NULL if no data should be stored
447 * @info: profiling data set to be converted
448 *
449 * Returns the number of bytes that were/would have been stored into the buffer.
450 */
451static size_t convert_to_gcda(char *buffer, struct gcov_info *info)
452{
453 struct gcov_fn_info *fi_ptr;
454 size_t pos = 0;
455
456 /* File header. */
457 pos += store_gcov_u32(buffer, pos, GCOV_DATA_MAGIC);
458 pos += store_gcov_u32(buffer, pos, info->version);
459 pos += store_gcov_u32(buffer, pos, info->checksum);
460
461 list_for_each_entry(fi_ptr, &info->functions, head) {
462 u32 i;
463 u32 len = 2;
464
465 if (fi_ptr->use_extra_checksum)
466 len++;
467
468 pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION);
469 pos += store_gcov_u32(buffer, pos, len);
470 pos += store_gcov_u32(buffer, pos, fi_ptr->ident);
471 pos += store_gcov_u32(buffer, pos, fi_ptr->checksum);
472 if (fi_ptr->use_extra_checksum)
473 pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum);
474
475 pos += store_gcov_u32(buffer, pos, GCOV_TAG_COUNTER_BASE);
476 pos += store_gcov_u32(buffer, pos, fi_ptr->num_counters * 2);
477 for (i = 0; i < fi_ptr->num_counters; i++)
478 pos += store_gcov_u64(buffer, pos, fi_ptr->counters[i]);
479 }
480
481 return pos;
482}
483
484/**
485 * gcov_iter_new - allocate and initialize profiling data iterator
486 * @info: profiling data set to be iterated
487 *
488 * Return file iterator on success, %NULL otherwise.
489 */
490struct gcov_iterator *gcov_iter_new(struct gcov_info *info)
491{
492 struct gcov_iterator *iter;
493
494 iter = kzalloc(sizeof(struct gcov_iterator), GFP_KERNEL);
495 if (!iter)
496 goto err_free;
497
498 iter->info = info;
499 /* Dry-run to get the actual buffer size. */
500 iter->size = convert_to_gcda(NULL, info);
501 iter->buffer = vmalloc(iter->size);
502 if (!iter->buffer)
503 goto err_free;
504
505 convert_to_gcda(iter->buffer, info);
506
507 return iter;
508
509err_free:
510 kfree(iter);
511 return NULL;
512}
513
514
515/**
 516 * gcov_iter_free - free the iterator and its backing buffer
517 * @iter: file iterator
518 */
519void gcov_iter_free(struct gcov_iterator *iter)
520{
521 vfree(iter->buffer);
522 kfree(iter);
523}
524
525/**
526 * gcov_iter_get_info - return profiling data set for given file iterator
527 * @iter: file iterator
528 */
529struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter)
530{
531 return iter->info;
532}
533
534/**
535 * gcov_iter_start - reset file iterator to starting position
536 * @iter: file iterator
537 */
538void gcov_iter_start(struct gcov_iterator *iter)
539{
540 iter->pos = 0;
541}
542
543/**
544 * gcov_iter_next - advance file iterator to next logical record
545 * @iter: file iterator
546 *
547 * Return zero if new position is valid, non-zero if iterator has reached end.
548 */
549int gcov_iter_next(struct gcov_iterator *iter)
550{
551 if (iter->pos < iter->size)
552 iter->pos += ITER_STRIDE;
553
554 if (iter->pos >= iter->size)
555 return -EINVAL;
556
557 return 0;
558}
559
560/**
561 * gcov_iter_write - write data for current pos to seq_file
562 * @iter: file iterator
563 * @seq: seq_file handle
564 *
565 * Return zero on success, non-zero otherwise.
566 */
567int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq)
568{
569 size_t len;
570
571 if (iter->pos >= iter->size)
572 return -EINVAL;
573
574 len = ITER_STRIDE;
575 if (iter->pos + len > iter->size)
576 len = iter->size - iter->pos;
577
578 seq_write(seq, iter->buffer + iter->pos, len);
579
580 return 0;
581}
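The writeout path above ends in convert_to_gcda(). A stand-alone user-space sketch of the same record layout follows: a small file header, then per function a function-announcement record (two or three words depending on use_extra_checksum) and a counter record holding the 64-bit arc counters, low word first. The magic and tag constants are placeholders here; the kernel takes the real GCOV_* values from gcov.h.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define SK_DATA_MAGIC	0x67636461u	/* placeholder for GCOV_DATA_MAGIC */
#define SK_TAG_FUNCTION	0x01000000u	/* placeholder for GCOV_TAG_FUNCTION */
#define SK_TAG_COUNTERS	0x01a10000u	/* placeholder for GCOV_TAG_COUNTER_BASE */

struct fn_sketch {
	uint32_t ident;
	uint32_t checksum;
	uint32_t cfg_checksum;
	int use_extra_checksum;
	uint32_t num_counters;
	const uint64_t *counters;
};

/* Mirrors store_gcov_u32(): write nothing on a NULL buffer, report the size. */
static size_t store_u32(void *buffer, size_t off, uint32_t v)
{
	if (buffer)
		memcpy((char *)buffer + off, &v, sizeof(v));
	return sizeof(v);
}

/* Mirrors store_gcov_u64(): two 32-bit words, low part first. */
static size_t store_u64(void *buffer, size_t off, uint64_t v)
{
	size_t pos = 0;

	pos += store_u32(buffer, off + pos, (uint32_t)(v & 0xffffffffu));
	pos += store_u32(buffer, off + pos, (uint32_t)(v >> 32));
	return pos;
}

static size_t convert_sketch(char *buffer, uint32_t version, uint32_t checksum,
			     const struct fn_sketch *fn)
{
	uint32_t i, len = fn->use_extra_checksum ? 3 : 2;
	size_t pos = 0;

	/* File header: magic, version, stamp/checksum. */
	pos += store_u32(buffer, pos, SK_DATA_MAGIC);
	pos += store_u32(buffer, pos, version);
	pos += store_u32(buffer, pos, checksum);

	/* Function announcement record. */
	pos += store_u32(buffer, pos, SK_TAG_FUNCTION);
	pos += store_u32(buffer, pos, len);
	pos += store_u32(buffer, pos, fn->ident);
	pos += store_u32(buffer, pos, fn->checksum);
	if (fn->use_extra_checksum)
		pos += store_u32(buffer, pos, fn->cfg_checksum);

	/* Arc counters: record length is counted in 32-bit words, hence "* 2". */
	pos += store_u32(buffer, pos, SK_TAG_COUNTERS);
	pos += store_u32(buffer, pos, fn->num_counters * 2);
	for (i = 0; i < fn->num_counters; i++)
		pos += store_u64(buffer, pos, fn->counters[i]);

	return pos;
}

int main(void)
{
	const uint64_t counters[] = { 3, 0, 7 };
	const struct fn_sketch fn = {
		.ident = 1, .checksum = 0xabcd,
		.num_counters = 3, .counters = counters,
	};
	char buf[128];
	/* Dry run for the size, then the real conversion: the same two-pass
	 * scheme gcov_iter_new() uses. */
	size_t need = convert_sketch(NULL, 0x3430372au, 0x1234, &fn);

	if (need <= sizeof(buf))
		need = convert_sketch(buf, 0x3430372au, 0x1234, &fn);
	printf("gcda-like record: %zu bytes\n", need);
	return 0;
}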
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c
index 6e40ff6be083..e5eb5ea7ea59 100644
--- a/kernel/gcov/fs.c
+++ b/kernel/gcov/fs.c
@@ -64,7 +64,6 @@ struct gcov_node {
64static const char objtree[] = OBJTREE; 64static const char objtree[] = OBJTREE;
65static const char srctree[] = SRCTREE; 65static const char srctree[] = SRCTREE;
66static struct gcov_node root_node; 66static struct gcov_node root_node;
67static struct dentry *reset_dentry;
68static LIST_HEAD(all_head); 67static LIST_HEAD(all_head);
69static DEFINE_MUTEX(node_lock); 68static DEFINE_MUTEX(node_lock);
70 69
@@ -387,8 +386,6 @@ static void add_links(struct gcov_node *node, struct dentry *parent)
387 goto out_err; 386 goto out_err;
388 node->links[i] = debugfs_create_symlink(deskew(basename), 387 node->links[i] = debugfs_create_symlink(deskew(basename),
389 parent, target); 388 parent, target);
390 if (!node->links[i])
391 goto out_err;
392 kfree(target); 389 kfree(target);
393 } 390 }
394 391
@@ -450,11 +447,6 @@ static struct gcov_node *new_node(struct gcov_node *parent,
450 parent->dentry, node, &gcov_data_fops); 447 parent->dentry, node, &gcov_data_fops);
451 } else 448 } else
452 node->dentry = debugfs_create_dir(node->name, parent->dentry); 449 node->dentry = debugfs_create_dir(node->name, parent->dentry);
453 if (!node->dentry) {
454 pr_warn("could not create file\n");
455 kfree(node);
456 return NULL;
457 }
458 if (info) 450 if (info)
459 add_links(node, parent->dentry); 451 add_links(node, parent->dentry);
460 list_add(&node->list, &parent->children); 452 list_add(&node->list, &parent->children);
@@ -761,32 +753,20 @@ void gcov_event(enum gcov_action action, struct gcov_info *info)
761/* Create debugfs entries. */ 753/* Create debugfs entries. */
762static __init int gcov_fs_init(void) 754static __init int gcov_fs_init(void)
763{ 755{
764 int rc = -EIO;
765
766 init_node(&root_node, NULL, NULL, NULL); 756 init_node(&root_node, NULL, NULL, NULL);
767 /* 757 /*
768 * /sys/kernel/debug/gcov will be parent for the reset control file 758 * /sys/kernel/debug/gcov will be parent for the reset control file
769 * and all profiling files. 759 * and all profiling files.
770 */ 760 */
771 root_node.dentry = debugfs_create_dir("gcov", NULL); 761 root_node.dentry = debugfs_create_dir("gcov", NULL);
772 if (!root_node.dentry)
773 goto err_remove;
774 /* 762 /*
775 * Create reset file which resets all profiling counts when written 763 * Create reset file which resets all profiling counts when written
776 * to. 764 * to.
777 */ 765 */
778 reset_dentry = debugfs_create_file("reset", 0600, root_node.dentry, 766 debugfs_create_file("reset", 0600, root_node.dentry, NULL,
779 NULL, &gcov_reset_fops); 767 &gcov_reset_fops);
780 if (!reset_dentry)
781 goto err_remove;
782 /* Replay previous events to get our fs hierarchy up-to-date. */ 768 /* Replay previous events to get our fs hierarchy up-to-date. */
783 gcov_enable_events(); 769 gcov_enable_events();
784 return 0; 770 return 0;
785
786err_remove:
787 pr_err("init failed\n");
788 debugfs_remove(root_node.dentry);
789
790 return rc;
791} 771}
792device_initcall(gcov_fs_init); 772device_initcall(gcov_fs_init);
diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c
index 2dddecbdbe6e..801ee4b0b969 100644
--- a/kernel/gcov/gcc_3_4.c
+++ b/kernel/gcov/gcc_3_4.c
@@ -137,6 +137,18 @@ void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info)
137 gcov_info_head = info->next; 137 gcov_info_head = info->next;
138} 138}
139 139
140/**
141 * gcov_info_within_module - check if a profiling data set belongs to a module
142 * @info: profiling data set
143 * @mod: module
144 *
 145 * Returns true if the profiling data belongs to a module, false otherwise.
146 */
147bool gcov_info_within_module(struct gcov_info *info, struct module *mod)
148{
149 return within_module((unsigned long)info, mod);
150}
151
140/* Symbolic links to be created for each profiling data file. */ 152/* Symbolic links to be created for each profiling data file. */
141const struct gcov_link gcov_link[] = { 153const struct gcov_link gcov_link[] = {
142 { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */ 154 { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c
index ca5e5c0ef853..ec37563674d6 100644
--- a/kernel/gcov/gcc_4_7.c
+++ b/kernel/gcov/gcc_4_7.c
@@ -150,6 +150,18 @@ void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info)
150 gcov_info_head = info->next; 150 gcov_info_head = info->next;
151} 151}
152 152
153/**
154 * gcov_info_within_module - check if a profiling data set belongs to a module
155 * @info: profiling data set
156 * @mod: module
157 *
 158 * Returns true if the profiling data belongs to a module, false otherwise.
159 */
160bool gcov_info_within_module(struct gcov_info *info, struct module *mod)
161{
162 return within_module((unsigned long)info, mod);
163}
164
153/* Symbolic links to be created for each profiling data file. */ 165/* Symbolic links to be created for each profiling data file. */
154const struct gcov_link gcov_link[] = { 166const struct gcov_link gcov_link[] = {
155 { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */ 167 { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */
diff --git a/kernel/gcov/gcc_base.c b/kernel/gcov/gcc_base.c
new file mode 100644
index 000000000000..3cf736b9f880
--- /dev/null
+++ b/kernel/gcov/gcc_base.c
@@ -0,0 +1,86 @@
1// SPDX-License-Identifier: GPL-2.0
2
3#include <linux/export.h>
4#include <linux/kernel.h>
5#include <linux/mutex.h>
6#include "gcov.h"
7
8/*
9 * __gcov_init is called by gcc-generated constructor code for each object
10 * file compiled with -fprofile-arcs.
11 */
12void __gcov_init(struct gcov_info *info)
13{
14 static unsigned int gcov_version;
15
16 mutex_lock(&gcov_lock);
17 if (gcov_version == 0) {
18 gcov_version = gcov_info_version(info);
19 /*
20 * Printing gcc's version magic may prove useful for debugging
21 * incompatibility reports.
22 */
23 pr_info("version magic: 0x%x\n", gcov_version);
24 }
25 /*
26 * Add new profiling data structure to list and inform event
27 * listener.
28 */
29 gcov_info_link(info);
30 if (gcov_events_enabled)
31 gcov_event(GCOV_ADD, info);
32 mutex_unlock(&gcov_lock);
33}
34EXPORT_SYMBOL(__gcov_init);
35
36/*
37 * These functions may be referenced by gcc-generated profiling code but serve
38 * no function for kernel profiling.
39 */
40void __gcov_flush(void)
41{
42 /* Unused. */
43}
44EXPORT_SYMBOL(__gcov_flush);
45
46void __gcov_merge_add(gcov_type *counters, unsigned int n_counters)
47{
48 /* Unused. */
49}
50EXPORT_SYMBOL(__gcov_merge_add);
51
52void __gcov_merge_single(gcov_type *counters, unsigned int n_counters)
53{
54 /* Unused. */
55}
56EXPORT_SYMBOL(__gcov_merge_single);
57
58void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters)
59{
60 /* Unused. */
61}
62EXPORT_SYMBOL(__gcov_merge_delta);
63
64void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters)
65{
66 /* Unused. */
67}
68EXPORT_SYMBOL(__gcov_merge_ior);
69
70void __gcov_merge_time_profile(gcov_type *counters, unsigned int n_counters)
71{
72 /* Unused. */
73}
74EXPORT_SYMBOL(__gcov_merge_time_profile);
75
76void __gcov_merge_icall_topn(gcov_type *counters, unsigned int n_counters)
77{
78 /* Unused. */
79}
80EXPORT_SYMBOL(__gcov_merge_icall_topn);
81
82void __gcov_exit(void)
83{
84 /* Unused. */
85}
86EXPORT_SYMBOL(__gcov_exit);
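The __gcov_init() path above is driven by constructors that gcc emits for every object built with -fprofile-arcs. A user-space sketch of that registration flow follows; the gcov_info layout is a dummy, only the constructor-plus-locked-list pattern is the point.

#include <pthread.h>
#include <stdio.h>

struct info_sketch {
	unsigned int version;
	const char *filename;
	struct info_sketch *next;
};

static struct info_sketch *info_head;
static pthread_mutex_t info_lock = PTHREAD_MUTEX_INITIALIZER;

static void register_info(struct info_sketch *info)
{
	static unsigned int seen_version;

	pthread_mutex_lock(&info_lock);
	if (seen_version == 0) {
		seen_version = info->version;
		printf("version magic: 0x%x\n", seen_version);
	}
	info->next = info_head;		/* link new profiling data into the list */
	info_head = info;
	pthread_mutex_unlock(&info_lock);
}

static struct info_sketch a_o = { .version = 0xB02, .filename = "a.gcda" };

__attribute__((constructor))
static void a_o_ctor(void)		/* what -fprofile-arcs emits per object */
{
	register_info(&a_o);
}

int main(void)
{
	const struct info_sketch *info;

	for (info = info_head; info; info = info->next)
		printf("registered: %s\n", info->filename);
	return 0;
}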
diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h
index de118ad4a024..6ab2c1808c9d 100644
--- a/kernel/gcov/gcov.h
+++ b/kernel/gcov/gcov.h
@@ -15,6 +15,7 @@
15#ifndef GCOV_H 15#ifndef GCOV_H
16#define GCOV_H GCOV_H 16#define GCOV_H GCOV_H
17 17
18#include <linux/module.h>
18#include <linux/types.h> 19#include <linux/types.h>
19 20
20/* 21/*
@@ -46,6 +47,7 @@ unsigned int gcov_info_version(struct gcov_info *info);
46struct gcov_info *gcov_info_next(struct gcov_info *info); 47struct gcov_info *gcov_info_next(struct gcov_info *info);
47void gcov_info_link(struct gcov_info *info); 48void gcov_info_link(struct gcov_info *info);
48void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info); 49void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info);
50bool gcov_info_within_module(struct gcov_info *info, struct module *mod);
49 51
50/* Base interface. */ 52/* Base interface. */
51enum gcov_action { 53enum gcov_action {
@@ -83,4 +85,7 @@ struct gcov_link {
83}; 85};
84extern const struct gcov_link gcov_link[]; 86extern const struct gcov_link gcov_link[];
85 87
88extern int gcov_events_enabled;
89extern struct mutex gcov_lock;
90
86#endif /* GCOV_H */ 91#endif /* GCOV_H */
diff --git a/kernel/gen_ikh_data.sh b/kernel/gen_kheaders.sh
index 591a94f7b387..9ff449888d9c 100755
--- a/kernel/gen_ikh_data.sh
+++ b/kernel/gen_kheaders.sh
@@ -2,26 +2,14 @@
2# SPDX-License-Identifier: GPL-2.0 2# SPDX-License-Identifier: GPL-2.0
3 3
4# This script generates an archive consisting of kernel headers 4# This script generates an archive consisting of kernel headers
5# for CONFIG_IKHEADERS_PROC. 5# for CONFIG_IKHEADERS.
6set -e 6set -e
7spath="$(dirname "$(readlink -f "$0")")" 7sfile="$(readlink -f "$0")"
8kroot="$spath/.."
9outdir="$(pwd)" 8outdir="$(pwd)"
10tarfile=$1 9tarfile=$1
11cpio_dir=$outdir/$tarfile.tmp 10cpio_dir=$outdir/$tarfile.tmp
12 11
13# Script filename relative to the kernel source root 12dir_list="
14# We add it to the archive because it is small and any changes
15# to this script will also cause a rebuild of the archive.
16sfile="$(realpath --relative-to $kroot "$(readlink -f "$0")")"
17
18src_file_list="
19include/
20arch/$SRCARCH/include/
21$sfile
22"
23
24obj_file_list="
25include/ 13include/
26arch/$SRCARCH/include/ 14arch/$SRCARCH/include/
27" 15"
@@ -31,28 +19,31 @@ arch/$SRCARCH/include/
31 19
32# This block is useful for debugging the incremental builds. 20# This block is useful for debugging the incremental builds.
33# Uncomment it for debugging. 21# Uncomment it for debugging.
34# iter=1 22# if [ ! -f /tmp/iter ]; then iter=1; echo 1 > /tmp/iter;
35# if [ ! -f /tmp/iter ]; then echo 1 > /tmp/iter; 23# else iter=$(($(cat /tmp/iter) + 1)); echo $iter > /tmp/iter; fi
36# else; iter=$(($(cat /tmp/iter) + 1)); fi 24# find $src_file_list -name "*.h" | xargs ls -l > /tmp/src-ls-$iter
37# find $src_file_list -type f | xargs ls -lR > /tmp/src-ls-$iter 25# find $obj_file_list -name "*.h" | xargs ls -l > /tmp/obj-ls-$iter
38# find $obj_file_list -type f | xargs ls -lR > /tmp/obj-ls-$iter
39 26
40# include/generated/compile.h is ignored because it is touched even when none 27# include/generated/compile.h is ignored because it is touched even when none
41# of the source files changed. This causes pointless regeneration, so let us 28# of the source files changed. This causes pointless regeneration, so let us
42# ignore them for md5 calculation. 29# ignore them for md5 calculation.
43pushd $kroot > /dev/null 30pushd $srctree > /dev/null
44src_files_md5="$(find $src_file_list -type f | 31src_files_md5="$(find $dir_list -name "*.h" |
45 grep -v "include/generated/compile.h" | 32 grep -v "include/generated/compile.h" |
46 xargs ls -lR | md5sum | cut -d ' ' -f1)" 33 grep -v "include/generated/autoconf.h" |
34 xargs ls -l | md5sum | cut -d ' ' -f1)"
47popd > /dev/null 35popd > /dev/null
48obj_files_md5="$(find $obj_file_list -type f | 36obj_files_md5="$(find $dir_list -name "*.h" |
49 grep -v "include/generated/compile.h" | 37 grep -v "include/generated/compile.h" |
50 xargs ls -lR | md5sum | cut -d ' ' -f1)" 38 grep -v "include/generated/autoconf.h" |
51 39 xargs ls -l | md5sum | cut -d ' ' -f1)"
40# Any changes to this script will also cause a rebuild of the archive.
41this_file_md5="$(ls -l $sfile | md5sum | cut -d ' ' -f1)"
52if [ -f $tarfile ]; then tarfile_md5="$(md5sum $tarfile | cut -d ' ' -f1)"; fi 42if [ -f $tarfile ]; then tarfile_md5="$(md5sum $tarfile | cut -d ' ' -f1)"; fi
53if [ -f kernel/kheaders.md5 ] && 43if [ -f kernel/kheaders.md5 ] &&
54 [ "$(cat kernel/kheaders.md5|head -1)" == "$src_files_md5" ] && 44 [ "$(cat kernel/kheaders.md5|head -1)" == "$src_files_md5" ] &&
55 [ "$(cat kernel/kheaders.md5|head -2|tail -1)" == "$obj_files_md5" ] && 45 [ "$(cat kernel/kheaders.md5|head -2|tail -1)" == "$obj_files_md5" ] &&
46 [ "$(cat kernel/kheaders.md5|head -3|tail -1)" == "$this_file_md5" ] &&
56 [ "$(cat kernel/kheaders.md5|tail -1)" == "$tarfile_md5" ]; then 47 [ "$(cat kernel/kheaders.md5|tail -1)" == "$tarfile_md5" ]; then
57 exit 48 exit
58fi 49fi
@@ -64,16 +55,16 @@ fi
64rm -rf $cpio_dir 55rm -rf $cpio_dir
65mkdir $cpio_dir 56mkdir $cpio_dir
66 57
67pushd $kroot > /dev/null 58pushd $srctree > /dev/null
68for f in $src_file_list; 59for f in $dir_list;
69 do find "$f" ! -name "*.cmd" ! -name ".*"; 60 do find "$f" -name "*.h";
70done | cpio --quiet -pd $cpio_dir 61done | cpio --quiet -pd $cpio_dir
71popd > /dev/null 62popd > /dev/null
72 63
73# The second CPIO can complain if files already exist which can 64# The second CPIO can complain if files already exist which can
74# happen with out of tree builds. Just silence CPIO for now. 65# happen with out of tree builds. Just silence CPIO for now.
75for f in $obj_file_list; 66for f in $dir_list;
76 do find "$f" ! -name "*.cmd" ! -name ".*"; 67 do find "$f" -name "*.h";
77done | cpio --quiet -pd $cpio_dir >/dev/null 2>&1 68done | cpio --quiet -pd $cpio_dir >/dev/null 2>&1
78 69
79# Remove comments except SPDX lines 70# Remove comments except SPDX lines
@@ -82,8 +73,9 @@ find $cpio_dir -type f -print0 |
82 73
83tar -Jcf $tarfile -C $cpio_dir/ . > /dev/null 74tar -Jcf $tarfile -C $cpio_dir/ . > /dev/null
84 75
85echo "$src_files_md5" > kernel/kheaders.md5 76echo "$src_files_md5" > kernel/kheaders.md5
86echo "$obj_files_md5" >> kernel/kheaders.md5 77echo "$obj_files_md5" >> kernel/kheaders.md5
78echo "$this_file_md5" >> kernel/kheaders.md5
87echo "$(md5sum $tarfile | cut -d ' ' -f1)" >> kernel/kheaders.md5 79echo "$(md5sum $tarfile | cut -d ' ' -f1)" >> kernel/kheaders.md5
88 80
89rm -rf $cpio_dir 81rm -rf $cpio_dir
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index f108a95882c6..14a625c16cb3 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * Detect Hung Task 3 * Detect Hung Task
3 * 4 *
diff --git a/kernel/iomem.c b/kernel/iomem.c
index 93c264444510..62c92e43aa0d 100644
--- a/kernel/iomem.c
+++ b/kernel/iomem.c
@@ -121,7 +121,7 @@ EXPORT_SYMBOL(memremap);
121 121
122void memunmap(void *addr) 122void memunmap(void *addr)
123{ 123{
124 if (is_vmalloc_addr(addr)) 124 if (is_ioremap_addr(addr))
125 iounmap((void __iomem *) addr); 125 iounmap((void __iomem *) addr);
126} 126}
127EXPORT_SYMBOL(memunmap); 127EXPORT_SYMBOL(memunmap);
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 5f3e2baefca9..f92d9a687372 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -1,3 +1,4 @@
1# SPDX-License-Identifier: GPL-2.0-only
1menu "IRQ subsystem" 2menu "IRQ subsystem"
2# Options selectable by the architecture code 3# Options selectable by the architecture code
3 4
@@ -91,6 +92,9 @@ config GENERIC_MSI_IRQ_DOMAIN
91 select IRQ_DOMAIN_HIERARCHY 92 select IRQ_DOMAIN_HIERARCHY
92 select GENERIC_MSI_IRQ 93 select GENERIC_MSI_IRQ
93 94
95config IRQ_MSI_IOMMU
96 bool
97
94config HANDLE_DOMAIN_IRQ 98config HANDLE_DOMAIN_IRQ
95 bool 99 bool
96 100
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index ff6e352e3a6c..b4f53717d143 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -2,6 +2,9 @@
2 2
3obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o 3obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o
4obj-$(CONFIG_IRQ_TIMINGS) += timings.o 4obj-$(CONFIG_IRQ_TIMINGS) += timings.o
5ifeq ($(CONFIG_TEST_IRQ_TIMINGS),y)
6 CFLAGS_timings.o += -DDEBUG
7endif
5obj-$(CONFIG_GENERIC_IRQ_CHIP) += generic-chip.o 8obj-$(CONFIG_GENERIC_IRQ_CHIP) += generic-chip.o
6obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o 9obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
7obj-$(CONFIG_IRQ_DOMAIN) += irqdomain.o 10obj-$(CONFIG_IRQ_DOMAIN) += irqdomain.o
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index f18cd5aa33e8..4352b08ae48d 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -94,8 +94,7 @@ static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask,
94 return nodes; 94 return nodes;
95} 95}
96 96
97static int __irq_build_affinity_masks(const struct irq_affinity *affd, 97static int __irq_build_affinity_masks(unsigned int startvec,
98 unsigned int startvec,
99 unsigned int numvecs, 98 unsigned int numvecs,
100 unsigned int firstvec, 99 unsigned int firstvec,
101 cpumask_var_t *node_to_cpumask, 100 cpumask_var_t *node_to_cpumask,
@@ -171,8 +170,7 @@ static int __irq_build_affinity_masks(const struct irq_affinity *affd,
171 * 1) spread present CPU on these vectors 170 * 1) spread present CPU on these vectors
172 * 2) spread other possible CPUs on these vectors 171 * 2) spread other possible CPUs on these vectors
173 */ 172 */
174static int irq_build_affinity_masks(const struct irq_affinity *affd, 173static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,
175 unsigned int startvec, unsigned int numvecs,
176 unsigned int firstvec, 174 unsigned int firstvec,
177 struct irq_affinity_desc *masks) 175 struct irq_affinity_desc *masks)
178{ 176{
@@ -197,7 +195,7 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd,
197 build_node_to_cpumask(node_to_cpumask); 195 build_node_to_cpumask(node_to_cpumask);
198 196
199 /* Spread on present CPUs starting from affd->pre_vectors */ 197 /* Spread on present CPUs starting from affd->pre_vectors */
200 nr_present = __irq_build_affinity_masks(affd, curvec, numvecs, 198 nr_present = __irq_build_affinity_masks(curvec, numvecs,
201 firstvec, node_to_cpumask, 199 firstvec, node_to_cpumask,
202 cpu_present_mask, nmsk, masks); 200 cpu_present_mask, nmsk, masks);
203 201
@@ -212,7 +210,7 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd,
212 else 210 else
213 curvec = firstvec + nr_present; 211 curvec = firstvec + nr_present;
214 cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask); 212 cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask);
215 nr_others = __irq_build_affinity_masks(affd, curvec, numvecs, 213 nr_others = __irq_build_affinity_masks(curvec, numvecs,
216 firstvec, node_to_cpumask, 214 firstvec, node_to_cpumask,
217 npresmsk, nmsk, masks); 215 npresmsk, nmsk, masks);
218 put_online_cpus(); 216 put_online_cpus();
@@ -295,7 +293,7 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd)
295 unsigned int this_vecs = affd->set_size[i]; 293 unsigned int this_vecs = affd->set_size[i];
296 int ret; 294 int ret;
297 295
298 ret = irq_build_affinity_masks(affd, curvec, this_vecs, 296 ret = irq_build_affinity_masks(curvec, this_vecs,
299 curvec, masks); 297 curvec, masks);
300 if (ret) { 298 if (ret) {
301 kfree(masks); 299 kfree(masks);
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 16cbf6beb276..ae60cae24e9a 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -90,7 +90,7 @@ unsigned long probe_irq_on(void)
90 /* It triggered already - consider it spurious. */ 90 /* It triggered already - consider it spurious. */
91 if (!(desc->istate & IRQS_WAITING)) { 91 if (!(desc->istate & IRQS_WAITING)) {
92 desc->istate &= ~IRQS_AUTODETECT; 92 desc->istate &= ~IRQS_AUTODETECT;
93 irq_shutdown(desc); 93 irq_shutdown_and_deactivate(desc);
94 } else 94 } else
95 if (i < 32) 95 if (i < 32)
96 mask |= 1 << i; 96 mask |= 1 << i;
@@ -127,7 +127,7 @@ unsigned int probe_irq_mask(unsigned long val)
127 mask |= 1 << i; 127 mask |= 1 << i;
128 128
129 desc->istate &= ~IRQS_AUTODETECT; 129 desc->istate &= ~IRQS_AUTODETECT;
130 irq_shutdown(desc); 130 irq_shutdown_and_deactivate(desc);
131 } 131 }
132 raw_spin_unlock_irq(&desc->lock); 132 raw_spin_unlock_irq(&desc->lock);
133 } 133 }
@@ -169,7 +169,7 @@ int probe_irq_off(unsigned long val)
169 nr_of_irqs++; 169 nr_of_irqs++;
170 } 170 }
171 desc->istate &= ~IRQS_AUTODETECT; 171 desc->istate &= ~IRQS_AUTODETECT;
172 irq_shutdown(desc); 172 irq_shutdown_and_deactivate(desc);
173 } 173 }
174 raw_spin_unlock_irq(&desc->lock); 174 raw_spin_unlock_irq(&desc->lock);
175 } 175 }
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 51128bea3846..b76703b2c0af 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -314,6 +314,12 @@ void irq_shutdown(struct irq_desc *desc)
314 } 314 }
315 irq_state_clr_started(desc); 315 irq_state_clr_started(desc);
316 } 316 }
317}
318
319
320void irq_shutdown_and_deactivate(struct irq_desc *desc)
321{
322 irq_shutdown(desc);
317 /* 323 /*
318 * This must be called even if the interrupt was never started up, 324 * This must be called even if the interrupt was never started up,
319 * because the activation can happen before the interrupt is 325 * because the activation can happen before the interrupt is
@@ -748,6 +754,8 @@ void handle_fasteoi_nmi(struct irq_desc *desc)
748 unsigned int irq = irq_desc_get_irq(desc); 754 unsigned int irq = irq_desc_get_irq(desc);
749 irqreturn_t res; 755 irqreturn_t res;
750 756
757 __kstat_incr_irqs_this_cpu(desc);
758
751 trace_irq_handler_entry(irq, action); 759 trace_irq_handler_entry(irq, action);
752 /* 760 /*
753 * NMIs cannot be shared, there is only one action. 761 * NMIs cannot be shared, there is only one action.
@@ -962,6 +970,8 @@ void handle_percpu_devid_fasteoi_nmi(struct irq_desc *desc)
962 unsigned int irq = irq_desc_get_irq(desc); 970 unsigned int irq = irq_desc_get_irq(desc);
963 irqreturn_t res; 971 irqreturn_t res;
964 972
973 __kstat_incr_irqs_this_cpu(desc);
974
965 trace_irq_handler_entry(irq, action); 975 trace_irq_handler_entry(irq, action);
966 res = action->handler(irq, raw_cpu_ptr(action->percpu_dev_id)); 976 res = action->handler(irq, raw_cpu_ptr(action->percpu_dev_id));
967 trace_irq_handler_exit(irq, action, res); 977 trace_irq_handler_exit(irq, action, res);
@@ -1459,6 +1469,33 @@ int irq_chip_set_wake_parent(struct irq_data *data, unsigned int on)
1459 return -ENOSYS; 1469 return -ENOSYS;
1460} 1470}
1461EXPORT_SYMBOL_GPL(irq_chip_set_wake_parent); 1471EXPORT_SYMBOL_GPL(irq_chip_set_wake_parent);
1472
1473/**
1474 * irq_chip_request_resources_parent - Request resources on the parent interrupt
1475 * @data: Pointer to interrupt specific data
1476 */
1477int irq_chip_request_resources_parent(struct irq_data *data)
1478{
1479 data = data->parent_data;
1480
1481 if (data->chip->irq_request_resources)
1482 return data->chip->irq_request_resources(data);
1483
1484 return -ENOSYS;
1485}
1486EXPORT_SYMBOL_GPL(irq_chip_request_resources_parent);
1487
1488/**
1489 * irq_chip_release_resources_parent - Release resources on the parent interrupt
1490 * @data: Pointer to interrupt specific data
1491 */
1492void irq_chip_release_resources_parent(struct irq_data *data)
1493{
1494 data = data->parent_data;
1495 if (data->chip->irq_release_resources)
1496 data->chip->irq_release_resources(data);
1497}
1498EXPORT_SYMBOL_GPL(irq_chip_release_resources_parent);
1462#endif 1499#endif
1463 1500
1464/** 1501/**
diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index 5b1072e394b2..6c7ca2e983a5 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -116,7 +116,7 @@ static bool migrate_one_irq(struct irq_desc *desc)
116 */ 116 */
117 if (irqd_affinity_is_managed(d)) { 117 if (irqd_affinity_is_managed(d)) {
118 irqd_set_managed_shutdown(d); 118 irqd_set_managed_shutdown(d);
119 irq_shutdown(desc); 119 irq_shutdown_and_deactivate(desc);
120 return false; 120 return false;
121 } 121 }
122 affinity = cpu_online_mask; 122 affinity = cpu_online_mask;
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 70c3053bc1f6..3924fbe829d4 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -82,6 +82,7 @@ extern int irq_activate_and_startup(struct irq_desc *desc, bool resend);
82extern int irq_startup(struct irq_desc *desc, bool resend, bool force); 82extern int irq_startup(struct irq_desc *desc, bool resend, bool force);
83 83
84extern void irq_shutdown(struct irq_desc *desc); 84extern void irq_shutdown(struct irq_desc *desc);
85extern void irq_shutdown_and_deactivate(struct irq_desc *desc);
85extern void irq_enable(struct irq_desc *desc); 86extern void irq_enable(struct irq_desc *desc);
86extern void irq_disable(struct irq_desc *desc); 87extern void irq_disable(struct irq_desc *desc);
87extern void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu); 88extern void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu);
@@ -96,6 +97,10 @@ static inline void irq_mark_irq(unsigned int irq) { }
96extern void irq_mark_irq(unsigned int irq); 97extern void irq_mark_irq(unsigned int irq);
97#endif 98#endif
98 99
100extern int __irq_get_irqchip_state(struct irq_data *data,
101 enum irqchip_irq_state which,
102 bool *state);
103
99extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); 104extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
100 105
101irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags); 106irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags);
@@ -354,6 +359,16 @@ static inline int irq_timing_decode(u64 value, u64 *timestamp)
354 return value & U16_MAX; 359 return value & U16_MAX;
355} 360}
356 361
362static __always_inline void irq_timings_push(u64 ts, int irq)
363{
364 struct irq_timings *timings = this_cpu_ptr(&irq_timings);
365
366 timings->values[timings->count & IRQ_TIMINGS_MASK] =
367 irq_timing_encode(ts, irq);
368
369 timings->count++;
370}
371
357/* 372/*
358 * The function record_irq_time is only called in one place in the 373 * The function record_irq_time is only called in one place in the
359 * interrupts handler. We want this function always inline so the code 374 * interrupts handler. We want this function always inline so the code
@@ -367,15 +382,8 @@ static __always_inline void record_irq_time(struct irq_desc *desc)
367 if (!static_branch_likely(&irq_timing_enabled)) 382 if (!static_branch_likely(&irq_timing_enabled))
368 return; 383 return;
369 384
370 if (desc->istate & IRQS_TIMINGS) { 385 if (desc->istate & IRQS_TIMINGS)
371 struct irq_timings *timings = this_cpu_ptr(&irq_timings); 386 irq_timings_push(local_clock(), irq_desc_get_irq(desc));
372
373 timings->values[timings->count & IRQ_TIMINGS_MASK] =
374 irq_timing_encode(local_clock(),
375 irq_desc_get_irq(desc));
376
377 timings->count++;
378 }
379} 387}
380#else 388#else
381static inline void irq_remove_timings(struct irq_desc *desc) {} 389static inline void irq_remove_timings(struct irq_desc *desc) {}
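irq_timings_push() above fills a per-CPU ring with one packed word per interrupt. A user-space sketch of that encode/push pattern follows, assuming the "timestamp in the high bits, irq number in the low 16 bits" packing suggested by irq_timing_decode(); the exact kernel encoding lives next to it in this header.

#include <stdint.h>
#include <stdio.h>

#define TIMINGS_SIZE	32			/* must stay a power of two */
#define TIMINGS_MASK	(TIMINGS_SIZE - 1)

struct timings_sketch {
	uint64_t values[TIMINGS_SIZE];
	unsigned int count;			/* free running */
};

static uint64_t timing_encode(uint64_t ts, unsigned int irq)
{
	return (ts << 16) | (irq & 0xffff);	/* irq kept in the low 16 bits */
}

static unsigned int timing_decode(uint64_t value, uint64_t *ts)
{
	*ts = value >> 16;
	return value & 0xffff;
}

static void timings_push(struct timings_sketch *t, uint64_t ts, unsigned int irq)
{
	t->values[t->count & TIMINGS_MASK] = timing_encode(ts, irq);
	t->count++;				/* index derives from count, so no explicit wrap */
}

int main(void)
{
	struct timings_sketch t = { .count = 0 };
	uint64_t ts;
	unsigned int irq;

	timings_push(&t, 123456789, 42);
	irq = timing_decode(t.values[0], &ts);
	printf("irq %u at %llu ns\n", irq, (unsigned long long)ts);
	return 0;
}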
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index c52b737ab8e3..9484e88dabc2 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -680,6 +680,8 @@ int __handle_domain_irq(struct irq_domain *domain, unsigned int hwirq,
680 * @hwirq: The HW irq number to convert to a logical one 680 * @hwirq: The HW irq number to convert to a logical one
681 * @regs: Register file coming from the low-level handling code 681 * @regs: Register file coming from the low-level handling code
682 * 682 *
683 * This function must be called from an NMI context.
684 *
683 * Returns: 0 on success, or -EINVAL if conversion has failed 685 * Returns: 0 on success, or -EINVAL if conversion has failed
684 */ 686 */
685int handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq, 687int handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq,
@@ -689,7 +691,10 @@ int handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq,
689 unsigned int irq; 691 unsigned int irq;
690 int ret = 0; 692 int ret = 0;
691 693
692 nmi_enter(); 694 /*
695 * NMI context needs to be setup earlier in order to deal with tracing.
696 */
697 WARN_ON(!in_nmi());
693 698
694 irq = irq_find_mapping(domain, hwirq); 699 irq = irq_find_mapping(domain, hwirq);
695 700
@@ -702,7 +707,6 @@ int handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq,
702 else 707 else
703 ret = -EINVAL; 708 ret = -EINVAL;
704 709
705 nmi_exit();
706 set_irq_regs(old_regs); 710 set_irq_regs(old_regs);
707 return ret; 711 return ret;
708} 712}
@@ -946,6 +950,11 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
946 *per_cpu_ptr(desc->kstat_irqs, cpu) : 0; 950 *per_cpu_ptr(desc->kstat_irqs, cpu) : 0;
947} 951}
948 952
953static bool irq_is_nmi(struct irq_desc *desc)
954{
955 return desc->istate & IRQS_NMI;
956}
957
949/** 958/**
950 * kstat_irqs - Get the statistics for an interrupt 959 * kstat_irqs - Get the statistics for an interrupt
951 * @irq: The interrupt number 960 * @irq: The interrupt number
@@ -963,7 +972,8 @@ unsigned int kstat_irqs(unsigned int irq)
963 if (!desc || !desc->kstat_irqs) 972 if (!desc || !desc->kstat_irqs)
964 return 0; 973 return 0;
965 if (!irq_settings_is_per_cpu_devid(desc) && 974 if (!irq_settings_is_per_cpu_devid(desc) &&
966 !irq_settings_is_per_cpu(desc)) 975 !irq_settings_is_per_cpu(desc) &&
976 !irq_is_nmi(desc))
967 return desc->tot_count; 977 return desc->tot_count;
968 978
969 for_each_possible_cpu(cpu) 979 for_each_possible_cpu(cpu)
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 9ed29e4a7dbf..3078d0e48bba 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -123,7 +123,7 @@ EXPORT_SYMBOL_GPL(irq_domain_free_fwnode);
123 * @ops: domain callbacks 123 * @ops: domain callbacks
124 * @host_data: Controller private data pointer 124 * @host_data: Controller private data pointer
125 * 125 *
126 * Allocates and initialize and irq_domain structure. 126 * Allocates and initializes an irq_domain structure.
127 * Returns pointer to IRQ domain, or NULL on failure. 127 * Returns pointer to IRQ domain, or NULL on failure.
128 */ 128 */
129struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, 129struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
@@ -139,7 +139,7 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
139 139
140 domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size), 140 domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size),
141 GFP_KERNEL, of_node_to_nid(of_node)); 141 GFP_KERNEL, of_node_to_nid(of_node));
142 if (WARN_ON(!domain)) 142 if (!domain)
143 return NULL; 143 return NULL;
144 144
145 if (fwnode && is_fwnode_irqchip(fwnode)) { 145 if (fwnode && is_fwnode_irqchip(fwnode)) {
@@ -1297,7 +1297,7 @@ int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain,
1297/** 1297/**
1298 * __irq_domain_alloc_irqs - Allocate IRQs from domain 1298 * __irq_domain_alloc_irqs - Allocate IRQs from domain
1299 * @domain: domain to allocate from 1299 * @domain: domain to allocate from
1300 * @irq_base: allocate specified IRQ nubmer if irq_base >= 0 1300 * @irq_base: allocate specified IRQ number if irq_base >= 0
1301 * @nr_irqs: number of IRQs to allocate 1301 * @nr_irqs: number of IRQs to allocate
1302 * @node: NUMA node id for memory allocation 1302 * @node: NUMA node id for memory allocation
1303 * @arg: domain specific argument 1303 * @arg: domain specific argument
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 78f3ddeb7fe4..e8f7f179bf77 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -13,6 +13,7 @@
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/random.h> 14#include <linux/random.h>
15#include <linux/interrupt.h> 15#include <linux/interrupt.h>
16#include <linux/irqdomain.h>
16#include <linux/slab.h> 17#include <linux/slab.h>
17#include <linux/sched.h> 18#include <linux/sched.h>
18#include <linux/sched/rt.h> 19#include <linux/sched/rt.h>
@@ -34,8 +35,9 @@ static int __init setup_forced_irqthreads(char *arg)
34early_param("threadirqs", setup_forced_irqthreads); 35early_param("threadirqs", setup_forced_irqthreads);
35#endif 36#endif
36 37
37static void __synchronize_hardirq(struct irq_desc *desc) 38static void __synchronize_hardirq(struct irq_desc *desc, bool sync_chip)
38{ 39{
40 struct irq_data *irqd = irq_desc_get_irq_data(desc);
39 bool inprogress; 41 bool inprogress;
40 42
41 do { 43 do {
@@ -51,6 +53,20 @@ static void __synchronize_hardirq(struct irq_desc *desc)
51 /* Ok, that indicated we're done: double-check carefully. */ 53 /* Ok, that indicated we're done: double-check carefully. */
52 raw_spin_lock_irqsave(&desc->lock, flags); 54 raw_spin_lock_irqsave(&desc->lock, flags);
53 inprogress = irqd_irq_inprogress(&desc->irq_data); 55 inprogress = irqd_irq_inprogress(&desc->irq_data);
56
57 /*
58 * If requested and supported, check at the chip whether it
59 * is in flight at the hardware level, i.e. already pending
60 * in a CPU and waiting for service and acknowledge.
61 */
62 if (!inprogress && sync_chip) {
63 /*
64 * Ignore the return code. inprogress is only updated
65 * when the chip supports it.
66 */
67 __irq_get_irqchip_state(irqd, IRQCHIP_STATE_ACTIVE,
68 &inprogress);
69 }
54 raw_spin_unlock_irqrestore(&desc->lock, flags); 70 raw_spin_unlock_irqrestore(&desc->lock, flags);
55 71
56 /* Oops, that failed? */ 72 /* Oops, that failed? */
@@ -73,13 +89,18 @@ static void __synchronize_hardirq(struct irq_desc *desc)
73 * Returns: false if a threaded handler is active. 89 * Returns: false if a threaded handler is active.
74 * 90 *
75 * This function may be called - with care - from IRQ context. 91 * This function may be called - with care - from IRQ context.
92 *
93 * It does not check whether there is an interrupt in flight at the
94 * hardware level, but not serviced yet, as this might deadlock when
95 * called with interrupts disabled and the target CPU of the interrupt
96 * is the current CPU.
76 */ 97 */
77bool synchronize_hardirq(unsigned int irq) 98bool synchronize_hardirq(unsigned int irq)
78{ 99{
79 struct irq_desc *desc = irq_to_desc(irq); 100 struct irq_desc *desc = irq_to_desc(irq);
80 101
81 if (desc) { 102 if (desc) {
82 __synchronize_hardirq(desc); 103 __synchronize_hardirq(desc, false);
83 return !atomic_read(&desc->threads_active); 104 return !atomic_read(&desc->threads_active);
84 } 105 }
85 106
@@ -95,14 +116,19 @@ EXPORT_SYMBOL(synchronize_hardirq);
95 * to complete before returning. If you use this function while 116 * to complete before returning. If you use this function while
96 * holding a resource the IRQ handler may need you will deadlock. 117 * holding a resource the IRQ handler may need you will deadlock.
97 * 118 *
98 * This function may be called - with care - from IRQ context. 119 * Can only be called from preemptible code as it might sleep when
120 * an interrupt thread is associated to @irq.
121 *
122 * It optionally makes sure (when the irq chip supports that method)
123 * that the interrupt is not pending in any CPU and waiting for
124 * service.
99 */ 125 */
100void synchronize_irq(unsigned int irq) 126void synchronize_irq(unsigned int irq)
101{ 127{
102 struct irq_desc *desc = irq_to_desc(irq); 128 struct irq_desc *desc = irq_to_desc(irq);
103 129
104 if (desc) { 130 if (desc) {
105 __synchronize_hardirq(desc); 131 __synchronize_hardirq(desc, true);
106 /* 132 /*
107 * We made sure that no hardirq handler is 133 * We made sure that no hardirq handler is
108 * running. Now verify that no threaded handlers are 134 * running. Now verify that no threaded handlers are
@@ -1699,6 +1725,7 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id)
1699 /* If this was the last handler, shut down the IRQ line: */ 1725 /* If this was the last handler, shut down the IRQ line: */
1700 if (!desc->action) { 1726 if (!desc->action) {
1701 irq_settings_clr_disable_unlazy(desc); 1727 irq_settings_clr_disable_unlazy(desc);
1728 /* Only shutdown. Deactivate after synchronize_hardirq() */
1702 irq_shutdown(desc); 1729 irq_shutdown(desc);
1703 } 1730 }
1704 1731
@@ -1727,8 +1754,12 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id)
1727 1754
1728 unregister_handler_proc(irq, action); 1755 unregister_handler_proc(irq, action);
1729 1756
1730 /* Make sure it's not being used on another CPU: */ 1757 /*
1731 synchronize_hardirq(irq); 1758 * Make sure it's not being used on another CPU and if the chip
1759 * supports it also make sure that there is no (not yet serviced)
1760 * interrupt in flight at the hardware level.
1761 */
1762 __synchronize_hardirq(desc, true);
1732 1763
1733#ifdef CONFIG_DEBUG_SHIRQ 1764#ifdef CONFIG_DEBUG_SHIRQ
1734 /* 1765 /*
@@ -1768,6 +1799,14 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id)
1768 * require it to deallocate resources over the slow bus. 1799 * require it to deallocate resources over the slow bus.
1769 */ 1800 */
1770 chip_bus_lock(desc); 1801 chip_bus_lock(desc);
1802 /*
1803 * There is no interrupt on the fly anymore. Deactivate it
1804 * completely.
1805 */
1806 raw_spin_lock_irqsave(&desc->lock, flags);
1807 irq_domain_deactivate_irq(&desc->irq_data);
1808 raw_spin_unlock_irqrestore(&desc->lock, flags);
1809
1771 irq_release_resources(desc); 1810 irq_release_resources(desc);
1772 chip_bus_sync_unlock(desc); 1811 chip_bus_sync_unlock(desc);
1773 irq_remove_timings(desc); 1812 irq_remove_timings(desc);
@@ -1855,7 +1894,7 @@ static const void *__cleanup_nmi(unsigned int irq, struct irq_desc *desc)
1855 } 1894 }
1856 1895
1857 irq_settings_clr_disable_unlazy(desc); 1896 irq_settings_clr_disable_unlazy(desc);
1858 irq_shutdown(desc); 1897 irq_shutdown_and_deactivate(desc);
1859 1898
1860 irq_release_resources(desc); 1899 irq_release_resources(desc);
1861 1900
@@ -2578,6 +2617,28 @@ out:
2578 irq_put_desc_unlock(desc, flags); 2617 irq_put_desc_unlock(desc, flags);
2579} 2618}
2580 2619
2620int __irq_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state which,
2621 bool *state)
2622{
2623 struct irq_chip *chip;
2624 int err = -EINVAL;
2625
2626 do {
2627 chip = irq_data_get_irq_chip(data);
2628 if (chip->irq_get_irqchip_state)
2629 break;
2630#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
2631 data = data->parent_data;
2632#else
2633 data = NULL;
2634#endif
2635 } while (data);
2636
2637 if (data)
2638 err = chip->irq_get_irqchip_state(data, which, state);
2639 return err;
2640}
2641
2581/** 2642/**
2582 * irq_get_irqchip_state - returns the irqchip state of an interrupt. 2643 * irq_get_irqchip_state - returns the irqchip state of an interrupt.
2583 * @irq: Interrupt line that is forwarded to a VM 2644 * @irq: Interrupt line that is forwarded to a VM
@@ -2596,7 +2657,6 @@ int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
2596{ 2657{
2597 struct irq_desc *desc; 2658 struct irq_desc *desc;
2598 struct irq_data *data; 2659 struct irq_data *data;
2599 struct irq_chip *chip;
2600 unsigned long flags; 2660 unsigned long flags;
2601 int err = -EINVAL; 2661 int err = -EINVAL;
2602 2662
@@ -2606,19 +2666,7 @@ int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
2606 2666
2607 data = irq_desc_get_irq_data(desc); 2667 data = irq_desc_get_irq_data(desc);
2608 2668
2609 do { 2669 err = __irq_get_irqchip_state(data, which, state);
2610 chip = irq_data_get_irq_chip(data);
2611 if (chip->irq_get_irqchip_state)
2612 break;
2613#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
2614 data = data->parent_data;
2615#else
2616 data = NULL;
2617#endif
2618 } while (data);
2619
2620 if (data)
2621 err = chip->irq_get_irqchip_state(data, which, state);
2622 2670
2623 irq_put_desc_busunlock(desc, flags); 2671 irq_put_desc_busunlock(desc, flags);
2624 return err; 2672 return err;
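The hunk above only moves the chip-hierarchy walk into __irq_get_irqchip_state() so it can be reused; the public entry point keeps its signature. A small, hypothetical caller, just to show the calling convention (error handling simplified):

#include <linux/interrupt.h>

static bool example_irq_pending(unsigned int irq)
{
	bool pending = false;

	/*
	 * Returns -EINVAL when no chip in the hierarchy implements
	 * irq_get_irqchip_state(); treat that as "not pending" here.
	 */
	if (irq_get_irqchip_state(irq, IRQCHIP_STATE_PENDING, &pending))
		return false;

	return pending;
}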
diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c
index 90c735da15d0..e960d7ce7bcc 100644
--- a/kernel/irq/timings.c
+++ b/kernel/irq/timings.c
@@ -1,10 +1,12 @@
1// SPDX-License-Identifier: GPL-2.0 1// SPDX-License-Identifier: GPL-2.0
2// Copyright (C) 2016, Linaro Ltd - Daniel Lezcano <daniel.lezcano@linaro.org> 2// Copyright (C) 2016, Linaro Ltd - Daniel Lezcano <daniel.lezcano@linaro.org>
3#define pr_fmt(fmt) "irq_timings: " fmt
3 4
4#include <linux/kernel.h> 5#include <linux/kernel.h>
5#include <linux/percpu.h> 6#include <linux/percpu.h>
6#include <linux/slab.h> 7#include <linux/slab.h>
7#include <linux/static_key.h> 8#include <linux/static_key.h>
9#include <linux/init.h>
8#include <linux/interrupt.h> 10#include <linux/interrupt.h>
9#include <linux/idr.h> 11#include <linux/idr.h>
10#include <linux/irq.h> 12#include <linux/irq.h>
@@ -261,12 +263,29 @@ void irq_timings_disable(void)
261#define EMA_ALPHA_VAL 64 263#define EMA_ALPHA_VAL 64
262#define EMA_ALPHA_SHIFT 7 264#define EMA_ALPHA_SHIFT 7
263 265
264#define PREDICTION_PERIOD_MIN 2 266#define PREDICTION_PERIOD_MIN 3
265#define PREDICTION_PERIOD_MAX 5 267#define PREDICTION_PERIOD_MAX 5
266#define PREDICTION_FACTOR 4 268#define PREDICTION_FACTOR 4
267#define PREDICTION_MAX 10 /* 2 ^ PREDICTION_MAX useconds */ 269#define PREDICTION_MAX 10 /* 2 ^ PREDICTION_MAX useconds */
268#define PREDICTION_BUFFER_SIZE 16 /* slots for EMAs, hardly more than 16 */ 270#define PREDICTION_BUFFER_SIZE 16 /* slots for EMAs, hardly more than 16 */
269 271
272/*
273 * Number of elements in the circular buffer: if it happened to be
274 * flushed before, then the number of elements could be smaller than
275 * IRQ_TIMINGS_SIZE, so the count is used, otherwise the array size is
276 * used as we wrapped. The index begins from zero when we did not
277 * wrap. That could be done in a nicer way with the proper circular
278 * array structure type but with the cost of extra computation in the
279 * interrupt handler hot path. We choose efficiency.
280 */
281#define for_each_irqts(i, irqts) \
282 for (i = irqts->count < IRQ_TIMINGS_SIZE ? \
283 0 : irqts->count & IRQ_TIMINGS_MASK, \
284 irqts->count = min(IRQ_TIMINGS_SIZE, \
285 irqts->count); \
286 irqts->count > 0; irqts->count--, \
287 i = (i + 1) & IRQ_TIMINGS_MASK)
288
270struct irqt_stat { 289struct irqt_stat {
271 u64 last_ts; 290 u64 last_ts;
272 u64 ema_time[PREDICTION_BUFFER_SIZE]; 291 u64 ema_time[PREDICTION_BUFFER_SIZE];
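The for_each_irqts() macro introduced above consumes the buffer while walking it: it starts at the oldest slot when the ring has wrapped, clamps the element count to the array size, and decrements the count down to zero. A standalone userspace sketch of the same behaviour (simplified types; SIZE stands in for IRQ_TIMINGS_SIZE and must stay a power of two):

#include <stdio.h>

#define SIZE	8		/* stands in for IRQ_TIMINGS_SIZE */
#define MASK	(SIZE - 1)

struct timings {
	unsigned int count;	/* total number of pushes, may exceed SIZE */
	int values[SIZE];
};

static void consume_all(struct timings *t)
{
	/* Oldest slot: zero if we did not wrap, count & MASK otherwise. */
	unsigned int i = t->count < SIZE ? 0 : t->count & MASK;

	if (t->count > SIZE)
		t->count = SIZE;	/* only SIZE entries are still valid */

	for (; t->count > 0; t->count--, i = (i + 1) & MASK)
		printf("%d ", t->values[i]);
	printf("\n");
}

int main(void)
{
	struct timings t = { .count = 0 };

	/* Push 11 values into an 8-slot ring: the oldest three are lost. */
	for (int v = 0; v < 11; v++)
		t.values[t.count++ & MASK] = v;

	consume_all(&t);	/* prints 3 4 5 6 7 8 9 10 */
	return 0;
}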
@@ -297,7 +316,16 @@ static u64 irq_timings_ema_new(u64 value, u64 ema_old)
297 316
298static int irq_timings_next_event_index(int *buffer, size_t len, int period_max) 317static int irq_timings_next_event_index(int *buffer, size_t len, int period_max)
299{ 318{
300 int i; 319 int period;
320
321 /*
322 * Move the beginning pointer to the end minus the max period x 3.
323 * We are then at the point where we can begin searching for the pattern.
324 */
325 buffer = &buffer[len - (period_max * 3)];
326
327 /* Adjust the length to the maximum allowed period x 3 */
328 len = period_max * 3;
301 329
302 /* 330 /*
303 * The buffer contains the suite of intervals, in a ilog2 331 * The buffer contains the suite of intervals, in a ilog2
@@ -306,21 +334,45 @@ static int irq_timings_next_event_index(int *buffer, size_t len, int period_max)
306 * period beginning at the end of the buffer. We do that for 334 * period beginning at the end of the buffer. We do that for
307 * each suffix. 335 * each suffix.
308 */ 336 */
309 for (i = period_max; i >= PREDICTION_PERIOD_MIN ; i--) { 337 for (period = period_max; period >= PREDICTION_PERIOD_MIN; period--) {
310 338
311 int *begin = &buffer[len - (i * 3)]; 339 /*
312 int *ptr = begin; 340 * The first comparison always succeeds because the
341 * suffix is deduced from the first n-period bytes of
342 * the buffer and we compare the initial suffix with
343 * itself, so we can skip the first iteration.
344 */
345 int idx = period;
346 size_t size = period;
313 347
314 /* 348 /*
315 * We look if the suite with period 'i' repeat 349 * We look if the suite with period 'i' repeat
316 * itself. If it is truncated at the end, as it 350 * itself. If it is truncated at the end, as it
317 * repeats we can use the period to find out the next 351 * repeats we can use the period to find out the next
318 * element. 352 * element with the modulo.
319 */ 353 */
320 while (!memcmp(ptr, begin, i * sizeof(*ptr))) { 354 while (!memcmp(buffer, &buffer[idx], size * sizeof(int))) {
321 ptr += i; 355
322 if (ptr >= &buffer[len]) 356 /*
323 return begin[((i * 3) % i)]; 357 * Move the index in a period basis
358 */
359 idx += size;
360
361 /*
362 * If this condition is reached, all previous
363 * memcmp were successful, so the period is
364 * found.
365 */
366 if (idx == len)
367 return buffer[len % period];
368
369 /*
370 * If the remaining elements to compare are
371 * smaller than the period, readjust the size
372 * of the comparison for the last iteration.
373 */
374 if (len - idx < period)
375 size = len - idx;
324 } 376 }
325 } 377 }
326 378
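A standalone sketch of the suffix search above, with hypothetical values: the stored ilog2 indexes repeat with period 3 ({4, 5, 6}), so after the trailing 4, 5 the predicted next index is 6 (PREDICTION_PERIOD_MIN is taken as 3, matching the hunk):

#include <stdio.h>
#include <string.h>

#define PERIOD_MIN	3	/* mirrors PREDICTION_PERIOD_MIN above */

static int next_event_index(int *buffer, size_t len, int period_max)
{
	int period;

	/* Only the last period_max * 3 elements are considered. */
	buffer = &buffer[len - (period_max * 3)];
	len = period_max * 3;

	for (period = period_max; period >= PERIOD_MIN; period--) {
		int idx = period;
		size_t size = period;

		while (!memcmp(buffer, &buffer[idx], size * sizeof(int))) {
			idx += size;
			if (idx == len)
				return buffer[len % period];
			if (len - idx < period)
				size = len - idx;
		}
	}
	return -1;	/* no repeating pattern found */
}

int main(void)
{
	/* ilog2 indexes of a repeating 3-interval pattern */
	int buf[] = { 4, 5, 6, 4, 5, 6, 4, 5, 6, 4, 5 };
	size_t len = sizeof(buf) / sizeof(buf[0]);
	int period_max = len / 3;	/* = 3, as the selftest computes it */

	printf("next index: %d\n", next_event_index(buf, len, period_max));
	return 0;	/* prints "next index: 6" */
}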
@@ -380,11 +432,43 @@ static u64 __irq_timings_next_event(struct irqt_stat *irqs, int irq, u64 now)
380 return irqs->last_ts + irqs->ema_time[index]; 432 return irqs->last_ts + irqs->ema_time[index];
381} 433}
382 434
435static __always_inline int irq_timings_interval_index(u64 interval)
436{
437 /*
438 * The PREDICTION_FACTOR increase the interval size for the
439 * array of exponential average.
440 */
441 u64 interval_us = (interval >> 10) / PREDICTION_FACTOR;
442
443 return likely(interval_us) ? ilog2(interval_us) : 0;
444}
445
446static __always_inline void __irq_timings_store(int irq, struct irqt_stat *irqs,
447 u64 interval)
448{
449 int index;
450
451 /*
452 * Get the index in the ema table for this interrupt.
453 */
454 index = irq_timings_interval_index(interval);
455
456 /*
457 * Store the index as an element of the pattern in another
458 * circular array.
459 */
460 irqs->circ_timings[irqs->count & IRQ_TIMINGS_MASK] = index;
461
462 irqs->ema_time[index] = irq_timings_ema_new(interval,
463 irqs->ema_time[index]);
464
465 irqs->count++;
466}
467
383static inline void irq_timings_store(int irq, struct irqt_stat *irqs, u64 ts) 468static inline void irq_timings_store(int irq, struct irqt_stat *irqs, u64 ts)
384{ 469{
385 u64 old_ts = irqs->last_ts; 470 u64 old_ts = irqs->last_ts;
386 u64 interval; 471 u64 interval;
387 int index;
388 472
389 /* 473 /*
390 * The timestamps are absolute time values, we need to compute 474 * The timestamps are absolute time values, we need to compute
@@ -415,24 +499,7 @@ static inline void irq_timings_store(int irq, struct irqt_stat *irqs, u64 ts)
415 return; 499 return;
416 } 500 }
417 501
418 /* 502 __irq_timings_store(irq, irqs, interval);
419 * Get the index in the ema table for this interrupt. The
420 * PREDICTION_FACTOR increase the interval size for the array
421 * of exponential average.
422 */
423 index = likely(interval) ?
424 ilog2((interval >> 10) / PREDICTION_FACTOR) : 0;
425
426 /*
427 * Store the index as an element of the pattern in another
428 * circular array.
429 */
430 irqs->circ_timings[irqs->count & IRQ_TIMINGS_MASK] = index;
431
432 irqs->ema_time[index] = irq_timings_ema_new(interval,
433 irqs->ema_time[index]);
434
435 irqs->count++;
436} 503}
437 504
438/** 505/**
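The interval classification factored out above maps a nanosecond interval to a slot of the ema_time[] array: shift right by 10 to approximate a microsecond conversion, divide by PREDICTION_FACTOR to widen the buckets, then take ilog2. A standalone userspace sketch with a naive ilog2, fed sample values taken from the selftest intervals further down:

#include <stdio.h>

#define PREDICTION_FACTOR	4

static int ilog2_u64(unsigned long long v)
{
	int i = -1;

	while (v) {
		v >>= 1;
		i++;
	}
	return i;
}

static int interval_index(unsigned long long interval_ns)
{
	/* ns -> ~us, then widen the buckets by PREDICTION_FACTOR */
	unsigned long long us = (interval_ns >> 10) / PREDICTION_FACTOR;

	return us ? ilog2_u64(us) : 0;
}

int main(void)
{
	unsigned long long samples[] = { 10000, 50000, 200000, 500000 };

	for (int i = 0; i < 4; i++)
		printf("%llu ns -> index %d\n",
		       samples[i], interval_index(samples[i]));
	return 0;	/* prints indexes 1, 3, 5 and 6 */
}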
@@ -493,11 +560,7 @@ u64 irq_timings_next_event(u64 now)
493 * model while decrementing the counter because we consume the 560 * model while decrementing the counter because we consume the
494 * data from our circular buffer. 561 * data from our circular buffer.
495 */ 562 */
496 563 for_each_irqts(i, irqts) {
497 i = (irqts->count & IRQ_TIMINGS_MASK) - 1;
498 irqts->count = min(IRQ_TIMINGS_SIZE, irqts->count);
499
500 for (; irqts->count > 0; irqts->count--, i = (i + 1) & IRQ_TIMINGS_MASK) {
501 irq = irq_timing_decode(irqts->values[i], &ts); 564 irq = irq_timing_decode(irqts->values[i], &ts);
502 s = idr_find(&irqt_stats, irq); 565 s = idr_find(&irqt_stats, irq);
503 if (s) 566 if (s)
@@ -564,3 +627,325 @@ int irq_timings_alloc(int irq)
564 627
565 return 0; 628 return 0;
566} 629}
630
631#ifdef CONFIG_TEST_IRQ_TIMINGS
632struct timings_intervals {
633 u64 *intervals;
634 size_t count;
635};
636
637/*
638 * Intervals are given in nanosecond base
639 */
640static u64 intervals0[] __initdata = {
641 10000, 50000, 200000, 500000,
642 10000, 50000, 200000, 500000,
643 10000, 50000, 200000, 500000,
644 10000, 50000, 200000, 500000,
645 10000, 50000, 200000, 500000,
646 10000, 50000, 200000, 500000,
647 10000, 50000, 200000, 500000,
648 10000, 50000, 200000, 500000,
649 10000, 50000, 200000,
650};
651
652static u64 intervals1[] __initdata = {
653 223947000, 1240000, 1384000, 1386000, 1386000,
654 217416000, 1236000, 1384000, 1386000, 1387000,
655 214719000, 1241000, 1386000, 1387000, 1384000,
656 213696000, 1234000, 1384000, 1386000, 1388000,
657 219904000, 1240000, 1385000, 1389000, 1385000,
658 212240000, 1240000, 1386000, 1386000, 1386000,
659 214415000, 1236000, 1384000, 1386000, 1387000,
660 214276000, 1234000,
661};
662
663static u64 intervals2[] __initdata = {
664 4000, 3000, 5000, 100000,
665 3000, 3000, 5000, 117000,
666 4000, 4000, 5000, 112000,
667 4000, 3000, 4000, 110000,
668 3000, 5000, 3000, 117000,
669 4000, 4000, 5000, 112000,
670 4000, 3000, 4000, 110000,
671 3000, 4000, 5000, 112000,
672 4000,
673};
674
675static u64 intervals3[] __initdata = {
676 1385000, 212240000, 1240000,
677 1386000, 214415000, 1236000,
678 1384000, 214276000, 1234000,
679 1386000, 214415000, 1236000,
680 1385000, 212240000, 1240000,
681 1386000, 214415000, 1236000,
682 1384000, 214276000, 1234000,
683 1386000, 214415000, 1236000,
684 1385000, 212240000, 1240000,
685};
686
687static u64 intervals4[] __initdata = {
688 10000, 50000, 10000, 50000,
689 10000, 50000, 10000, 50000,
690 10000, 50000, 10000, 50000,
691 10000, 50000, 10000, 50000,
692 10000, 50000, 10000, 50000,
693 10000, 50000, 10000, 50000,
694 10000, 50000, 10000, 50000,
695 10000, 50000, 10000, 50000,
696 10000,
697};
698
699static struct timings_intervals tis[] __initdata = {
700 { intervals0, ARRAY_SIZE(intervals0) },
701 { intervals1, ARRAY_SIZE(intervals1) },
702 { intervals2, ARRAY_SIZE(intervals2) },
703 { intervals3, ARRAY_SIZE(intervals3) },
704 { intervals4, ARRAY_SIZE(intervals4) },
705};
706
707static int __init irq_timings_test_next_index(struct timings_intervals *ti)
708{
709 int _buffer[IRQ_TIMINGS_SIZE];
710 int buffer[IRQ_TIMINGS_SIZE];
711 int index, start, i, count, period_max;
712
713 count = ti->count - 1;
714
715 period_max = count > (3 * PREDICTION_PERIOD_MAX) ?
716 PREDICTION_PERIOD_MAX : count / 3;
717
718 /*
719 * Inject all values except the last one which will be used
720 * to compare with the next index result.
721 */
722 pr_debug("index suite: ");
723
724 for (i = 0; i < count; i++) {
725 index = irq_timings_interval_index(ti->intervals[i]);
726 _buffer[i & IRQ_TIMINGS_MASK] = index;
727 pr_cont("%d ", index);
728 }
729
730 start = count < IRQ_TIMINGS_SIZE ? 0 :
731 count & IRQ_TIMINGS_MASK;
732
733 count = min_t(int, count, IRQ_TIMINGS_SIZE);
734
735 for (i = 0; i < count; i++) {
736 int index = (start + i) & IRQ_TIMINGS_MASK;
737 buffer[i] = _buffer[index];
738 }
739
740 index = irq_timings_next_event_index(buffer, count, period_max);
741 i = irq_timings_interval_index(ti->intervals[ti->count - 1]);
742
743 if (index != i) {
744 pr_err("Expected (%d) and computed (%d) next indexes differ\n",
745 i, index);
746 return -EINVAL;
747 }
748
749 return 0;
750}
751
752static int __init irq_timings_next_index_selftest(void)
753{
754 int i, ret;
755
756 for (i = 0; i < ARRAY_SIZE(tis); i++) {
757
758 pr_info("---> Injecting intervals number #%d (count=%zd)\n",
759 i, tis[i].count);
760
761 ret = irq_timings_test_next_index(&tis[i]);
762 if (ret)
763 break;
764 }
765
766 return ret;
767}
768
769static int __init irq_timings_test_irqs(struct timings_intervals *ti)
770{
771 struct irqt_stat __percpu *s;
772 struct irqt_stat *irqs;
773 int i, index, ret, irq = 0xACE5;
774
775 ret = irq_timings_alloc(irq);
776 if (ret) {
777 pr_err("Failed to allocate irq timings\n");
778 return ret;
779 }
780
781 s = idr_find(&irqt_stats, irq);
782 if (!s) {
783 ret = -EIDRM;
784 goto out;
785 }
786
787 irqs = this_cpu_ptr(s);
788
789 for (i = 0; i < ti->count; i++) {
790
791 index = irq_timings_interval_index(ti->intervals[i]);
792 pr_debug("%d: interval=%llu ema_index=%d\n",
793 i, ti->intervals[i], index);
794
795 __irq_timings_store(irq, irqs, ti->intervals[i]);
796 if (irqs->circ_timings[i & IRQ_TIMINGS_MASK] != index) {
797 pr_err("Failed to store in the circular buffer\n");
798 goto out;
799 }
800 }
801
802 if (irqs->count != ti->count) {
803 pr_err("Count differs\n");
804 goto out;
805 }
806
807 ret = 0;
808out:
809 irq_timings_free(irq);
810
811 return ret;
812}
813
814static int __init irq_timings_irqs_selftest(void)
815{
816 int i, ret;
817
818 for (i = 0; i < ARRAY_SIZE(tis); i++) {
819 pr_info("---> Injecting intervals number #%d (count=%zd)\n",
820 i, tis[i].count);
821 ret = irq_timings_test_irqs(&tis[i]);
822 if (ret)
823 break;
824 }
825
826 return ret;
827}
828
829static int __init irq_timings_test_irqts(struct irq_timings *irqts,
830 unsigned count)
831{
832 int start = count >= IRQ_TIMINGS_SIZE ? count - IRQ_TIMINGS_SIZE : 0;
833 int i, irq, oirq = 0xBEEF;
834 u64 ots = 0xDEAD, ts;
835
836 /*
837 * Fill the circular buffer by using the dedicated function.
838 */
839 for (i = 0; i < count; i++) {
840 pr_debug("%d: index=%d, ts=%llX irq=%X\n",
841 i, i & IRQ_TIMINGS_MASK, ots + i, oirq + i);
842
843 irq_timings_push(ots + i, oirq + i);
844 }
845
846 /*
847 * Compute the expected values of the first elements after the
848 * index wrapped around, or not.
849 */
850 ots += start;
851 oirq += start;
852
853 /*
854 * Test the circular buffer count is correct.
855 */
856 pr_debug("---> Checking timings array count (%d) is right\n", count);
857 if (WARN_ON(irqts->count != count))
858 return -EINVAL;
859
860 /*
861 * Test the macro allowing to browse all the irqts.
862 */
863 pr_debug("---> Checking the for_each_irqts() macro\n");
864 for_each_irqts(i, irqts) {
865
866 irq = irq_timing_decode(irqts->values[i], &ts);
867
868 pr_debug("index=%d, ts=%llX / %llX, irq=%X / %X\n",
869 i, ts, ots, irq, oirq);
870
871 if (WARN_ON(ts != ots || irq != oirq))
872 return -EINVAL;
873
874 ots++; oirq++;
875 }
876
877 /*
878 * The circular buffer should have be flushed when browsed
879 * with for_each_irqts
880 */
881 pr_debug("---> Checking timings array is empty after browsing it\n");
882 if (WARN_ON(irqts->count))
883 return -EINVAL;
884
885 return 0;
886}
887
888static int __init irq_timings_irqts_selftest(void)
889{
890 struct irq_timings *irqts = this_cpu_ptr(&irq_timings);
891 int i, ret;
892
893 /*
894 * Test the circular buffer with different numbers of
895 * elements. The purpose is to test at the limits (empty, half
896 * full, full, wrapped with the cursor at the boundaries,
897 * wrapped several times, etc.).
898 */
899 int count[] = { 0,
900 IRQ_TIMINGS_SIZE >> 1,
901 IRQ_TIMINGS_SIZE,
902 IRQ_TIMINGS_SIZE + (IRQ_TIMINGS_SIZE >> 1),
903 2 * IRQ_TIMINGS_SIZE,
904 (2 * IRQ_TIMINGS_SIZE) + 3,
905 };
906
907 for (i = 0; i < ARRAY_SIZE(count); i++) {
908
909 pr_info("---> Checking the timings with %d/%d values\n",
910 count[i], IRQ_TIMINGS_SIZE);
911
912 ret = irq_timings_test_irqts(irqts, count[i]);
913 if (ret)
914 break;
915 }
916
917 return ret;
918}
919
920static int __init irq_timings_selftest(void)
921{
922 int ret;
923
924 pr_info("------------------- selftest start -----------------\n");
925
926 /*
927 * At this point, we don't expect any subsystem to use the irq
928 * timings but us, so it should not be enabled.
929 */
930 if (static_branch_unlikely(&irq_timing_enabled)) {
931 pr_warn("irq timings already initialized, skipping selftest\n");
932 return 0;
933 }
934
935 ret = irq_timings_irqts_selftest();
936 if (ret)
937 goto out;
938
939 ret = irq_timings_irqs_selftest();
940 if (ret)
941 goto out;
942
943 ret = irq_timings_next_index_selftest();
944out:
945 pr_info("---------- selftest end with %s -----------\n",
946 ret ? "failure" : "success");
947
948 return ret;
949}
950early_initcall(irq_timings_selftest);
951#endif
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 73288914ed5e..d42acaf81886 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra 3 * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra
3 * 4 *
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index de6efdecc70d..df3008419a1d 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * jump label support 3 * jump label support
3 * 4 *
@@ -36,12 +37,26 @@ static int jump_label_cmp(const void *a, const void *b)
36 const struct jump_entry *jea = a; 37 const struct jump_entry *jea = a;
37 const struct jump_entry *jeb = b; 38 const struct jump_entry *jeb = b;
38 39
40 /*
41 * Entries are sorted by key.
42 */
39 if (jump_entry_key(jea) < jump_entry_key(jeb)) 43 if (jump_entry_key(jea) < jump_entry_key(jeb))
40 return -1; 44 return -1;
41 45
42 if (jump_entry_key(jea) > jump_entry_key(jeb)) 46 if (jump_entry_key(jea) > jump_entry_key(jeb))
43 return 1; 47 return 1;
44 48
49 /*
50 * In the batching mode, entries should also be sorted by the code
51 * inside the already sorted list of entries, enabling a bsearch in
52 * the vector.
53 */
54 if (jump_entry_code(jea) < jump_entry_code(jeb))
55 return -1;
56
57 if (jump_entry_code(jea) > jump_entry_code(jeb))
58 return 1;
59
45 return 0; 60 return 0;
46} 61}
47 62
@@ -383,25 +398,55 @@ static enum jump_label_type jump_label_type(struct jump_entry *entry)
383 return enabled ^ branch; 398 return enabled ^ branch;
384} 399}
385 400
401static bool jump_label_can_update(struct jump_entry *entry, bool init)
402{
403 /*
404 * Cannot update code that was in an init text area.
405 */
406 if (!init && jump_entry_is_init(entry))
407 return false;
408
409 if (!kernel_text_address(jump_entry_code(entry))) {
410 WARN_ONCE(1, "can't patch jump_label at %pS", (void *)jump_entry_code(entry));
411 return false;
412 }
413
414 return true;
415}
416
417#ifndef HAVE_JUMP_LABEL_BATCH
386static void __jump_label_update(struct static_key *key, 418static void __jump_label_update(struct static_key *key,
387 struct jump_entry *entry, 419 struct jump_entry *entry,
388 struct jump_entry *stop, 420 struct jump_entry *stop,
389 bool init) 421 bool init)
390{ 422{
391 for (; (entry < stop) && (jump_entry_key(entry) == key); entry++) { 423 for (; (entry < stop) && (jump_entry_key(entry) == key); entry++) {
392 /* 424 if (jump_label_can_update(entry, init))
393 * An entry->code of 0 indicates an entry which has been 425 arch_jump_label_transform(entry, jump_label_type(entry));
394 * disabled because it was in an init text area. 426 }
395 */ 427}
396 if (init || !jump_entry_is_init(entry)) { 428#else
397 if (kernel_text_address(jump_entry_code(entry))) 429static void __jump_label_update(struct static_key *key,
398 arch_jump_label_transform(entry, jump_label_type(entry)); 430 struct jump_entry *entry,
399 else 431 struct jump_entry *stop,
400 WARN_ONCE(1, "can't patch jump_label at %pS", 432 bool init)
401 (void *)jump_entry_code(entry)); 433{
434 for (; (entry < stop) && (jump_entry_key(entry) == key); entry++) {
435
436 if (!jump_label_can_update(entry, init))
437 continue;
438
439 if (!arch_jump_label_transform_queue(entry, jump_label_type(entry))) {
440 /*
441 * Queue is full: Apply the current queue and try again.
442 */
443 arch_jump_label_transform_apply();
444 BUG_ON(!arch_jump_label_transform_queue(entry, jump_label_type(entry)));
402 } 445 }
403 } 446 }
447 arch_jump_label_transform_apply();
404} 448}
449#endif
405 450
406void __init jump_label_init(void) 451void __init jump_label_init(void)
407{ 452{
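When HAVE_JUMP_LABEL_BATCH is available, the update loop above queues each transformation and pays the expensive synchronization once per flush instead of once per entry. A generic userspace sketch of that queue-then-apply pattern (the arch_* interfaces are only mimicked, not used):

#include <stdbool.h>
#include <stdio.h>

#define BATCH_MAX	4

static int queue[BATCH_MAX];
static int queued;

static bool batch_queue(int entry)
{
	if (queued == BATCH_MAX)
		return false;		/* caller must flush and retry */
	queue[queued++] = entry;
	return true;
}

static void batch_apply(void)
{
	for (int i = 0; i < queued; i++)
		printf("patch entry %d\n", queue[i]);
	queued = 0;			/* one expensive sync per batch */
}

int main(void)
{
	for (int entry = 0; entry < 10; entry++) {
		if (!batch_queue(entry)) {
			batch_apply();
			batch_queue(entry);	/* cannot fail on an empty queue */
		}
	}
	batch_apply();			/* flush the tail of the batch */
	return 0;
}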
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 14934afa9e68..95a260f9214b 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * kallsyms.c: in-kernel printing of symbolic oopses and stack traces. 3 * kallsyms.c: in-kernel printing of symbolic oopses and stack traces.
3 * 4 *
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 68559808fdfa..1b018f1a6e0d 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1,9 +1,7 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * kexec.c - kexec_load system call 3 * kexec.c - kexec_load system call
3 * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> 4 * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
4 *
5 * This source code is licensed under the GNU General Public License,
6 * Version 2. See the file COPYING for more details.
7 */ 5 */
8 6
9#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 7#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index fd5c95ff9251..d5870723b8ad 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -1,9 +1,7 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * kexec.c - kexec system call core code. 3 * kexec.c - kexec system call core code.
3 * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> 4 * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
4 *
5 * This source code is licensed under the GNU General Public License,
6 * Version 2. See the file COPYING for more details.
7 */ 5 */
8 6
9#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 7#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 072b6ee55e3f..b8cc032d5620 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -1,12 +1,10 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * kexec: kexec_file_load system call 3 * kexec: kexec_file_load system call
3 * 4 *
4 * Copyright (C) 2014 Red Hat Inc. 5 * Copyright (C) 2014 Red Hat Inc.
5 * Authors: 6 * Authors:
6 * Vivek Goyal <vgoyal@redhat.com> 7 * Vivek Goyal <vgoyal@redhat.com>
7 *
8 * This source code is licensed under the GNU General Public License,
9 * Version 2. See the file COPYING for more details.
10 */ 8 */
11 9
12#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -198,9 +196,6 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
198 return ret; 196 return ret;
199 image->kernel_buf_len = size; 197 image->kernel_buf_len = size;
200 198
201 /* IMA needs to pass the measurement list to the next kernel. */
202 ima_add_kexec_buffer(image);
203
204 /* Call arch image probe handlers */ 199 /* Call arch image probe handlers */
205 ret = arch_kexec_kernel_image_probe(image, image->kernel_buf, 200 ret = arch_kexec_kernel_image_probe(image, image->kernel_buf,
206 image->kernel_buf_len); 201 image->kernel_buf_len);
@@ -241,8 +236,14 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
241 ret = -EINVAL; 236 ret = -EINVAL;
242 goto out; 237 goto out;
243 } 238 }
239
240 ima_kexec_cmdline(image->cmdline_buf,
241 image->cmdline_buf_len - 1);
244 } 242 }
245 243
244 /* IMA needs to pass the measurement list to the next kernel. */
245 ima_add_kexec_buffer(image);
246
246 /* Call arch image load handlers */ 247 /* Call arch image load handlers */
247 ldata = arch_kexec_kernel_image_load(image); 248 ldata = arch_kexec_kernel_image_load(image);
248 249
diff --git a/kernel/kheaders.c b/kernel/kheaders.c
index 70ae6052920d..8f69772af77b 100644
--- a/kernel/kheaders.c
+++ b/kernel/kheaders.c
@@ -8,9 +8,8 @@
8 8
9#include <linux/kernel.h> 9#include <linux/kernel.h>
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/proc_fs.h> 11#include <linux/kobject.h>
12#include <linux/init.h> 12#include <linux/init.h>
13#include <linux/uaccess.h>
14 13
15/* 14/*
16 * Define kernel_headers_data and kernel_headers_data_end, within which the 15 * Define kernel_headers_data and kernel_headers_data_end, within which the
@@ -31,39 +30,32 @@ extern char kernel_headers_data;
31extern char kernel_headers_data_end; 30extern char kernel_headers_data_end;
32 31
33static ssize_t 32static ssize_t
34ikheaders_read_current(struct file *file, char __user *buf, 33ikheaders_read(struct file *file, struct kobject *kobj,
35 size_t len, loff_t *offset) 34 struct bin_attribute *bin_attr,
35 char *buf, loff_t off, size_t len)
36{ 36{
37 return simple_read_from_buffer(buf, len, offset, 37 memcpy(buf, &kernel_headers_data + off, len);
38 &kernel_headers_data, 38 return len;
39 &kernel_headers_data_end -
40 &kernel_headers_data);
41} 39}
42 40
43static const struct file_operations ikheaders_file_ops = { 41static struct bin_attribute kheaders_attr __ro_after_init = {
44 .read = ikheaders_read_current, 42 .attr = {
45 .llseek = default_llseek, 43 .name = "kheaders.tar.xz",
44 .mode = 0444,
45 },
46 .read = &ikheaders_read,
46}; 47};
47 48
48static int __init ikheaders_init(void) 49static int __init ikheaders_init(void)
49{ 50{
50 struct proc_dir_entry *entry; 51 kheaders_attr.size = (&kernel_headers_data_end -
51 52 &kernel_headers_data);
52 /* create the current headers file */ 53 return sysfs_create_bin_file(kernel_kobj, &kheaders_attr);
53 entry = proc_create("kheaders.tar.xz", S_IRUGO, NULL,
54 &ikheaders_file_ops);
55 if (!entry)
56 return -ENOMEM;
57
58 proc_set_size(entry,
59 &kernel_headers_data_end -
60 &kernel_headers_data);
61 return 0;
62} 54}
63 55
64static void __exit ikheaders_cleanup(void) 56static void __exit ikheaders_cleanup(void)
65{ 57{
66 remove_proc_entry("kheaders.tar.xz", NULL); 58 sysfs_remove_bin_file(kernel_kobj, &kheaders_attr);
67} 59}
68 60
69module_init(ikheaders_init); 61module_init(ikheaders_init);
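After this conversion the archive is exposed as a sysfs binary attribute at /sys/kernel/kheaders.tar.xz instead of a procfs file. A minimal userspace read, assuming kheaders support is built in (or the module loaded) and sysfs is mounted at /sys; a real consumer would feed the stream into a .tar.xz extractor:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/kernel/kheaders.tar.xz", "rb");
	char buf[4096];
	size_t n, total = 0;

	if (!f) {
		perror("kheaders.tar.xz");
		return 1;
	}

	while ((n = fread(buf, 1, sizeof(buf), f)) > 0)
		total += n;	/* count the compressed bytes only */

	fclose(f);
	printf("read %zu bytes of compressed headers\n", total);
	return 0;
}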
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index b1ea30a5540e..9f5433a52488 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1,21 +1,8 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
1/* 2/*
2 * Kernel Probes (KProbes) 3 * Kernel Probes (KProbes)
3 * kernel/kprobes.c 4 * kernel/kprobes.c
4 * 5 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18 *
19 * Copyright (C) IBM Corporation, 2002, 2004 6 * Copyright (C) IBM Corporation, 2002, 2004
20 * 7 *
21 * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel 8 * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
@@ -2583,33 +2570,20 @@ static const struct file_operations fops_kp = {
2583 2570
2584static int __init debugfs_kprobe_init(void) 2571static int __init debugfs_kprobe_init(void)
2585{ 2572{
2586 struct dentry *dir, *file; 2573 struct dentry *dir;
2587 unsigned int value = 1; 2574 unsigned int value = 1;
2588 2575
2589 dir = debugfs_create_dir("kprobes", NULL); 2576 dir = debugfs_create_dir("kprobes", NULL);
2590 if (!dir)
2591 return -ENOMEM;
2592 2577
2593 file = debugfs_create_file("list", 0400, dir, NULL, 2578 debugfs_create_file("list", 0400, dir, NULL,
2594 &debugfs_kprobes_operations); 2579 &debugfs_kprobes_operations);
2595 if (!file)
2596 goto error;
2597 2580
2598 file = debugfs_create_file("enabled", 0600, dir, 2581 debugfs_create_file("enabled", 0600, dir, &value, &fops_kp);
2599 &value, &fops_kp);
2600 if (!file)
2601 goto error;
2602 2582
2603 file = debugfs_create_file("blacklist", 0400, dir, NULL, 2583 debugfs_create_file("blacklist", 0400, dir, NULL,
2604 &debugfs_kprobe_blacklist_ops); 2584 &debugfs_kprobe_blacklist_ops);
2605 if (!file)
2606 goto error;
2607 2585
2608 return 0; 2586 return 0;
2609
2610error:
2611 debugfs_remove(dir);
2612 return -ENOMEM;
2613} 2587}
2614 2588
2615late_initcall(debugfs_kprobe_init); 2589late_initcall(debugfs_kprobe_init);
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 46ba853656f6..35859da8bd4f 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -1,11 +1,9 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * kernel/ksysfs.c - sysfs attributes in /sys/kernel, which 3 * kernel/ksysfs.c - sysfs attributes in /sys/kernel, which
3 * are not related to any other subsystem 4 * are not related to any other subsystem
4 * 5 *
5 * Copyright (C) 2004 Kay Sievers <kay.sievers@vrfy.org> 6 * Copyright (C) 2004 Kay Sievers <kay.sievers@vrfy.org>
6 *
7 * This file is release under the GPLv2
8 *
9 */ 7 */
10 8
11#include <linux/kobject.h> 9#include <linux/kobject.h>
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 5942eeafb9ac..621467c33fef 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* Kernel thread helper functions. 2/* Kernel thread helper functions.
2 * Copyright (C) 2004 IBM Corporation, Rusty Russell. 3 * Copyright (C) 2004 IBM Corporation, Rusty Russell.
3 * 4 *
@@ -11,6 +12,7 @@
11#include <linux/kthread.h> 12#include <linux/kthread.h>
12#include <linux/completion.h> 13#include <linux/completion.h>
13#include <linux/err.h> 14#include <linux/err.h>
15#include <linux/cgroup.h>
14#include <linux/cpuset.h> 16#include <linux/cpuset.h>
15#include <linux/unistd.h> 17#include <linux/unistd.h>
16#include <linux/file.h> 18#include <linux/file.h>
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 99a5b5f46dc5..e3acead004e6 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -1,13 +1,9 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * latencytop.c: Latency display infrastructure 3 * latencytop.c: Latency display infrastructure
3 * 4 *
4 * (C) Copyright 2008 Intel Corporation 5 * (C) Copyright 2008 Intel Corporation
5 * Author: Arjan van de Ven <arjan@linux.intel.com> 6 * Author: Arjan van de Ven <arjan@linux.intel.com>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; version 2
10 * of the License.
11 */ 7 */
12 8
13/* 9/*
@@ -67,13 +63,10 @@ static struct latency_record latency_record[MAXLR];
67 63
68int latencytop_enabled; 64int latencytop_enabled;
69 65
70void clear_all_latency_tracing(struct task_struct *p) 66void clear_tsk_latency_tracing(struct task_struct *p)
71{ 67{
72 unsigned long flags; 68 unsigned long flags;
73 69
74 if (!latencytop_enabled)
75 return;
76
77 raw_spin_lock_irqsave(&latency_lock, flags); 70 raw_spin_lock_irqsave(&latency_lock, flags);
78 memset(&p->latency_record, 0, sizeof(p->latency_record)); 71 memset(&p->latency_record, 0, sizeof(p->latency_record));
79 p->latency_record_count = 0; 72 p->latency_record_count = 0;
@@ -96,9 +89,6 @@ account_global_scheduler_latency(struct task_struct *tsk,
96 int firstnonnull = MAXLR + 1; 89 int firstnonnull = MAXLR + 1;
97 int i; 90 int i;
98 91
99 if (!latencytop_enabled)
100 return;
101
102 /* skip kernel threads for now */ 92 /* skip kernel threads for now */
103 if (!tsk->mm) 93 if (!tsk->mm)
104 return; 94 return;
diff --git a/kernel/livepatch/Kconfig b/kernel/livepatch/Kconfig
index ec4565122e65..54102deb50ba 100644
--- a/kernel/livepatch/Kconfig
+++ b/kernel/livepatch/Kconfig
@@ -1,3 +1,4 @@
1# SPDX-License-Identifier: GPL-2.0-only
1config HAVE_LIVEPATCH 2config HAVE_LIVEPATCH
2 bool 3 bool
3 help 4 help
diff --git a/kernel/livepatch/Makefile b/kernel/livepatch/Makefile
index b36ceda6488e..cf9b5bcdb952 100644
--- a/kernel/livepatch/Makefile
+++ b/kernel/livepatch/Makefile
@@ -1,3 +1,4 @@
1# SPDX-License-Identifier: GPL-2.0-only
1obj-$(CONFIG_LIVEPATCH) += livepatch.o 2obj-$(CONFIG_LIVEPATCH) += livepatch.o
2 3
3livepatch-objs := core.o patch.o shadow.o transition.o 4livepatch-objs := core.o patch.o shadow.o transition.o
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index f6fbaff10e71..c4ce08f43bd6 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -1,21 +1,9 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
1/* 2/*
2 * core.c - Kernel Live Patching Core 3 * core.c - Kernel Live Patching Core
3 * 4 *
4 * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com> 5 * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com>
5 * Copyright (C) 2014 SUSE 6 * Copyright (C) 2014 SUSE
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2
10 * of the License, or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, see <http://www.gnu.org/licenses/>.
19 */ 7 */
20 8
21#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 9#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -30,6 +18,7 @@
30#include <linux/elf.h> 18#include <linux/elf.h>
31#include <linux/moduleloader.h> 19#include <linux/moduleloader.h>
32#include <linux/completion.h> 20#include <linux/completion.h>
21#include <linux/memory.h>
33#include <asm/cacheflush.h> 22#include <asm/cacheflush.h>
34#include "core.h" 23#include "core.h"
35#include "patch.h" 24#include "patch.h"
@@ -730,16 +719,21 @@ static int klp_init_object_loaded(struct klp_patch *patch,
730 struct klp_func *func; 719 struct klp_func *func;
731 int ret; 720 int ret;
732 721
722 mutex_lock(&text_mutex);
723
733 module_disable_ro(patch->mod); 724 module_disable_ro(patch->mod);
734 ret = klp_write_object_relocations(patch->mod, obj); 725 ret = klp_write_object_relocations(patch->mod, obj);
735 if (ret) { 726 if (ret) {
736 module_enable_ro(patch->mod, true); 727 module_enable_ro(patch->mod, true);
728 mutex_unlock(&text_mutex);
737 return ret; 729 return ret;
738 } 730 }
739 731
740 arch_klp_init_object_loaded(patch, obj); 732 arch_klp_init_object_loaded(patch, obj);
741 module_enable_ro(patch->mod, true); 733 module_enable_ro(patch->mod, true);
742 734
735 mutex_unlock(&text_mutex);
736
743 klp_for_each_func(obj, func) { 737 klp_for_each_func(obj, func) {
744 ret = klp_find_object_symbol(obj->name, func->old_name, 738 ret = klp_find_object_symbol(obj->name, func->old_name,
745 func->old_sympos, 739 func->old_sympos,
@@ -1208,14 +1202,6 @@ void klp_module_going(struct module *mod)
1208 1202
1209static int __init klp_init(void) 1203static int __init klp_init(void)
1210{ 1204{
1211 int ret;
1212
1213 ret = klp_check_compiler_support();
1214 if (ret) {
1215 pr_info("Your compiler is too old; turning off.\n");
1216 return -EINVAL;
1217 }
1218
1219 klp_root_kobj = kobject_create_and_add("livepatch", kernel_kobj); 1205 klp_root_kobj = kobject_create_and_add("livepatch", kernel_kobj);
1220 if (!klp_root_kobj) 1206 if (!klp_root_kobj)
1221 return -ENOMEM; 1207 return -ENOMEM;
diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c
index 99cb3ad05eb4..bd43537702bd 100644
--- a/kernel/livepatch/patch.c
+++ b/kernel/livepatch/patch.c
@@ -1,22 +1,10 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
1/* 2/*
2 * patch.c - livepatch patching functions 3 * patch.c - livepatch patching functions
3 * 4 *
4 * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com> 5 * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com>
5 * Copyright (C) 2014 SUSE 6 * Copyright (C) 2014 SUSE
6 * Copyright (C) 2015 Josh Poimboeuf <jpoimboe@redhat.com> 7 * Copyright (C) 2015 Josh Poimboeuf <jpoimboe@redhat.com>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version 2
11 * of the License, or (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, see <http://www.gnu.org/licenses/>.
20 */ 8 */
21 9
22#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
diff --git a/kernel/livepatch/shadow.c b/kernel/livepatch/shadow.c
index 83958c814439..e5c9fb295ba9 100644
--- a/kernel/livepatch/shadow.c
+++ b/kernel/livepatch/shadow.c
@@ -1,22 +1,10 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
1/* 2/*
2 * shadow.c - Shadow Variables 3 * shadow.c - Shadow Variables
3 * 4 *
4 * Copyright (C) 2014 Josh Poimboeuf <jpoimboe@redhat.com> 5 * Copyright (C) 2014 Josh Poimboeuf <jpoimboe@redhat.com>
5 * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com> 6 * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com>
6 * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com> 7 * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version 2
11 * of the License, or (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, see <http://www.gnu.org/licenses/>.
20 */ 8 */
21 9
22/** 10/**
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
index c53370d596be..cdf318d86dd6 100644
--- a/kernel/livepatch/transition.c
+++ b/kernel/livepatch/transition.c
@@ -1,20 +1,8 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
1/* 2/*
2 * transition.c - Kernel Live Patching transition functions 3 * transition.c - Kernel Live Patching transition functions
3 * 4 *
4 * Copyright (C) 2015-2016 Josh Poimboeuf <jpoimboe@redhat.com> 5 * Copyright (C) 2015-2016 Josh Poimboeuf <jpoimboe@redhat.com>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version 2
9 * of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, see <http://www.gnu.org/licenses/>.
18 */ 6 */
19 7
20#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 8#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -259,7 +247,6 @@ static int klp_check_stack(struct task_struct *task, char *err_buf)
259 int ret, nr_entries; 247 int ret, nr_entries;
260 248
261 ret = stack_trace_save_tsk_reliable(task, entries, ARRAY_SIZE(entries)); 249 ret = stack_trace_save_tsk_reliable(task, entries, ARRAY_SIZE(entries));
262 WARN_ON_ONCE(ret == -ENOSYS);
263 if (ret < 0) { 250 if (ret < 0) {
264 snprintf(err_buf, STACK_ERR_BUF_SIZE, 251 snprintf(err_buf, STACK_ERR_BUF_SIZE,
265 "%s: %s:%d has an unreliable stack\n", 252 "%s: %s:%d has an unreliable stack\n",
@@ -293,11 +280,11 @@ static int klp_check_stack(struct task_struct *task, char *err_buf)
293 */ 280 */
294static bool klp_try_switch_task(struct task_struct *task) 281static bool klp_try_switch_task(struct task_struct *task)
295{ 282{
283 static char err_buf[STACK_ERR_BUF_SIZE];
296 struct rq *rq; 284 struct rq *rq;
297 struct rq_flags flags; 285 struct rq_flags flags;
298 int ret; 286 int ret;
299 bool success = false; 287 bool success = false;
300 char err_buf[STACK_ERR_BUF_SIZE];
301 288
302 err_buf[0] = '\0'; 289 err_buf[0] = '\0';
303 290
@@ -306,6 +293,13 @@ static bool klp_try_switch_task(struct task_struct *task)
306 return true; 293 return true;
307 294
308 /* 295 /*
296 * For arches which don't have reliable stack traces, we have to rely
297 * on other methods (e.g., switching tasks at kernel exit).
298 */
299 if (!klp_have_reliable_stack())
300 return false;
301
302 /*
309 * Now try to check the stack for any to-be-patched or to-be-unpatched 303 * Now try to check the stack for any to-be-patched or to-be-unpatched
310 * functions. If all goes well, switch the task to the target patch 304 * functions. If all goes well, switch the task to the target patch
311 * state. 305 * state.
@@ -340,7 +334,6 @@ done:
340 pr_debug("%s", err_buf); 334 pr_debug("%s", err_buf);
341 335
342 return success; 336 return success;
343
344} 337}
345 338
346/* 339/*
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index 6fe2f333aecb..45452facff3b 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -3,7 +3,7 @@
3# and is generally not a function of system call inputs. 3# and is generally not a function of system call inputs.
4KCOV_INSTRUMENT := n 4KCOV_INSTRUMENT := n
5 5
6obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o rwsem-xadd.o 6obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o
7 7
8ifdef CONFIG_FUNCTION_TRACER 8ifdef CONFIG_FUNCTION_TRACER
9CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE) 9CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
diff --git a/kernel/locking/lock_events.h b/kernel/locking/lock_events.h
index feb1acc54611..8c7e7d25f09c 100644
--- a/kernel/locking/lock_events.h
+++ b/kernel/locking/lock_events.h
@@ -31,12 +31,13 @@ enum lock_events {
31DECLARE_PER_CPU(unsigned long, lockevents[lockevent_num]); 31DECLARE_PER_CPU(unsigned long, lockevents[lockevent_num]);
32 32
33/* 33/*
34 * Increment the PV qspinlock statistical counters 34 * Increment the statistical counters. Use raw_cpu_inc() because of lower
35 * overhead and we don't care if we lose the occasional update.
35 */ 36 */
36static inline void __lockevent_inc(enum lock_events event, bool cond) 37static inline void __lockevent_inc(enum lock_events event, bool cond)
37{ 38{
38 if (cond) 39 if (cond)
39 __this_cpu_inc(lockevents[event]); 40 raw_cpu_inc(lockevents[event]);
40} 41}
41 42
42#define lockevent_inc(ev) __lockevent_inc(LOCKEVENT_ ##ev, true) 43#define lockevent_inc(ev) __lockevent_inc(LOCKEVENT_ ##ev, true)
@@ -44,7 +45,7 @@ static inline void __lockevent_inc(enum lock_events event, bool cond)
44 45
45static inline void __lockevent_add(enum lock_events event, int inc) 46static inline void __lockevent_add(enum lock_events event, int inc)
46{ 47{
47 __this_cpu_add(lockevents[event], inc); 48 raw_cpu_add(lockevents[event], inc);
48} 49}
49 50
50#define lockevent_add(ev, c) __lockevent_add(LOCKEVENT_ ##ev, c) 51#define lockevent_add(ev, c) __lockevent_add(LOCKEVENT_ ##ev, c)
diff --git a/kernel/locking/lock_events_list.h b/kernel/locking/lock_events_list.h
index ad7668cfc9da..239039d0ce21 100644
--- a/kernel/locking/lock_events_list.h
+++ b/kernel/locking/lock_events_list.h
@@ -56,12 +56,16 @@ LOCK_EVENT(rwsem_sleep_reader) /* # of reader sleeps */
56LOCK_EVENT(rwsem_sleep_writer) /* # of writer sleeps */ 56LOCK_EVENT(rwsem_sleep_writer) /* # of writer sleeps */
57LOCK_EVENT(rwsem_wake_reader) /* # of reader wakeups */ 57LOCK_EVENT(rwsem_wake_reader) /* # of reader wakeups */
58LOCK_EVENT(rwsem_wake_writer) /* # of writer wakeups */ 58LOCK_EVENT(rwsem_wake_writer) /* # of writer wakeups */
59LOCK_EVENT(rwsem_opt_wlock) /* # of write locks opt-spin acquired */ 59LOCK_EVENT(rwsem_opt_rlock) /* # of opt-acquired read locks */
60LOCK_EVENT(rwsem_opt_fail) /* # of failed opt-spinnings */ 60LOCK_EVENT(rwsem_opt_wlock) /* # of opt-acquired write locks */
61LOCK_EVENT(rwsem_opt_fail) /* # of failed optspins */
62LOCK_EVENT(rwsem_opt_nospin) /* # of disabled optspins */
63LOCK_EVENT(rwsem_opt_norspin) /* # of disabled reader-only optspins */
64LOCK_EVENT(rwsem_opt_rlock2) /* # of opt-acquired 2ndary read locks */
61LOCK_EVENT(rwsem_rlock) /* # of read locks acquired */ 65LOCK_EVENT(rwsem_rlock) /* # of read locks acquired */
62LOCK_EVENT(rwsem_rlock_fast) /* # of fast read locks acquired */ 66LOCK_EVENT(rwsem_rlock_fast) /* # of fast read locks acquired */
63LOCK_EVENT(rwsem_rlock_fail) /* # of failed read lock acquisitions */ 67LOCK_EVENT(rwsem_rlock_fail) /* # of failed read lock acquisitions */
64LOCK_EVENT(rwsem_rtrylock) /* # of read trylock calls */ 68LOCK_EVENT(rwsem_rlock_handoff) /* # of read lock handoffs */
65LOCK_EVENT(rwsem_wlock) /* # of write locks acquired */ 69LOCK_EVENT(rwsem_wlock) /* # of write locks acquired */
66LOCK_EVENT(rwsem_wlock_fail) /* # of failed write lock acquisitions */ 70LOCK_EVENT(rwsem_wlock_fail) /* # of failed write lock acquisitions */
67LOCK_EVENT(rwsem_wtrylock) /* # of write trylock calls */ 71LOCK_EVENT(rwsem_wlock_handoff) /* # of write lock handoffs */
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index d06190fa5082..341f52117f88 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * kernel/lockdep.c 3 * kernel/lockdep.c
3 * 4 *
@@ -150,17 +151,28 @@ unsigned long nr_lock_classes;
150static 151static
151#endif 152#endif
152struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; 153struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
154static DECLARE_BITMAP(lock_classes_in_use, MAX_LOCKDEP_KEYS);
153 155
154static inline struct lock_class *hlock_class(struct held_lock *hlock) 156static inline struct lock_class *hlock_class(struct held_lock *hlock)
155{ 157{
156 if (!hlock->class_idx) { 158 unsigned int class_idx = hlock->class_idx;
159
160 /* Don't re-read hlock->class_idx, can't use READ_ONCE() on bitfield */
161 barrier();
162
163 if (!test_bit(class_idx, lock_classes_in_use)) {
157 /* 164 /*
158 * Someone passed in garbage, we give up. 165 * Someone passed in garbage, we give up.
159 */ 166 */
160 DEBUG_LOCKS_WARN_ON(1); 167 DEBUG_LOCKS_WARN_ON(1);
161 return NULL; 168 return NULL;
162 } 169 }
163 return lock_classes + hlock->class_idx - 1; 170
171 /*
172 * At this point, if the passed hlock->class_idx is still garbage,
173 * we just have to live with it
174 */
175 return lock_classes + class_idx;
164} 176}
165 177
166#ifdef CONFIG_LOCK_STAT 178#ifdef CONFIG_LOCK_STAT
@@ -358,6 +370,13 @@ static inline u64 iterate_chain_key(u64 key, u32 idx)
358 return k0 | (u64)k1 << 32; 370 return k0 | (u64)k1 << 32;
359} 371}
360 372
373void lockdep_init_task(struct task_struct *task)
374{
375 task->lockdep_depth = 0; /* no locks held yet */
376 task->curr_chain_key = INITIAL_CHAIN_KEY;
377 task->lockdep_recursion = 0;
378}
379
361void lockdep_off(void) 380void lockdep_off(void)
362{ 381{
363 current->lockdep_recursion++; 382 current->lockdep_recursion++;
@@ -418,13 +437,6 @@ static int verbose(struct lock_class *class)
418 return 0; 437 return 0;
419} 438}
420 439
421/*
422 * Stack-trace: tightly packed array of stack backtrace
423 * addresses. Protected by the graph_lock.
424 */
425unsigned long nr_stack_trace_entries;
426static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES];
427
428static void print_lockdep_off(const char *bug_msg) 440static void print_lockdep_off(const char *bug_msg)
429{ 441{
430 printk(KERN_DEBUG "%s\n", bug_msg); 442 printk(KERN_DEBUG "%s\n", bug_msg);
@@ -434,6 +446,15 @@ static void print_lockdep_off(const char *bug_msg)
434#endif 446#endif
435} 447}
436 448
449unsigned long nr_stack_trace_entries;
450
451#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
452/*
453 * Stack-trace: tightly packed array of stack backtrace
454 * addresses. Protected by the graph_lock.
455 */
456static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES];
457
437static int save_trace(struct lock_trace *trace) 458static int save_trace(struct lock_trace *trace)
438{ 459{
439 unsigned long *entries = stack_trace + nr_stack_trace_entries; 460 unsigned long *entries = stack_trace + nr_stack_trace_entries;
@@ -456,6 +477,7 @@ static int save_trace(struct lock_trace *trace)
456 477
457 return 1; 478 return 1;
458} 479}
480#endif
459 481
460unsigned int nr_hardirq_chains; 482unsigned int nr_hardirq_chains;
461unsigned int nr_softirq_chains; 483unsigned int nr_softirq_chains;
@@ -469,6 +491,7 @@ unsigned int max_lockdep_depth;
469DEFINE_PER_CPU(struct lockdep_stats, lockdep_stats); 491DEFINE_PER_CPU(struct lockdep_stats, lockdep_stats);
470#endif 492#endif
471 493
494#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
472/* 495/*
473 * Locking printouts: 496 * Locking printouts:
474 */ 497 */
@@ -486,6 +509,7 @@ static const char *usage_str[] =
486#undef LOCKDEP_STATE 509#undef LOCKDEP_STATE
487 [LOCK_USED] = "INITIAL USE", 510 [LOCK_USED] = "INITIAL USE",
488}; 511};
512#endif
489 513
490const char * __get_key_name(struct lockdep_subclass_key *key, char *str) 514const char * __get_key_name(struct lockdep_subclass_key *key, char *str)
491{ 515{
@@ -499,15 +523,26 @@ static inline unsigned long lock_flag(enum lock_usage_bit bit)
499 523
500static char get_usage_char(struct lock_class *class, enum lock_usage_bit bit) 524static char get_usage_char(struct lock_class *class, enum lock_usage_bit bit)
501{ 525{
526 /*
527 * The usage character defaults to '.' (i.e., irqs disabled and not in
528 * irq context), which is the safest usage category.
529 */
502 char c = '.'; 530 char c = '.';
503 531
504 if (class->usage_mask & lock_flag(bit + LOCK_USAGE_DIR_MASK)) 532 /*
533 * The order of the following usage checks matters, which will
534 * result in the outcome character as follows:
535 *
536 * - '+': irq is enabled and not in irq context
537 * - '-': in irq context and irq is disabled
538 * - '?': in irq context and irq is enabled
539 */
540 if (class->usage_mask & lock_flag(bit + LOCK_USAGE_DIR_MASK)) {
505 c = '+'; 541 c = '+';
506 if (class->usage_mask & lock_flag(bit)) { 542 if (class->usage_mask & lock_flag(bit))
507 c = '-';
508 if (class->usage_mask & lock_flag(bit + LOCK_USAGE_DIR_MASK))
509 c = '?'; 543 c = '?';
510 } 544 } else if (class->usage_mask & lock_flag(bit))
545 c = '-';
511 546
512 return c; 547 return c;
513} 548}
@@ -571,19 +606,22 @@ static void print_lock(struct held_lock *hlock)
571 /* 606 /*
572 * We can be called locklessly through debug_show_all_locks() so be 607 * We can be called locklessly through debug_show_all_locks() so be
573 * extra careful, the hlock might have been released and cleared. 608 * extra careful, the hlock might have been released and cleared.
609 *
610 * If this indeed happens, lets pretend it does not hurt to continue
611 * to print the lock unless the hlock class_idx does not point to a
612 * registered class. The rationale here is: since we don't attempt
613 * to distinguish whether we are in this situation, if it just
614 * happened we can't count on class_idx to tell either.
574 */ 615 */
575 unsigned int class_idx = hlock->class_idx; 616 struct lock_class *lock = hlock_class(hlock);
576
577 /* Don't re-read hlock->class_idx, can't use READ_ONCE() on bitfields: */
578 barrier();
579 617
580 if (!class_idx || (class_idx - 1) >= MAX_LOCKDEP_KEYS) { 618 if (!lock) {
581 printk(KERN_CONT "<RELEASED>\n"); 619 printk(KERN_CONT "<RELEASED>\n");
582 return; 620 return;
583 } 621 }
584 622
585 printk(KERN_CONT "%p", hlock->instance); 623 printk(KERN_CONT "%p", hlock->instance);
586 print_lock_name(lock_classes + class_idx - 1); 624 print_lock_name(lock);
587 printk(KERN_CONT ", at: %pS\n", (void *)hlock->acquire_ip); 625 printk(KERN_CONT ", at: %pS\n", (void *)hlock->acquire_ip);
588} 626}
589 627
@@ -731,7 +769,8 @@ look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass)
731 * Huh! same key, different name? Did someone trample 769 * Huh! same key, different name? Did someone trample
732 * on some memory? We're most confused. 770 * on some memory? We're most confused.
733 */ 771 */
734 WARN_ON_ONCE(class->name != lock->name); 772 WARN_ON_ONCE(class->name != lock->name &&
773 lock->key != &__lockdep_no_validate__);
735 return class; 774 return class;
736 } 775 }
737 } 776 }
@@ -837,11 +876,11 @@ static u16 chain_hlocks[MAX_LOCKDEP_CHAIN_HLOCKS];
837static bool check_lock_chain_key(struct lock_chain *chain) 876static bool check_lock_chain_key(struct lock_chain *chain)
838{ 877{
839#ifdef CONFIG_PROVE_LOCKING 878#ifdef CONFIG_PROVE_LOCKING
840 u64 chain_key = 0; 879 u64 chain_key = INITIAL_CHAIN_KEY;
841 int i; 880 int i;
842 881
843 for (i = chain->base; i < chain->base + chain->depth; i++) 882 for (i = chain->base; i < chain->base + chain->depth; i++)
844 chain_key = iterate_chain_key(chain_key, chain_hlocks[i] + 1); 883 chain_key = iterate_chain_key(chain_key, chain_hlocks[i]);
845 /* 884 /*
846 * The 'unsigned long long' casts avoid that a compiler warning 885 * The 'unsigned long long' casts avoid that a compiler warning
847 * is reported when building tools/lib/lockdep. 886 * is reported when building tools/lib/lockdep.
@@ -1116,6 +1155,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
1116 return NULL; 1155 return NULL;
1117 } 1156 }
1118 nr_lock_classes++; 1157 nr_lock_classes++;
1158 __set_bit(class - lock_classes, lock_classes_in_use);
1119 debug_atomic_inc(nr_unused_locks); 1159 debug_atomic_inc(nr_unused_locks);
1120 class->key = key; 1160 class->key = key;
1121 class->name = lock->name; 1161 class->name = lock->name;
@@ -1227,13 +1267,17 @@ static int add_lock_to_list(struct lock_class *this,
1227#define CQ_MASK (MAX_CIRCULAR_QUEUE_SIZE-1) 1267#define CQ_MASK (MAX_CIRCULAR_QUEUE_SIZE-1)
1228 1268
1229/* 1269/*
1230 * The circular_queue and helpers is used to implement the 1270 * The circular_queue and helpers are used to implement a graph
1231 * breadth-first search(BFS)algorithem, by which we can build 1271 * breadth-first search (BFS) algorithm, by which we can determine
1232 * the shortest path from the next lock to be acquired to the 1272 * whether there is a path from a lock to another. In deadlock checks,
1233 * previous held lock if there is a circular between them. 1273 * a path from the next lock to be acquired to a previous held lock
1274 * indicates that adding the <prev> -> <next> lock dependency will
1275 * produce a circle in the graph. Breadth-first search instead of
1276 * depth-first search is used in order to find the shortest (circular)
1277 * path.
1234 */ 1278 */
1235struct circular_queue { 1279struct circular_queue {
1236 unsigned long element[MAX_CIRCULAR_QUEUE_SIZE]; 1280 struct lock_list *element[MAX_CIRCULAR_QUEUE_SIZE];
1237 unsigned int front, rear; 1281 unsigned int front, rear;
1238}; 1282};
1239 1283
@@ -1259,7 +1303,7 @@ static inline int __cq_full(struct circular_queue *cq)
1259 return ((cq->rear + 1) & CQ_MASK) == cq->front; 1303 return ((cq->rear + 1) & CQ_MASK) == cq->front;
1260} 1304}
1261 1305
1262static inline int __cq_enqueue(struct circular_queue *cq, unsigned long elem) 1306static inline int __cq_enqueue(struct circular_queue *cq, struct lock_list *elem)
1263{ 1307{
1264 if (__cq_full(cq)) 1308 if (__cq_full(cq))
1265 return -1; 1309 return -1;
@@ -1269,14 +1313,21 @@ static inline int __cq_enqueue(struct circular_queue *cq, unsigned long elem)
1269 return 0; 1313 return 0;
1270} 1314}
1271 1315
1272static inline int __cq_dequeue(struct circular_queue *cq, unsigned long *elem) 1316/*
1317 * Dequeue an element from the circular_queue, return a lock_list if
1318 * the queue is not empty, or NULL if otherwise.
1319 */
1320static inline struct lock_list * __cq_dequeue(struct circular_queue *cq)
1273{ 1321{
1322 struct lock_list * lock;
1323
1274 if (__cq_empty(cq)) 1324 if (__cq_empty(cq))
1275 return -1; 1325 return NULL;
1276 1326
1277 *elem = cq->element[cq->front]; 1327 lock = cq->element[cq->front];
1278 cq->front = (cq->front + 1) & CQ_MASK; 1328 cq->front = (cq->front + 1) & CQ_MASK;
1279 return 0; 1329
1330 return lock;
1280} 1331}
1281 1332
1282static inline unsigned int __cq_get_elem_count(struct circular_queue *cq) 1333static inline unsigned int __cq_get_elem_count(struct circular_queue *cq)
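For readers new to this structure, the sketch below shows the power-of-two ring buffer behind struct circular_queue in stand-alone user-space C: the capacity is a power of two, so wrap-around is a single AND with the mask, and dequeue now returns a pointer (NULL when empty) instead of an error code. Names and the payload type are illustrative, not the kernel API.

/*
 * Minimal ring-buffer sketch in the spirit of struct circular_queue and
 * the reworked __cq_dequeue() above.
 */
#include <stdio.h>
#include <stddef.h>

#define CQ_SIZE 8			/* must be a power of two */
#define CQ_MASK (CQ_SIZE - 1)

struct node { int id; };

struct cq {
	struct node *elem[CQ_SIZE];
	unsigned int front, rear;
};

static int cq_empty(struct cq *q) { return q->front == q->rear; }
static int cq_full(struct cq *q)  { return ((q->rear + 1) & CQ_MASK) == q->front; }

static int cq_enqueue(struct cq *q, struct node *n)
{
	if (cq_full(q))
		return -1;
	q->elem[q->rear] = n;
	q->rear = (q->rear + 1) & CQ_MASK;
	return 0;
}

static struct node *cq_dequeue(struct cq *q)
{
	struct node *n;

	if (cq_empty(q))
		return NULL;		/* empty, like the new __cq_dequeue() */
	n = q->elem[q->front];
	q->front = (q->front + 1) & CQ_MASK;
	return n;
}

int main(void)
{
	struct cq q = { .front = 0, .rear = 0 };
	struct node a = { 1 }, b = { 2 };
	struct node *n;

	cq_enqueue(&q, &a);
	cq_enqueue(&q, &b);
	while ((n = cq_dequeue(&q)))
		printf("dequeued %d\n", n->id);
	return 0;
}

Keeping one slot unused (the cq_full() test) is what lets front == rear unambiguously mean "empty" in this scheme.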
@@ -1321,13 +1372,32 @@ static inline int get_lock_depth(struct lock_list *child)
1321 return depth; 1372 return depth;
1322} 1373}
1323 1374
1375/*
1376 * Return the forward or backward dependency list.
1377 *
1378 * @lock: the lock_list to get its class's dependency list
1379 * @offset: the offset to struct lock_class to determine whether it is
1380 * locks_after or locks_before
1381 */
1382static inline struct list_head *get_dep_list(struct lock_list *lock, int offset)
1383{
1384 void *lock_class = lock->class;
1385
1386 return lock_class + offset;
1387}
1388
1389/*
1390 * Forward- or backward-dependency search, used for both circular dependency
1391 * checking and hardirq-unsafe/softirq-unsafe checking.
1392 */
1324static int __bfs(struct lock_list *source_entry, 1393static int __bfs(struct lock_list *source_entry,
1325 void *data, 1394 void *data,
1326 int (*match)(struct lock_list *entry, void *data), 1395 int (*match)(struct lock_list *entry, void *data),
1327 struct lock_list **target_entry, 1396 struct lock_list **target_entry,
1328 int forward) 1397 int offset)
1329{ 1398{
1330 struct lock_list *entry; 1399 struct lock_list *entry;
1400 struct lock_list *lock;
1331 struct list_head *head; 1401 struct list_head *head;
1332 struct circular_queue *cq = &lock_cq; 1402 struct circular_queue *cq = &lock_cq;
1333 int ret = 1; 1403 int ret = 1;
@@ -1338,31 +1408,21 @@ static int __bfs(struct lock_list *source_entry,
1338 goto exit; 1408 goto exit;
1339 } 1409 }
1340 1410
1341 if (forward) 1411 head = get_dep_list(source_entry, offset);
1342 head = &source_entry->class->locks_after;
1343 else
1344 head = &source_entry->class->locks_before;
1345
1346 if (list_empty(head)) 1412 if (list_empty(head))
1347 goto exit; 1413 goto exit;
1348 1414
1349 __cq_init(cq); 1415 __cq_init(cq);
1350 __cq_enqueue(cq, (unsigned long)source_entry); 1416 __cq_enqueue(cq, source_entry);
1351 1417
1352 while (!__cq_empty(cq)) { 1418 while ((lock = __cq_dequeue(cq))) {
1353 struct lock_list *lock;
1354
1355 __cq_dequeue(cq, (unsigned long *)&lock);
1356 1419
1357 if (!lock->class) { 1420 if (!lock->class) {
1358 ret = -2; 1421 ret = -2;
1359 goto exit; 1422 goto exit;
1360 } 1423 }
1361 1424
1362 if (forward) 1425 head = get_dep_list(lock, offset);
1363 head = &lock->class->locks_after;
1364 else
1365 head = &lock->class->locks_before;
1366 1426
1367 DEBUG_LOCKS_WARN_ON(!irqs_disabled()); 1427 DEBUG_LOCKS_WARN_ON(!irqs_disabled());
1368 1428
@@ -1376,7 +1436,7 @@ static int __bfs(struct lock_list *source_entry,
1376 goto exit; 1436 goto exit;
1377 } 1437 }
1378 1438
1379 if (__cq_enqueue(cq, (unsigned long)entry)) { 1439 if (__cq_enqueue(cq, entry)) {
1380 ret = -1; 1440 ret = -1;
1381 goto exit; 1441 goto exit;
1382 } 1442 }
@@ -1395,7 +1455,8 @@ static inline int __bfs_forwards(struct lock_list *src_entry,
1395 int (*match)(struct lock_list *entry, void *data), 1455 int (*match)(struct lock_list *entry, void *data),
1396 struct lock_list **target_entry) 1456 struct lock_list **target_entry)
1397{ 1457{
1398 return __bfs(src_entry, data, match, target_entry, 1); 1458 return __bfs(src_entry, data, match, target_entry,
1459 offsetof(struct lock_class, locks_after));
1399 1460
1400} 1461}
1401 1462
@@ -1404,16 +1465,11 @@ static inline int __bfs_backwards(struct lock_list *src_entry,
1404 int (*match)(struct lock_list *entry, void *data), 1465 int (*match)(struct lock_list *entry, void *data),
1405 struct lock_list **target_entry) 1466 struct lock_list **target_entry)
1406{ 1467{
1407 return __bfs(src_entry, data, match, target_entry, 0); 1468 return __bfs(src_entry, data, match, target_entry,
1469 offsetof(struct lock_class, locks_before));
1408 1470
1409} 1471}
1410 1472
1411/*
1412 * Recursive, forwards-direction lock-dependency checking, used for
1413 * both noncyclic checking and for hardirq-unsafe/softirq-unsafe
1414 * checking.
1415 */
1416
1417static void print_lock_trace(struct lock_trace *trace, unsigned int spaces) 1473static void print_lock_trace(struct lock_trace *trace, unsigned int spaces)
1418{ 1474{
1419 unsigned long *entries = stack_trace + trace->offset; 1475 unsigned long *entries = stack_trace + trace->offset;
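The new get_dep_list() above replaces the forward/backward branch in __bfs() with a byte offset into struct lock_class. Below is a minimal sketch of that offsetof() technique with simplified stand-in types; the kernel helper does the same arithmetic on a void * (a GCC extension), while the sketch casts through char * to stay within standard C.

/*
 * Select one of two embedded list heads by member offset instead of a
 * direction flag, as get_dep_list() does for locks_after/locks_before.
 */
#include <stdio.h>
#include <stddef.h>

struct list_head { struct list_head *next, *prev; };

struct klass {				/* simplified stand-in for struct lock_class */
	const char *name;
	struct list_head locks_after;
	struct list_head locks_before;
};

static struct list_head *get_dep_list(struct klass *k, size_t offset)
{
	return (struct list_head *)((char *)k + offset);
}

int main(void)
{
	struct klass k = { .name = "demo" };

	k.locks_after.next  = k.locks_after.prev  = &k.locks_after;
	k.locks_before.next = k.locks_before.prev = &k.locks_before;

	/* the same call site serves both search directions */
	printf("forwards  list: %p\n",
	       (void *)get_dep_list(&k, offsetof(struct klass, locks_after)));
	printf("backwards list: %p\n",
	       (void *)get_dep_list(&k, offsetof(struct klass, locks_before)));
	return 0;
}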
@@ -1425,16 +1481,15 @@ static void print_lock_trace(struct lock_trace *trace, unsigned int spaces)
1425 * Print a dependency chain entry (this is only done when a deadlock 1481 * Print a dependency chain entry (this is only done when a deadlock
1426 * has been detected): 1482 * has been detected):
1427 */ 1483 */
1428static noinline int 1484static noinline void
1429print_circular_bug_entry(struct lock_list *target, int depth) 1485print_circular_bug_entry(struct lock_list *target, int depth)
1430{ 1486{
1431 if (debug_locks_silent) 1487 if (debug_locks_silent)
1432 return 0; 1488 return;
1433 printk("\n-> #%u", depth); 1489 printk("\n-> #%u", depth);
1434 print_lock_name(target->class); 1490 print_lock_name(target->class);
1435 printk(KERN_CONT ":\n"); 1491 printk(KERN_CONT ":\n");
1436 print_lock_trace(&target->trace, 6); 1492 print_lock_trace(&target->trace, 6);
1437 return 0;
1438} 1493}
1439 1494
1440static void 1495static void
@@ -1491,7 +1546,7 @@ print_circular_lock_scenario(struct held_lock *src,
1491 * When a circular dependency is detected, print the 1546 * When a circular dependency is detected, print the
1492 * header first: 1547 * header first:
1493 */ 1548 */
1494static noinline int 1549static noinline void
1495print_circular_bug_header(struct lock_list *entry, unsigned int depth, 1550print_circular_bug_header(struct lock_list *entry, unsigned int depth,
1496 struct held_lock *check_src, 1551 struct held_lock *check_src,
1497 struct held_lock *check_tgt) 1552 struct held_lock *check_tgt)
@@ -1499,7 +1554,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,
1499 struct task_struct *curr = current; 1554 struct task_struct *curr = current;
1500 1555
1501 if (debug_locks_silent) 1556 if (debug_locks_silent)
1502 return 0; 1557 return;
1503 1558
1504 pr_warn("\n"); 1559 pr_warn("\n");
1505 pr_warn("======================================================\n"); 1560 pr_warn("======================================================\n");
@@ -1517,8 +1572,6 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,
1517 pr_warn("\nthe existing dependency chain (in reverse order) is:\n"); 1572 pr_warn("\nthe existing dependency chain (in reverse order) is:\n");
1518 1573
1519 print_circular_bug_entry(entry, depth); 1574 print_circular_bug_entry(entry, depth);
1520
1521 return 0;
1522} 1575}
1523 1576
1524static inline int class_equal(struct lock_list *entry, void *data) 1577static inline int class_equal(struct lock_list *entry, void *data)
@@ -1526,10 +1579,10 @@ static inline int class_equal(struct lock_list *entry, void *data)
1526 return entry->class == data; 1579 return entry->class == data;
1527} 1580}
1528 1581
1529static noinline int print_circular_bug(struct lock_list *this, 1582static noinline void print_circular_bug(struct lock_list *this,
1530 struct lock_list *target, 1583 struct lock_list *target,
1531 struct held_lock *check_src, 1584 struct held_lock *check_src,
1532 struct held_lock *check_tgt) 1585 struct held_lock *check_tgt)
1533{ 1586{
1534 struct task_struct *curr = current; 1587 struct task_struct *curr = current;
1535 struct lock_list *parent; 1588 struct lock_list *parent;
@@ -1537,10 +1590,10 @@ static noinline int print_circular_bug(struct lock_list *this,
1537 int depth; 1590 int depth;
1538 1591
1539 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 1592 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
1540 return 0; 1593 return;
1541 1594
1542 if (!save_trace(&this->trace)) 1595 if (!save_trace(&this->trace))
1543 return 0; 1596 return;
1544 1597
1545 depth = get_lock_depth(target); 1598 depth = get_lock_depth(target);
1546 1599
@@ -1562,21 +1615,17 @@ static noinline int print_circular_bug(struct lock_list *this,
1562 1615
1563 printk("\nstack backtrace:\n"); 1616 printk("\nstack backtrace:\n");
1564 dump_stack(); 1617 dump_stack();
1565
1566 return 0;
1567} 1618}
1568 1619
1569static noinline int print_bfs_bug(int ret) 1620static noinline void print_bfs_bug(int ret)
1570{ 1621{
1571 if (!debug_locks_off_graph_unlock()) 1622 if (!debug_locks_off_graph_unlock())
1572 return 0; 1623 return;
1573 1624
1574 /* 1625 /*
1575 * Breadth-first-search failed, graph got corrupted? 1626 * Breadth-first-search failed, graph got corrupted?
1576 */ 1627 */
1577 WARN(1, "lockdep bfs error:%d\n", ret); 1628 WARN(1, "lockdep bfs error:%d\n", ret);
1578
1579 return 0;
1580} 1629}
1581 1630
1582static int noop_count(struct lock_list *entry, void *data) 1631static int noop_count(struct lock_list *entry, void *data)
@@ -1639,36 +1688,95 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class)
1639} 1688}
1640 1689
1641/* 1690/*
1642 * Prove that the dependency graph starting at <entry> can not 1691 * Check that the dependency graph starting at <src> can lead to
1643 * lead to <target>. Print an error and return 0 if it does. 1692 * <target> or not. Print an error and return 0 if it does.
1644 */ 1693 */
1645static noinline int 1694static noinline int
1646check_noncircular(struct lock_list *root, struct lock_class *target, 1695check_path(struct lock_class *target, struct lock_list *src_entry,
1647 struct lock_list **target_entry) 1696 struct lock_list **target_entry)
1648{ 1697{
1649 int result; 1698 int ret;
1699
1700 ret = __bfs_forwards(src_entry, (void *)target, class_equal,
1701 target_entry);
1702
1703 if (unlikely(ret < 0))
1704 print_bfs_bug(ret);
1705
1706 return ret;
1707}
1708
1709/*
1710 * Prove that the dependency graph starting at <src> can not
1711 * lead to <target>. If it can, there is a circle when adding
1712 * <target> -> <src> dependency.
1713 *
1714 * Print an error and return 0 if it does.
1715 */
1716static noinline int
1717check_noncircular(struct held_lock *src, struct held_lock *target,
1718 struct lock_trace *trace)
1719{
1720 int ret;
1721 struct lock_list *uninitialized_var(target_entry);
1722 struct lock_list src_entry = {
1723 .class = hlock_class(src),
1724 .parent = NULL,
1725 };
1650 1726
1651 debug_atomic_inc(nr_cyclic_checks); 1727 debug_atomic_inc(nr_cyclic_checks);
1652 1728
1653 result = __bfs_forwards(root, target, class_equal, target_entry); 1729 ret = check_path(hlock_class(target), &src_entry, &target_entry);
1654 1730
1655 return result; 1731 if (unlikely(!ret)) {
1732 if (!trace->nr_entries) {
1733 /*
1734 * If save_trace fails here, the printing might
1735 * trigger a WARN but because of the !nr_entries it
1736 * should not do bad things.
1737 */
1738 save_trace(trace);
1739 }
1740
1741 print_circular_bug(&src_entry, target_entry, src, target);
1742 }
1743
1744 return ret;
1656} 1745}
1657 1746
1747#ifdef CONFIG_LOCKDEP_SMALL
1748/*
1749 * Check that the dependency graph starting at <src> can lead to
1750 * <target> or not. If it can, <src> -> <target> dependency is already
1751 * in the graph.
1752 *
1753 * Print an error and return 2 if it does or 1 if it does not.
1754 */
1658static noinline int 1755static noinline int
1659check_redundant(struct lock_list *root, struct lock_class *target, 1756check_redundant(struct held_lock *src, struct held_lock *target)
1660 struct lock_list **target_entry)
1661{ 1757{
1662 int result; 1758 int ret;
1759 struct lock_list *uninitialized_var(target_entry);
1760 struct lock_list src_entry = {
1761 .class = hlock_class(src),
1762 .parent = NULL,
1763 };
1663 1764
1664 debug_atomic_inc(nr_redundant_checks); 1765 debug_atomic_inc(nr_redundant_checks);
1665 1766
1666 result = __bfs_forwards(root, target, class_equal, target_entry); 1767 ret = check_path(hlock_class(target), &src_entry, &target_entry);
1667 1768
1668 return result; 1769 if (!ret) {
1770 debug_atomic_inc(nr_redundant);
1771 ret = 2;
1772 } else if (ret < 0)
1773 ret = 0;
1774
1775 return ret;
1669} 1776}
1777#endif
1670 1778
1671#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) 1779#ifdef CONFIG_TRACE_IRQFLAGS
1672 1780
1673static inline int usage_accumulate(struct lock_list *entry, void *mask) 1781static inline int usage_accumulate(struct lock_list *entry, void *mask)
1674{ 1782{
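Taken together, check_path() and check_noncircular() reduce circular-deadlock detection to a reachability query: before the <prev> -> <next> dependency is recorded, search forwards from <next> and reject the edge if <prev> can already be reached. The stand-alone sketch below shows that idea on a tiny adjacency matrix; it illustrates the principle only, not the lockdep data structures or the bounded queue.

/*
 * Refuse to add an edge that would close a cycle: BFS from the new
 * lock and check whether the previously held lock is reachable.
 */
#include <stdio.h>
#include <stdbool.h>

#define NCLASS 4

static bool edge[NCLASS][NCLASS];

/* BFS from src; return true if dst is reachable */
static bool reachable(int src, int dst)
{
	int queue[NCLASS], head = 0, tail = 0;
	bool seen[NCLASS] = { false };

	queue[tail++] = src;
	seen[src] = true;

	while (head < tail) {
		int cur = queue[head++];

		if (cur == dst)
			return true;
		for (int i = 0; i < NCLASS; i++) {
			if (edge[cur][i] && !seen[i]) {
				seen[i] = true;
				queue[tail++] = i;
			}
		}
	}
	return false;
}

/* Add prev -> next unless it would create a circle */
static bool add_dependency(int prev, int next)
{
	if (reachable(next, prev)) {
		printf("refusing %d -> %d: would create a cycle\n", prev, next);
		return false;
	}
	edge[prev][next] = true;
	return true;
}

int main(void)
{
	add_dependency(0, 1);	/* A -> B */
	add_dependency(1, 2);	/* B -> C */
	add_dependency(2, 0);	/* C -> A: rejected, circular */
	return 0;
}

check_redundant() is the mirror image of the same query: a search from <prev> that already reaches <next> means the new edge adds no information.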
@@ -1765,7 +1873,7 @@ static void print_lock_class_header(struct lock_class *class, int depth)
1765 */ 1873 */
1766static void __used 1874static void __used
1767print_shortest_lock_dependencies(struct lock_list *leaf, 1875print_shortest_lock_dependencies(struct lock_list *leaf,
1768 struct lock_list *root) 1876 struct lock_list *root)
1769{ 1877{
1770 struct lock_list *entry = leaf; 1878 struct lock_list *entry = leaf;
1771 int depth; 1879 int depth;
@@ -1787,8 +1895,6 @@ print_shortest_lock_dependencies(struct lock_list *leaf,
1787 entry = get_lock_parent(entry); 1895 entry = get_lock_parent(entry);
1788 depth--; 1896 depth--;
1789 } while (entry && (depth >= 0)); 1897 } while (entry && (depth >= 0));
1790
1791 return;
1792} 1898}
1793 1899
1794static void 1900static void
@@ -1847,7 +1953,7 @@ print_irq_lock_scenario(struct lock_list *safe_entry,
1847 printk("\n *** DEADLOCK ***\n\n"); 1953 printk("\n *** DEADLOCK ***\n\n");
1848} 1954}
1849 1955
1850static int 1956static void
1851print_bad_irq_dependency(struct task_struct *curr, 1957print_bad_irq_dependency(struct task_struct *curr,
1852 struct lock_list *prev_root, 1958 struct lock_list *prev_root,
1853 struct lock_list *next_root, 1959 struct lock_list *next_root,
@@ -1860,7 +1966,7 @@ print_bad_irq_dependency(struct task_struct *curr,
1860 const char *irqclass) 1966 const char *irqclass)
1861{ 1967{
1862 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 1968 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
1863 return 0; 1969 return;
1864 1970
1865 pr_warn("\n"); 1971 pr_warn("\n");
1866 pr_warn("=====================================================\n"); 1972 pr_warn("=====================================================\n");
@@ -1906,19 +2012,17 @@ print_bad_irq_dependency(struct task_struct *curr,
1906 2012
1907 pr_warn("\nthe dependencies between %s-irq-safe lock and the holding lock:\n", irqclass); 2013 pr_warn("\nthe dependencies between %s-irq-safe lock and the holding lock:\n", irqclass);
1908 if (!save_trace(&prev_root->trace)) 2014 if (!save_trace(&prev_root->trace))
1909 return 0; 2015 return;
1910 print_shortest_lock_dependencies(backwards_entry, prev_root); 2016 print_shortest_lock_dependencies(backwards_entry, prev_root);
1911 2017
1912 pr_warn("\nthe dependencies between the lock to be acquired"); 2018 pr_warn("\nthe dependencies between the lock to be acquired");
1913 pr_warn(" and %s-irq-unsafe lock:\n", irqclass); 2019 pr_warn(" and %s-irq-unsafe lock:\n", irqclass);
1914 if (!save_trace(&next_root->trace)) 2020 if (!save_trace(&next_root->trace))
1915 return 0; 2021 return;
1916 print_shortest_lock_dependencies(forwards_entry, next_root); 2022 print_shortest_lock_dependencies(forwards_entry, next_root);
1917 2023
1918 pr_warn("\nstack backtrace:\n"); 2024 pr_warn("\nstack backtrace:\n");
1919 dump_stack(); 2025 dump_stack();
1920
1921 return 0;
1922} 2026}
1923 2027
1924static const char *state_names[] = { 2028static const char *state_names[] = {
@@ -2065,8 +2169,10 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev,
2065 this.class = hlock_class(prev); 2169 this.class = hlock_class(prev);
2066 2170
2067 ret = __bfs_backwards(&this, &usage_mask, usage_accumulate, NULL); 2171 ret = __bfs_backwards(&this, &usage_mask, usage_accumulate, NULL);
2068 if (ret < 0) 2172 if (ret < 0) {
2069 return print_bfs_bug(ret); 2173 print_bfs_bug(ret);
2174 return 0;
2175 }
2070 2176
2071 usage_mask &= LOCKF_USED_IN_IRQ_ALL; 2177 usage_mask &= LOCKF_USED_IN_IRQ_ALL;
2072 if (!usage_mask) 2178 if (!usage_mask)
@@ -2082,8 +2188,10 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev,
2082 that.class = hlock_class(next); 2188 that.class = hlock_class(next);
2083 2189
2084 ret = find_usage_forwards(&that, forward_mask, &target_entry1); 2190 ret = find_usage_forwards(&that, forward_mask, &target_entry1);
2085 if (ret < 0) 2191 if (ret < 0) {
2086 return print_bfs_bug(ret); 2192 print_bfs_bug(ret);
2193 return 0;
2194 }
2087 if (ret == 1) 2195 if (ret == 1)
2088 return ret; 2196 return ret;
2089 2197
@@ -2095,8 +2203,10 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev,
2095 backward_mask = original_mask(target_entry1->class->usage_mask); 2203 backward_mask = original_mask(target_entry1->class->usage_mask);
2096 2204
2097 ret = find_usage_backwards(&this, backward_mask, &target_entry); 2205 ret = find_usage_backwards(&this, backward_mask, &target_entry);
2098 if (ret < 0) 2206 if (ret < 0) {
2099 return print_bfs_bug(ret); 2207 print_bfs_bug(ret);
2208 return 0;
2209 }
2100 if (DEBUG_LOCKS_WARN_ON(ret == 1)) 2210 if (DEBUG_LOCKS_WARN_ON(ret == 1))
2101 return 1; 2211 return 1;
2102 2212
@@ -2110,11 +2220,13 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev,
2110 if (DEBUG_LOCKS_WARN_ON(ret == -1)) 2220 if (DEBUG_LOCKS_WARN_ON(ret == -1))
2111 return 1; 2221 return 1;
2112 2222
2113 return print_bad_irq_dependency(curr, &this, &that, 2223 print_bad_irq_dependency(curr, &this, &that,
2114 target_entry, target_entry1, 2224 target_entry, target_entry1,
2115 prev, next, 2225 prev, next,
2116 backward_bit, forward_bit, 2226 backward_bit, forward_bit,
2117 state_name(backward_bit)); 2227 state_name(backward_bit));
2228
2229 return 0;
2118} 2230}
2119 2231
2120static void inc_chains(void) 2232static void inc_chains(void)
@@ -2142,11 +2254,10 @@ static inline void inc_chains(void)
2142 nr_process_chains++; 2254 nr_process_chains++;
2143} 2255}
2144 2256
2145#endif 2257#endif /* CONFIG_TRACE_IRQFLAGS */
2146 2258
2147static void 2259static void
2148print_deadlock_scenario(struct held_lock *nxt, 2260print_deadlock_scenario(struct held_lock *nxt, struct held_lock *prv)
2149 struct held_lock *prv)
2150{ 2261{
2151 struct lock_class *next = hlock_class(nxt); 2262 struct lock_class *next = hlock_class(nxt);
2152 struct lock_class *prev = hlock_class(prv); 2263 struct lock_class *prev = hlock_class(prv);
@@ -2164,12 +2275,12 @@ print_deadlock_scenario(struct held_lock *nxt,
2164 printk(" May be due to missing lock nesting notation\n\n"); 2275 printk(" May be due to missing lock nesting notation\n\n");
2165} 2276}
2166 2277
2167static int 2278static void
2168print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, 2279print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
2169 struct held_lock *next) 2280 struct held_lock *next)
2170{ 2281{
2171 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 2282 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
2172 return 0; 2283 return;
2173 2284
2174 pr_warn("\n"); 2285 pr_warn("\n");
2175 pr_warn("============================================\n"); 2286 pr_warn("============================================\n");
@@ -2188,8 +2299,6 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
2188 2299
2189 pr_warn("\nstack backtrace:\n"); 2300 pr_warn("\nstack backtrace:\n");
2190 dump_stack(); 2301 dump_stack();
2191
2192 return 0;
2193} 2302}
2194 2303
2195/* 2304/*
@@ -2201,8 +2310,7 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
2201 * Returns: 0 on deadlock detected, 1 on OK, 2 on recursive read 2310 * Returns: 0 on deadlock detected, 1 on OK, 2 on recursive read
2202 */ 2311 */
2203static int 2312static int
2204check_deadlock(struct task_struct *curr, struct held_lock *next, 2313check_deadlock(struct task_struct *curr, struct held_lock *next)
2205 struct lockdep_map *next_instance, int read)
2206{ 2314{
2207 struct held_lock *prev; 2315 struct held_lock *prev;
2208 struct held_lock *nest = NULL; 2316 struct held_lock *nest = NULL;
@@ -2221,7 +2329,7 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
2221 * Allow read-after-read recursion of the same 2329 * Allow read-after-read recursion of the same
2222 * lock class (i.e. read_lock(lock)+read_lock(lock)): 2330 * lock class (i.e. read_lock(lock)+read_lock(lock)):
2223 */ 2331 */
2224 if ((read == 2) && prev->read) 2332 if ((next->read == 2) && prev->read)
2225 return 2; 2333 return 2;
2226 2334
2227 /* 2335 /*
@@ -2231,14 +2339,15 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
2231 if (nest) 2339 if (nest)
2232 return 2; 2340 return 2;
2233 2341
2234 return print_deadlock_bug(curr, prev, next); 2342 print_deadlock_bug(curr, prev, next);
2343 return 0;
2235 } 2344 }
2236 return 1; 2345 return 1;
2237} 2346}
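With the extra parameters gone, check_deadlock() now takes the read mode straight from *next. The sketch below models the same contract on plain arrays: scan the held-lock stack for an entry of the same class, allow read-after-read recursion, and use the 0/1/2 return convention documented above. The nest_lock case and the real structures are simplified away.

/*
 * Simplified self-deadlock scan: 0 = deadlock, 1 = OK, 2 = recursive read.
 */
#include <stdio.h>

struct hlock {
	int class_idx;
	int read;		/* 0 = write, 1 = read, 2 = recursive read */
};

static int check_deadlock(const struct hlock *held, int depth,
			  const struct hlock *next)
{
	for (int i = 0; i < depth; i++) {
		const struct hlock *prev = &held[i];

		if (prev->class_idx != next->class_idx)
			continue;

		/* read_lock(A); read_lock(A); is fine */
		if (next->read == 2 && prev->read)
			return 2;

		return 0;	/* same class taken again: deadlock */
	}
	return 1;
}

int main(void)
{
	struct hlock held[] = { { .class_idx = 5, .read = 1 } };
	struct hlock rd = { .class_idx = 5, .read = 2 };
	struct hlock wr = { .class_idx = 5, .read = 0 };

	printf("read-after-read : %d\n", check_deadlock(held, 1, &rd)); /* 2 */
	printf("write-after-read: %d\n", check_deadlock(held, 1, &wr)); /* 0 */
	return 0;
}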
2238 2347
2239/* 2348/*
2240 * There was a chain-cache miss, and we are about to add a new dependency 2349 * There was a chain-cache miss, and we are about to add a new dependency
2241 * to a previous lock. We recursively validate the following rules: 2350 * to a previous lock. We validate the following rules:
2242 * 2351 *
2243 * - would the adding of the <prev> -> <next> dependency create a 2352 * - would the adding of the <prev> -> <next> dependency create a
2244 * circular dependency in the graph? [== circular deadlock] 2353 * circular dependency in the graph? [== circular deadlock]
@@ -2262,9 +2371,7 @@ static int
2262check_prev_add(struct task_struct *curr, struct held_lock *prev, 2371check_prev_add(struct task_struct *curr, struct held_lock *prev,
2263 struct held_lock *next, int distance, struct lock_trace *trace) 2372 struct held_lock *next, int distance, struct lock_trace *trace)
2264{ 2373{
2265 struct lock_list *uninitialized_var(target_entry);
2266 struct lock_list *entry; 2374 struct lock_list *entry;
2267 struct lock_list this;
2268 int ret; 2375 int ret;
2269 2376
2270 if (!hlock_class(prev)->key || !hlock_class(next)->key) { 2377 if (!hlock_class(prev)->key || !hlock_class(next)->key) {
@@ -2288,28 +2395,16 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
2288 /* 2395 /*
2289 * Prove that the new <prev> -> <next> dependency would not 2396 * Prove that the new <prev> -> <next> dependency would not
2290 * create a circular dependency in the graph. (We do this by 2397 * create a circular dependency in the graph. (We do this by
2291 * forward-recursing into the graph starting at <next>, and 2398 * a breadth-first search into the graph starting at <next>,
2292 * checking whether we can reach <prev>.) 2399 * and check whether we can reach <prev>.)
2293 * 2400 *
2294 * We are using global variables to control the recursion, to 2401 * The search is limited by the size of the circular queue (i.e.,
2295 * keep the stackframe size of the recursive functions low: 2402 * MAX_CIRCULAR_QUEUE_SIZE) which keeps track of a breadth of nodes
2403 * in the graph whose neighbours are to be checked.
2296 */ 2404 */
2297 this.class = hlock_class(next); 2405 ret = check_noncircular(next, prev, trace);
2298 this.parent = NULL; 2406 if (unlikely(ret <= 0))
2299 ret = check_noncircular(&this, hlock_class(prev), &target_entry); 2407 return 0;
2300 if (unlikely(!ret)) {
2301 if (!trace->nr_entries) {
2302 /*
2303 * If save_trace fails here, the printing might
2304 * trigger a WARN but because of the !nr_entries it
2305 * should not do bad things.
2306 */
2307 save_trace(trace);
2308 }
2309 return print_circular_bug(&this, target_entry, next, prev);
2310 }
2311 else if (unlikely(ret < 0))
2312 return print_bfs_bug(ret);
2313 2408
2314 if (!check_irq_usage(curr, prev, next)) 2409 if (!check_irq_usage(curr, prev, next))
2315 return 0; 2410 return 0;
@@ -2340,19 +2435,14 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
2340 } 2435 }
2341 } 2436 }
2342 2437
2438#ifdef CONFIG_LOCKDEP_SMALL
2343 /* 2439 /*
2344 * Is the <prev> -> <next> link redundant? 2440 * Is the <prev> -> <next> link redundant?
2345 */ 2441 */
2346 this.class = hlock_class(prev); 2442 ret = check_redundant(prev, next);
2347 this.parent = NULL; 2443 if (ret != 1)
2348 ret = check_redundant(&this, hlock_class(next), &target_entry); 2444 return ret;
2349 if (!ret) { 2445#endif
2350 debug_atomic_inc(nr_redundant);
2351 return 2;
2352 }
2353 if (ret < 0)
2354 return print_bfs_bug(ret);
2355
2356 2446
2357 if (!trace->nr_entries && !save_trace(trace)) 2447 if (!trace->nr_entries && !save_trace(trace))
2358 return 0; 2448 return 0;
@@ -2504,12 +2594,13 @@ static void
2504print_chain_keys_held_locks(struct task_struct *curr, struct held_lock *hlock_next) 2594print_chain_keys_held_locks(struct task_struct *curr, struct held_lock *hlock_next)
2505{ 2595{
2506 struct held_lock *hlock; 2596 struct held_lock *hlock;
2507 u64 chain_key = 0; 2597 u64 chain_key = INITIAL_CHAIN_KEY;
2508 int depth = curr->lockdep_depth; 2598 int depth = curr->lockdep_depth;
2509 int i; 2599 int i = get_first_held_lock(curr, hlock_next);
2510 2600
2511 printk("depth: %u\n", depth + 1); 2601 printk("depth: %u (irq_context %u)\n", depth - i + 1,
2512 for (i = get_first_held_lock(curr, hlock_next); i < depth; i++) { 2602 hlock_next->irq_context);
2603 for (; i < depth; i++) {
2513 hlock = curr->held_locks + i; 2604 hlock = curr->held_locks + i;
2514 chain_key = print_chain_key_iteration(hlock->class_idx, chain_key); 2605 chain_key = print_chain_key_iteration(hlock->class_idx, chain_key);
2515 2606
@@ -2523,13 +2614,13 @@ print_chain_keys_held_locks(struct task_struct *curr, struct held_lock *hlock_ne
2523static void print_chain_keys_chain(struct lock_chain *chain) 2614static void print_chain_keys_chain(struct lock_chain *chain)
2524{ 2615{
2525 int i; 2616 int i;
2526 u64 chain_key = 0; 2617 u64 chain_key = INITIAL_CHAIN_KEY;
2527 int class_id; 2618 int class_id;
2528 2619
2529 printk("depth: %u\n", chain->depth); 2620 printk("depth: %u\n", chain->depth);
2530 for (i = 0; i < chain->depth; i++) { 2621 for (i = 0; i < chain->depth; i++) {
2531 class_id = chain_hlocks[chain->base + i]; 2622 class_id = chain_hlocks[chain->base + i];
2532 chain_key = print_chain_key_iteration(class_id + 1, chain_key); 2623 chain_key = print_chain_key_iteration(class_id, chain_key);
2533 2624
2534 print_lock_name(lock_classes + class_id); 2625 print_lock_name(lock_classes + class_id);
2535 printk("\n"); 2626 printk("\n");
@@ -2580,7 +2671,7 @@ static int check_no_collision(struct task_struct *curr,
2580 } 2671 }
2581 2672
2582 for (j = 0; j < chain->depth - 1; j++, i++) { 2673 for (j = 0; j < chain->depth - 1; j++, i++) {
2583 id = curr->held_locks[i].class_idx - 1; 2674 id = curr->held_locks[i].class_idx;
2584 2675
2585 if (DEBUG_LOCKS_WARN_ON(chain_hlocks[chain->base + j] != id)) { 2676 if (DEBUG_LOCKS_WARN_ON(chain_hlocks[chain->base + j] != id)) {
2586 print_collision(curr, hlock, chain); 2677 print_collision(curr, hlock, chain);
@@ -2663,7 +2754,7 @@ static inline int add_chain_cache(struct task_struct *curr,
2663 if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) { 2754 if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) {
2664 chain->base = nr_chain_hlocks; 2755 chain->base = nr_chain_hlocks;
2665 for (j = 0; j < chain->depth - 1; j++, i++) { 2756 for (j = 0; j < chain->depth - 1; j++, i++) {
2666 int lock_id = curr->held_locks[i].class_idx - 1; 2757 int lock_id = curr->held_locks[i].class_idx;
2667 chain_hlocks[chain->base + j] = lock_id; 2758 chain_hlocks[chain->base + j] = lock_id;
2668 } 2759 }
2669 chain_hlocks[chain->base + j] = class - lock_classes; 2760 chain_hlocks[chain->base + j] = class - lock_classes;
@@ -2753,8 +2844,9 @@ cache_hit:
2753 return 1; 2844 return 1;
2754} 2845}
2755 2846
2756static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, 2847static int validate_chain(struct task_struct *curr,
2757 struct held_lock *hlock, int chain_head, u64 chain_key) 2848 struct held_lock *hlock,
2849 int chain_head, u64 chain_key)
2758{ 2850{
2759 /* 2851 /*
2760 * Trylock needs to maintain the stack of held locks, but it 2852 * Trylock needs to maintain the stack of held locks, but it
@@ -2775,12 +2867,18 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
2775 * - is softirq-safe, if this lock is hardirq-unsafe 2867 * - is softirq-safe, if this lock is hardirq-unsafe
2776 * 2868 *
2777 * And check whether the new lock's dependency graph 2869 * And check whether the new lock's dependency graph
2778 * could lead back to the previous lock. 2870 * could lead back to the previous lock:
2779 * 2871 *
2780 * any of these scenarios could lead to a deadlock. If 2872 * - within the current held-lock stack
2781 * All validations 2873 * - across our accumulated lock dependency records
2874 *
2875 * any of these scenarios could lead to a deadlock.
2782 */ 2876 */
2783 int ret = check_deadlock(curr, hlock, lock, hlock->read); 2877 /*
2878 * The simple case: does the current hold the same lock
2879 * already?
2880 */
2881 int ret = check_deadlock(curr, hlock);
2784 2882
2785 if (!ret) 2883 if (!ret)
2786 return 0; 2884 return 0;
@@ -2811,16 +2909,12 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
2811} 2909}
2812#else 2910#else
2813static inline int validate_chain(struct task_struct *curr, 2911static inline int validate_chain(struct task_struct *curr,
2814 struct lockdep_map *lock, struct held_lock *hlock, 2912 struct held_lock *hlock,
2815 int chain_head, u64 chain_key) 2913 int chain_head, u64 chain_key)
2816{ 2914{
2817 return 1; 2915 return 1;
2818} 2916}
2819 2917#endif /* CONFIG_PROVE_LOCKING */
2820static void print_lock_trace(struct lock_trace *trace, unsigned int spaces)
2821{
2822}
2823#endif
2824 2918
2825/* 2919/*
2826 * We are building curr_chain_key incrementally, so double-check 2920 * We are building curr_chain_key incrementally, so double-check
@@ -2831,7 +2925,7 @@ static void check_chain_key(struct task_struct *curr)
2831#ifdef CONFIG_DEBUG_LOCKDEP 2925#ifdef CONFIG_DEBUG_LOCKDEP
2832 struct held_lock *hlock, *prev_hlock = NULL; 2926 struct held_lock *hlock, *prev_hlock = NULL;
2833 unsigned int i; 2927 unsigned int i;
2834 u64 chain_key = 0; 2928 u64 chain_key = INITIAL_CHAIN_KEY;
2835 2929
2836 for (i = 0; i < curr->lockdep_depth; i++) { 2930 for (i = 0; i < curr->lockdep_depth; i++) {
2837 hlock = curr->held_locks + i; 2931 hlock = curr->held_locks + i;
@@ -2847,15 +2941,17 @@ static void check_chain_key(struct task_struct *curr)
2847 (unsigned long long)hlock->prev_chain_key); 2941 (unsigned long long)hlock->prev_chain_key);
2848 return; 2942 return;
2849 } 2943 }
2944
2850 /* 2945 /*
2851 * Whoops ran out of static storage again? 2946 * hlock->class_idx can't go beyond MAX_LOCKDEP_KEYS, but is
2947 * it registered lock class index?
2852 */ 2948 */
2853 if (DEBUG_LOCKS_WARN_ON(hlock->class_idx > MAX_LOCKDEP_KEYS)) 2949 if (DEBUG_LOCKS_WARN_ON(!test_bit(hlock->class_idx, lock_classes_in_use)))
2854 return; 2950 return;
2855 2951
2856 if (prev_hlock && (prev_hlock->irq_context != 2952 if (prev_hlock && (prev_hlock->irq_context !=
2857 hlock->irq_context)) 2953 hlock->irq_context))
2858 chain_key = 0; 2954 chain_key = INITIAL_CHAIN_KEY;
2859 chain_key = iterate_chain_key(chain_key, hlock->class_idx); 2955 chain_key = iterate_chain_key(chain_key, hlock->class_idx);
2860 prev_hlock = hlock; 2956 prev_hlock = hlock;
2861 } 2957 }
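Because class_idx is now a plain zero-based index into lock_classes[], validity is checked with test_bit() against the lock_classes_in_use bitmap instead of a comparison with MAX_LOCKDEP_KEYS. A small user-space sketch of that bitmap idea follows; set_bit_()/test_bit_() are simplified stand-ins for the kernel helpers.

/*
 * Mark registered classes in a bitmap and validate an index with a
 * single bit test, so index 0 no longer needs to be reserved.
 */
#include <stdio.h>
#include <stdbool.h>

#define MAX_KEYS	8192
#define BITS_PER_LONG	(8 * (int)sizeof(unsigned long))
#define BITMAP_LONGS	((MAX_KEYS + BITS_PER_LONG - 1) / BITS_PER_LONG)

static unsigned long in_use[BITMAP_LONGS];

static void set_bit_(unsigned int nr, unsigned long *map)
{
	map[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
}

static bool test_bit_(unsigned int nr, const unsigned long *map)
{
	return map[nr / BITS_PER_LONG] & (1UL << (nr % BITS_PER_LONG));
}

int main(void)
{
	unsigned int class_idx = 0;	/* index 0 is now a valid class */

	set_bit_(class_idx, in_use);	/* done at registration time */

	if (!test_bit_(class_idx, in_use))
		printf("class_idx %u is not a registered class\n", class_idx);
	else
		printf("class_idx %u is registered\n", class_idx);
	return 0;
}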
@@ -2873,14 +2969,11 @@ static void check_chain_key(struct task_struct *curr)
2873#endif 2969#endif
2874} 2970}
2875 2971
2972#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
2876static int mark_lock(struct task_struct *curr, struct held_lock *this, 2973static int mark_lock(struct task_struct *curr, struct held_lock *this,
2877 enum lock_usage_bit new_bit); 2974 enum lock_usage_bit new_bit);
2878 2975
2879#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) 2976static void print_usage_bug_scenario(struct held_lock *lock)
2880
2881
2882static void
2883print_usage_bug_scenario(struct held_lock *lock)
2884{ 2977{
2885 struct lock_class *class = hlock_class(lock); 2978 struct lock_class *class = hlock_class(lock);
2886 2979
@@ -2897,12 +2990,12 @@ print_usage_bug_scenario(struct held_lock *lock)
2897 printk("\n *** DEADLOCK ***\n\n"); 2990 printk("\n *** DEADLOCK ***\n\n");
2898} 2991}
2899 2992
2900static int 2993static void
2901print_usage_bug(struct task_struct *curr, struct held_lock *this, 2994print_usage_bug(struct task_struct *curr, struct held_lock *this,
2902 enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit) 2995 enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit)
2903{ 2996{
2904 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 2997 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
2905 return 0; 2998 return;
2906 2999
2907 pr_warn("\n"); 3000 pr_warn("\n");
2908 pr_warn("================================\n"); 3001 pr_warn("================================\n");
@@ -2932,8 +3025,6 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
2932 3025
2933 pr_warn("\nstack backtrace:\n"); 3026 pr_warn("\nstack backtrace:\n");
2934 dump_stack(); 3027 dump_stack();
2935
2936 return 0;
2937} 3028}
2938 3029
2939/* 3030/*
@@ -2943,8 +3034,10 @@ static inline int
2943valid_state(struct task_struct *curr, struct held_lock *this, 3034valid_state(struct task_struct *curr, struct held_lock *this,
2944 enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit) 3035 enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit)
2945{ 3036{
2946 if (unlikely(hlock_class(this)->usage_mask & (1 << bad_bit))) 3037 if (unlikely(hlock_class(this)->usage_mask & (1 << bad_bit))) {
2947 return print_usage_bug(curr, this, bad_bit, new_bit); 3038 print_usage_bug(curr, this, bad_bit, new_bit);
3039 return 0;
3040 }
2948 return 1; 3041 return 1;
2949} 3042}
2950 3043
@@ -2952,7 +3045,7 @@ valid_state(struct task_struct *curr, struct held_lock *this,
2952/* 3045/*
2953 * print irq inversion bug: 3046 * print irq inversion bug:
2954 */ 3047 */
2955static int 3048static void
2956print_irq_inversion_bug(struct task_struct *curr, 3049print_irq_inversion_bug(struct task_struct *curr,
2957 struct lock_list *root, struct lock_list *other, 3050 struct lock_list *root, struct lock_list *other,
2958 struct held_lock *this, int forwards, 3051 struct held_lock *this, int forwards,
@@ -2963,7 +3056,7 @@ print_irq_inversion_bug(struct task_struct *curr,
2963 int depth; 3056 int depth;
2964 3057
2965 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 3058 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
2966 return 0; 3059 return;
2967 3060
2968 pr_warn("\n"); 3061 pr_warn("\n");
2969 pr_warn("========================================================\n"); 3062 pr_warn("========================================================\n");
@@ -3004,13 +3097,11 @@ print_irq_inversion_bug(struct task_struct *curr,
3004 3097
3005 pr_warn("\nthe shortest dependencies between 2nd lock and 1st lock:\n"); 3098 pr_warn("\nthe shortest dependencies between 2nd lock and 1st lock:\n");
3006 if (!save_trace(&root->trace)) 3099 if (!save_trace(&root->trace))
3007 return 0; 3100 return;
3008 print_shortest_lock_dependencies(other, root); 3101 print_shortest_lock_dependencies(other, root);
3009 3102
3010 pr_warn("\nstack backtrace:\n"); 3103 pr_warn("\nstack backtrace:\n");
3011 dump_stack(); 3104 dump_stack();
3012
3013 return 0;
3014} 3105}
3015 3106
3016/* 3107/*
@@ -3028,13 +3119,16 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this,
3028 root.parent = NULL; 3119 root.parent = NULL;
3029 root.class = hlock_class(this); 3120 root.class = hlock_class(this);
3030 ret = find_usage_forwards(&root, lock_flag(bit), &target_entry); 3121 ret = find_usage_forwards(&root, lock_flag(bit), &target_entry);
3031 if (ret < 0) 3122 if (ret < 0) {
3032 return print_bfs_bug(ret); 3123 print_bfs_bug(ret);
3124 return 0;
3125 }
3033 if (ret == 1) 3126 if (ret == 1)
3034 return ret; 3127 return ret;
3035 3128
3036 return print_irq_inversion_bug(curr, &root, target_entry, 3129 print_irq_inversion_bug(curr, &root, target_entry,
3037 this, 1, irqclass); 3130 this, 1, irqclass);
3131 return 0;
3038} 3132}
3039 3133
3040/* 3134/*
@@ -3052,13 +3146,16 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
3052 root.parent = NULL; 3146 root.parent = NULL;
3053 root.class = hlock_class(this); 3147 root.class = hlock_class(this);
3054 ret = find_usage_backwards(&root, lock_flag(bit), &target_entry); 3148 ret = find_usage_backwards(&root, lock_flag(bit), &target_entry);
3055 if (ret < 0) 3149 if (ret < 0) {
3056 return print_bfs_bug(ret); 3150 print_bfs_bug(ret);
3151 return 0;
3152 }
3057 if (ret == 1) 3153 if (ret == 1)
3058 return ret; 3154 return ret;
3059 3155
3060 return print_irq_inversion_bug(curr, &root, target_entry, 3156 print_irq_inversion_bug(curr, &root, target_entry,
3061 this, 0, irqclass); 3157 this, 0, irqclass);
3158 return 0;
3062} 3159}
3063 3160
3064void print_irqtrace_events(struct task_struct *curr) 3161void print_irqtrace_events(struct task_struct *curr)
@@ -3141,7 +3238,7 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this,
3141 * Validate that the lock dependencies don't have conflicting usage 3238 * Validate that the lock dependencies don't have conflicting usage
3142 * states. 3239 * states.
3143 */ 3240 */
3144 if ((!read || !dir || STRICT_READ_CHECKS) && 3241 if ((!read || STRICT_READ_CHECKS) &&
3145 !usage(curr, this, excl_bit, state_name(new_bit & ~LOCK_USAGE_READ_MASK))) 3242 !usage(curr, this, excl_bit, state_name(new_bit & ~LOCK_USAGE_READ_MASK)))
3146 return 0; 3243 return 0;
3147 3244
@@ -3366,8 +3463,12 @@ void trace_softirqs_off(unsigned long ip)
3366 debug_atomic_inc(redundant_softirqs_off); 3463 debug_atomic_inc(redundant_softirqs_off);
3367} 3464}
3368 3465
3369static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock) 3466static int
3467mark_usage(struct task_struct *curr, struct held_lock *hlock, int check)
3370{ 3468{
3469 if (!check)
3470 goto lock_used;
3471
3371 /* 3472 /*
3372 * If non-trylock use in a hardirq or softirq context, then 3473 * If non-trylock use in a hardirq or softirq context, then
3373 * mark the lock as used in these contexts: 3474 * mark the lock as used in these contexts:
@@ -3411,6 +3512,11 @@ static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
3411 } 3512 }
3412 } 3513 }
3413 3514
3515lock_used:
3516 /* mark it as used: */
3517 if (!mark_lock(curr, hlock, LOCK_USED))
3518 return 0;
3519
3414 return 1; 3520 return 1;
3415} 3521}
3416 3522
@@ -3442,35 +3548,6 @@ static int separate_irq_context(struct task_struct *curr,
3442 return 0; 3548 return 0;
3443} 3549}
3444 3550
3445#else /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
3446
3447static inline
3448int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
3449 enum lock_usage_bit new_bit)
3450{
3451 WARN_ON(1); /* Impossible innit? when we don't have TRACE_IRQFLAG */
3452 return 1;
3453}
3454
3455static inline int mark_irqflags(struct task_struct *curr,
3456 struct held_lock *hlock)
3457{
3458 return 1;
3459}
3460
3461static inline unsigned int task_irq_context(struct task_struct *task)
3462{
3463 return 0;
3464}
3465
3466static inline int separate_irq_context(struct task_struct *curr,
3467 struct held_lock *hlock)
3468{
3469 return 0;
3470}
3471
3472#endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
3473
3474/* 3551/*
3475 * Mark a lock with a usage bit, and validate the state transition: 3552 * Mark a lock with a usage bit, and validate the state transition:
3476 */ 3553 */
@@ -3479,6 +3556,11 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
3479{ 3556{
3480 unsigned int new_mask = 1 << new_bit, ret = 1; 3557 unsigned int new_mask = 1 << new_bit, ret = 1;
3481 3558
3559 if (new_bit >= LOCK_USAGE_STATES) {
3560 DEBUG_LOCKS_WARN_ON(1);
3561 return 0;
3562 }
3563
3482 /* 3564 /*
3483 * If already set then do not dirty the cacheline, 3565 * If already set then do not dirty the cacheline,
3484 * nor do any checks: 3566 * nor do any checks:
@@ -3502,25 +3584,13 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
3502 return 0; 3584 return 0;
3503 3585
3504 switch (new_bit) { 3586 switch (new_bit) {
3505#define LOCKDEP_STATE(__STATE) \
3506 case LOCK_USED_IN_##__STATE: \
3507 case LOCK_USED_IN_##__STATE##_READ: \
3508 case LOCK_ENABLED_##__STATE: \
3509 case LOCK_ENABLED_##__STATE##_READ:
3510#include "lockdep_states.h"
3511#undef LOCKDEP_STATE
3512 ret = mark_lock_irq(curr, this, new_bit);
3513 if (!ret)
3514 return 0;
3515 break;
3516 case LOCK_USED: 3587 case LOCK_USED:
3517 debug_atomic_dec(nr_unused_locks); 3588 debug_atomic_dec(nr_unused_locks);
3518 break; 3589 break;
3519 default: 3590 default:
3520 if (!debug_locks_off_graph_unlock()) 3591 ret = mark_lock_irq(curr, this, new_bit);
3592 if (!ret)
3521 return 0; 3593 return 0;
3522 WARN_ON(1);
3523 return 0;
3524 } 3594 }
3525 3595
3526 graph_unlock(); 3596 graph_unlock();
@@ -3538,6 +3608,27 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
3538 return ret; 3608 return ret;
3539} 3609}
3540 3610
3611#else /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
3612
3613static inline int
3614mark_usage(struct task_struct *curr, struct held_lock *hlock, int check)
3615{
3616 return 1;
3617}
3618
3619static inline unsigned int task_irq_context(struct task_struct *task)
3620{
3621 return 0;
3622}
3623
3624static inline int separate_irq_context(struct task_struct *curr,
3625 struct held_lock *hlock)
3626{
3627 return 0;
3628}
3629
3630#endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
3631
3541/* 3632/*
3542 * Initialize a lock instance's lock-class mapping info: 3633 * Initialize a lock instance's lock-class mapping info:
3543 */ 3634 */
@@ -3601,15 +3692,15 @@ EXPORT_SYMBOL_GPL(lockdep_init_map);
3601struct lock_class_key __lockdep_no_validate__; 3692struct lock_class_key __lockdep_no_validate__;
3602EXPORT_SYMBOL_GPL(__lockdep_no_validate__); 3693EXPORT_SYMBOL_GPL(__lockdep_no_validate__);
3603 3694
3604static int 3695static void
3605print_lock_nested_lock_not_held(struct task_struct *curr, 3696print_lock_nested_lock_not_held(struct task_struct *curr,
3606 struct held_lock *hlock, 3697 struct held_lock *hlock,
3607 unsigned long ip) 3698 unsigned long ip)
3608{ 3699{
3609 if (!debug_locks_off()) 3700 if (!debug_locks_off())
3610 return 0; 3701 return;
3611 if (debug_locks_silent) 3702 if (debug_locks_silent)
3612 return 0; 3703 return;
3613 3704
3614 pr_warn("\n"); 3705 pr_warn("\n");
3615 pr_warn("==================================\n"); 3706 pr_warn("==================================\n");
@@ -3631,8 +3722,6 @@ print_lock_nested_lock_not_held(struct task_struct *curr,
3631 3722
3632 pr_warn("\nstack backtrace:\n"); 3723 pr_warn("\nstack backtrace:\n");
3633 dump_stack(); 3724 dump_stack();
3634
3635 return 0;
3636} 3725}
3637 3726
3638static int __lock_is_held(const struct lockdep_map *lock, int read); 3727static int __lock_is_held(const struct lockdep_map *lock, int read);
@@ -3697,24 +3786,24 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3697 if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH)) 3786 if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH))
3698 return 0; 3787 return 0;
3699 3788
3700 class_idx = class - lock_classes + 1; 3789 class_idx = class - lock_classes;
3701 3790
3702 if (depth) { 3791 if (depth) {
3703 hlock = curr->held_locks + depth - 1; 3792 hlock = curr->held_locks + depth - 1;
3704 if (hlock->class_idx == class_idx && nest_lock) { 3793 if (hlock->class_idx == class_idx && nest_lock) {
3705 if (hlock->references) { 3794 if (!references)
3706 /* 3795 references++;
3707 * Check: unsigned int references:12, overflow.
3708 */
3709 if (DEBUG_LOCKS_WARN_ON(hlock->references == (1 << 12)-1))
3710 return 0;
3711 3796
3797 if (!hlock->references)
3712 hlock->references++; 3798 hlock->references++;
3713 } else {
3714 hlock->references = 2;
3715 }
3716 3799
3717 return 1; 3800 hlock->references += references;
3801
3802 /* Overflow */
3803 if (DEBUG_LOCKS_WARN_ON(hlock->references < references))
3804 return 0;
3805
3806 return 2;
3718 } 3807 }
3719 } 3808 }
3720 3809
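The reworked nest_lock path above folds a repeated acquisition into the previous held_lock, returns 2 to signal the merge, and relies on an after-the-fact comparison to catch wrap-around of the narrow references bitfield. Below is a simplified model of that bookkeeping; the 12-bit width mirrors struct held_lock, everything else is illustrative.

/*
 * Fold a repeated nested acquisition into the previous entry's
 * reference count; 2 = merged, 0 = the count wrapped around.
 */
#include <stdio.h>

struct held {
	unsigned int references : 12;
};

static int merge_reference(struct held *hlock, unsigned int references)
{
	if (!references)
		references++;		/* an unreferenced hold counts as one */

	if (!hlock->references)
		hlock->references++;	/* first merge: account the original hold */

	hlock->references += references;

	/* wrap-around check in the spirit of the hunk above: the stored
	 * count must not end up below the amount just added */
	if (hlock->references < references) {
		printf("reference count overflowed\n");
		return 0;
	}
	return 2;
}

int main(void)
{
	struct held h = { .references = 0 };

	printf("merge -> %d, refs now %u\n", merge_reference(&h, 0), h.references);
	printf("merge -> %d, refs now %u\n", merge_reference(&h, 0), h.references);
	return 0;
}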
@@ -3741,11 +3830,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3741#endif 3830#endif
3742 hlock->pin_count = pin_count; 3831 hlock->pin_count = pin_count;
3743 3832
3744 if (check && !mark_irqflags(curr, hlock)) 3833 /* Initialize the lock usage bit */
3745 return 0; 3834 if (!mark_usage(curr, hlock, check))
3746
3747 /* mark it as used: */
3748 if (!mark_lock(curr, hlock, LOCK_USED))
3749 return 0; 3835 return 0;
3750 3836
3751 /* 3837 /*
@@ -3759,9 +3845,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3759 * the hash, not class->key. 3845 * the hash, not class->key.
3760 */ 3846 */
3761 /* 3847 /*
3762 * Whoops, we did it again.. ran straight out of our static allocation. 3848 * Whoops, we did it again.. class_idx is invalid.
3763 */ 3849 */
3764 if (DEBUG_LOCKS_WARN_ON(class_idx > MAX_LOCKDEP_KEYS)) 3850 if (DEBUG_LOCKS_WARN_ON(!test_bit(class_idx, lock_classes_in_use)))
3765 return 0; 3851 return 0;
3766 3852
3767 chain_key = curr->curr_chain_key; 3853 chain_key = curr->curr_chain_key;
@@ -3769,27 +3855,29 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3769 /* 3855 /*
3770 * How can we have a chain hash when we ain't got no keys?! 3856 * How can we have a chain hash when we ain't got no keys?!
3771 */ 3857 */
3772 if (DEBUG_LOCKS_WARN_ON(chain_key != 0)) 3858 if (DEBUG_LOCKS_WARN_ON(chain_key != INITIAL_CHAIN_KEY))
3773 return 0; 3859 return 0;
3774 chain_head = 1; 3860 chain_head = 1;
3775 } 3861 }
3776 3862
3777 hlock->prev_chain_key = chain_key; 3863 hlock->prev_chain_key = chain_key;
3778 if (separate_irq_context(curr, hlock)) { 3864 if (separate_irq_context(curr, hlock)) {
3779 chain_key = 0; 3865 chain_key = INITIAL_CHAIN_KEY;
3780 chain_head = 1; 3866 chain_head = 1;
3781 } 3867 }
3782 chain_key = iterate_chain_key(chain_key, class_idx); 3868 chain_key = iterate_chain_key(chain_key, class_idx);
3783 3869
3784 if (nest_lock && !__lock_is_held(nest_lock, -1)) 3870 if (nest_lock && !__lock_is_held(nest_lock, -1)) {
3785 return print_lock_nested_lock_not_held(curr, hlock, ip); 3871 print_lock_nested_lock_not_held(curr, hlock, ip);
3872 return 0;
3873 }
3786 3874
3787 if (!debug_locks_silent) { 3875 if (!debug_locks_silent) {
3788 WARN_ON_ONCE(depth && !hlock_class(hlock - 1)->key); 3876 WARN_ON_ONCE(depth && !hlock_class(hlock - 1)->key);
3789 WARN_ON_ONCE(!hlock_class(hlock)->key); 3877 WARN_ON_ONCE(!hlock_class(hlock)->key);
3790 } 3878 }
3791 3879
3792 if (!validate_chain(curr, lock, hlock, chain_head, chain_key)) 3880 if (!validate_chain(curr, hlock, chain_head, chain_key))
3793 return 0; 3881 return 0;
3794 3882
3795 curr->curr_chain_key = chain_key; 3883 curr->curr_chain_key = chain_key;
@@ -3818,14 +3906,14 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3818 return 1; 3906 return 1;
3819} 3907}
3820 3908
3821static int 3909static void print_unlock_imbalance_bug(struct task_struct *curr,
3822print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock, 3910 struct lockdep_map *lock,
3823 unsigned long ip) 3911 unsigned long ip)
3824{ 3912{
3825 if (!debug_locks_off()) 3913 if (!debug_locks_off())
3826 return 0; 3914 return;
3827 if (debug_locks_silent) 3915 if (debug_locks_silent)
3828 return 0; 3916 return;
3829 3917
3830 pr_warn("\n"); 3918 pr_warn("\n");
3831 pr_warn("=====================================\n"); 3919 pr_warn("=====================================\n");
@@ -3843,8 +3931,6 @@ print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
3843 3931
3844 pr_warn("\nstack backtrace:\n"); 3932 pr_warn("\nstack backtrace:\n");
3845 dump_stack(); 3933 dump_stack();
3846
3847 return 0;
3848} 3934}
3849 3935
3850static int match_held_lock(const struct held_lock *hlock, 3936static int match_held_lock(const struct held_lock *hlock,
@@ -3876,7 +3962,7 @@ static int match_held_lock(const struct held_lock *hlock,
3876 if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock)) 3962 if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock))
3877 return 0; 3963 return 0;
3878 3964
3879 if (hlock->class_idx == class - lock_classes + 1) 3965 if (hlock->class_idx == class - lock_classes)
3880 return 1; 3966 return 1;
3881 } 3967 }
3882 3968
@@ -3920,22 +4006,33 @@ out:
3920} 4006}
3921 4007
3922static int reacquire_held_locks(struct task_struct *curr, unsigned int depth, 4008static int reacquire_held_locks(struct task_struct *curr, unsigned int depth,
3923 int idx) 4009 int idx, unsigned int *merged)
3924{ 4010{
3925 struct held_lock *hlock; 4011 struct held_lock *hlock;
4012 int first_idx = idx;
3926 4013
3927 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 4014 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
3928 return 0; 4015 return 0;
3929 4016
3930 for (hlock = curr->held_locks + idx; idx < depth; idx++, hlock++) { 4017 for (hlock = curr->held_locks + idx; idx < depth; idx++, hlock++) {
3931 if (!__lock_acquire(hlock->instance, 4018 switch (__lock_acquire(hlock->instance,
3932 hlock_class(hlock)->subclass, 4019 hlock_class(hlock)->subclass,
3933 hlock->trylock, 4020 hlock->trylock,
3934 hlock->read, hlock->check, 4021 hlock->read, hlock->check,
3935 hlock->hardirqs_off, 4022 hlock->hardirqs_off,
3936 hlock->nest_lock, hlock->acquire_ip, 4023 hlock->nest_lock, hlock->acquire_ip,
3937 hlock->references, hlock->pin_count)) 4024 hlock->references, hlock->pin_count)) {
4025 case 0:
3938 return 1; 4026 return 1;
4027 case 1:
4028 break;
4029 case 2:
4030 *merged += (idx == first_idx);
4031 break;
4032 default:
4033 WARN_ON(1);
4034 return 0;
4035 }
3939 } 4036 }
3940 return 0; 4037 return 0;
3941} 4038}
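reacquire_held_locks() now reports through *merged how many replayed entries were folded into the lock beneath them, which is what lets callers such as __lock_release() relax their depth check to depth - merged. The sketch below models that accounting on plain arrays; unlike the kernel code it counts every fold, so treat it purely as an illustration of the invariant.

/*
 * Release one hold, replay the ones above it, fold duplicates of the
 * class on top of the stack, and check the relaxed depth invariant.
 */
#include <stdio.h>
#include <assert.h>

#define MAX_DEPTH 48

struct held { int class_idx; };

static struct held stack[MAX_DEPTH];
static int depth;

/* replay one saved hold; fold it into the top entry if the class matches */
static void reacquire(const struct held *h, unsigned int *merged)
{
	if (depth && stack[depth - 1].class_idx == h->class_idx) {
		(*merged)++;			/* merged, nothing pushed */
		return;
	}
	stack[depth++] = *h;
}

int main(void)
{
	/* the held-lock stack before the release; the two class-7 holds
	 * will merge when they are replayed back to back */
	struct held saved[] = { { 3 }, { 7 }, { 7 } };
	int old_depth = 3, release_idx = 0;
	unsigned int merged = 0;

	for (int i = 0; i < old_depth; i++)
		stack[i] = saved[i];
	depth = old_depth;

	/* release the hold at release_idx, then replay everything above it */
	depth = release_idx;
	for (int i = release_idx + 1; i < old_depth; i++)
		reacquire(&saved[i], &merged);

	/* the kernel's depth - merged check, with the released hold counted
	 * separately here */
	assert(depth == old_depth - 1 - merged);
	printf("old depth %d, new depth %d, merged %u\n",
	       old_depth, depth, merged);
	return 0;
}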
@@ -3946,9 +4043,9 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
3946 unsigned long ip) 4043 unsigned long ip)
3947{ 4044{
3948 struct task_struct *curr = current; 4045 struct task_struct *curr = current;
4046 unsigned int depth, merged = 0;
3949 struct held_lock *hlock; 4047 struct held_lock *hlock;
3950 struct lock_class *class; 4048 struct lock_class *class;
3951 unsigned int depth;
3952 int i; 4049 int i;
3953 4050
3954 if (unlikely(!debug_locks)) 4051 if (unlikely(!debug_locks))
@@ -3963,24 +4060,26 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
3963 return 0; 4060 return 0;
3964 4061
3965 hlock = find_held_lock(curr, lock, depth, &i); 4062 hlock = find_held_lock(curr, lock, depth, &i);
3966 if (!hlock) 4063 if (!hlock) {
3967 return print_unlock_imbalance_bug(curr, lock, ip); 4064 print_unlock_imbalance_bug(curr, lock, ip);
4065 return 0;
4066 }
3968 4067
3969 lockdep_init_map(lock, name, key, 0); 4068 lockdep_init_map(lock, name, key, 0);
3970 class = register_lock_class(lock, subclass, 0); 4069 class = register_lock_class(lock, subclass, 0);
3971 hlock->class_idx = class - lock_classes + 1; 4070 hlock->class_idx = class - lock_classes;
3972 4071
3973 curr->lockdep_depth = i; 4072 curr->lockdep_depth = i;
3974 curr->curr_chain_key = hlock->prev_chain_key; 4073 curr->curr_chain_key = hlock->prev_chain_key;
3975 4074
3976 if (reacquire_held_locks(curr, depth, i)) 4075 if (reacquire_held_locks(curr, depth, i, &merged))
3977 return 0; 4076 return 0;
3978 4077
3979 /* 4078 /*
3980 * I took it apart and put it back together again, except now I have 4079 * I took it apart and put it back together again, except now I have
3981 * these 'spare' parts.. where shall I put them. 4080 * these 'spare' parts.. where shall I put them.
3982 */ 4081 */
3983 if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth)) 4082 if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - merged))
3984 return 0; 4083 return 0;
3985 return 1; 4084 return 1;
3986} 4085}
@@ -3988,8 +4087,8 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
3988static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip) 4087static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip)
3989{ 4088{
3990 struct task_struct *curr = current; 4089 struct task_struct *curr = current;
4090 unsigned int depth, merged = 0;
3991 struct held_lock *hlock; 4091 struct held_lock *hlock;
3992 unsigned int depth;
3993 int i; 4092 int i;
3994 4093
3995 if (unlikely(!debug_locks)) 4094 if (unlikely(!debug_locks))
@@ -4004,8 +4103,10 @@ static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip)
4004 return 0; 4103 return 0;
4005 4104
4006 hlock = find_held_lock(curr, lock, depth, &i); 4105 hlock = find_held_lock(curr, lock, depth, &i);
4007 if (!hlock) 4106 if (!hlock) {
4008 return print_unlock_imbalance_bug(curr, lock, ip); 4107 print_unlock_imbalance_bug(curr, lock, ip);
4108 return 0;
4109 }
4009 4110
4010 curr->lockdep_depth = i; 4111 curr->lockdep_depth = i;
4011 curr->curr_chain_key = hlock->prev_chain_key; 4112 curr->curr_chain_key = hlock->prev_chain_key;
@@ -4014,7 +4115,11 @@ static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip)
4014 hlock->read = 1; 4115 hlock->read = 1;
4015 hlock->acquire_ip = ip; 4116 hlock->acquire_ip = ip;
4016 4117
4017 if (reacquire_held_locks(curr, depth, i)) 4118 if (reacquire_held_locks(curr, depth, i, &merged))
4119 return 0;
4120
4121 /* Merging can't happen with unchanged classes.. */
4122 if (DEBUG_LOCKS_WARN_ON(merged))
4018 return 0; 4123 return 0;
4019 4124
4020 /* 4125 /*
@@ -4023,6 +4128,7 @@ static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip)
4023 */ 4128 */
4024 if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth)) 4129 if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth))
4025 return 0; 4130 return 0;
4131
4026 return 1; 4132 return 1;
4027} 4133}
4028 4134
@@ -4034,11 +4140,11 @@ static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip)
4034 * @nested is an hysterical artifact, needs a tree wide cleanup. 4140 * @nested is an hysterical artifact, needs a tree wide cleanup.
4035 */ 4141 */
4036static int 4142static int
4037__lock_release(struct lockdep_map *lock, int nested, unsigned long ip) 4143__lock_release(struct lockdep_map *lock, unsigned long ip)
4038{ 4144{
4039 struct task_struct *curr = current; 4145 struct task_struct *curr = current;
4146 unsigned int depth, merged = 1;
4040 struct held_lock *hlock; 4147 struct held_lock *hlock;
4041 unsigned int depth;
4042 int i; 4148 int i;
4043 4149
4044 if (unlikely(!debug_locks)) 4150 if (unlikely(!debug_locks))
@@ -4049,16 +4155,20 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
4049 * So we're all set to release this lock.. wait what lock? We don't 4155 * So we're all set to release this lock.. wait what lock? We don't
4050 * own any locks, you've been drinking again? 4156 * own any locks, you've been drinking again?
4051 */ 4157 */
4052 if (DEBUG_LOCKS_WARN_ON(depth <= 0)) 4158 if (depth <= 0) {
4053 return print_unlock_imbalance_bug(curr, lock, ip); 4159 print_unlock_imbalance_bug(curr, lock, ip);
4160 return 0;
4161 }
4054 4162
4055 /* 4163 /*
4056 * Check whether the lock exists in the current stack 4164 * Check whether the lock exists in the current stack
4057 * of held locks: 4165 * of held locks:
4058 */ 4166 */
4059 hlock = find_held_lock(curr, lock, depth, &i); 4167 hlock = find_held_lock(curr, lock, depth, &i);
4060 if (!hlock) 4168 if (!hlock) {
4061 return print_unlock_imbalance_bug(curr, lock, ip); 4169 print_unlock_imbalance_bug(curr, lock, ip);
4170 return 0;
4171 }
4062 4172
4063 if (hlock->instance == lock) 4173 if (hlock->instance == lock)
4064 lock_release_holdtime(hlock); 4174 lock_release_holdtime(hlock);
@@ -4093,14 +4203,15 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
4093 if (i == depth-1) 4203 if (i == depth-1)
4094 return 1; 4204 return 1;
4095 4205
4096 if (reacquire_held_locks(curr, depth, i + 1)) 4206 if (reacquire_held_locks(curr, depth, i + 1, &merged))
4097 return 0; 4207 return 0;
4098 4208
4099 /* 4209 /*
4100 * We had N bottles of beer on the wall, we drank one, but now 4210 * We had N bottles of beer on the wall, we drank one, but now
4101 * there's not N-1 bottles of beer left on the wall... 4211 * there's not N-1 bottles of beer left on the wall...
4212 * Pouring two of the bottles together is acceptable.
4102 */ 4213 */
4103 DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth-1); 4214 DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - merged);
4104 4215
4105 /* 4216 /*
4106 * Since reacquire_held_locks() would have called check_chain_key() 4217 * Since reacquire_held_locks() would have called check_chain_key()
@@ -4318,7 +4429,7 @@ void lock_release(struct lockdep_map *lock, int nested,
4318 check_flags(flags); 4429 check_flags(flags);
4319 current->lockdep_recursion = 1; 4430 current->lockdep_recursion = 1;
4320 trace_lock_release(lock, ip); 4431 trace_lock_release(lock, ip);
4321 if (__lock_release(lock, nested, ip)) 4432 if (__lock_release(lock, ip))
4322 check_chain_key(current); 4433 check_chain_key(current);
4323 current->lockdep_recursion = 0; 4434 current->lockdep_recursion = 0;
4324 raw_local_irq_restore(flags); 4435 raw_local_irq_restore(flags);
@@ -4401,14 +4512,14 @@ void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie)
4401EXPORT_SYMBOL_GPL(lock_unpin_lock); 4512EXPORT_SYMBOL_GPL(lock_unpin_lock);
4402 4513
4403#ifdef CONFIG_LOCK_STAT 4514#ifdef CONFIG_LOCK_STAT
4404static int 4515static void print_lock_contention_bug(struct task_struct *curr,
4405print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, 4516 struct lockdep_map *lock,
4406 unsigned long ip) 4517 unsigned long ip)
4407{ 4518{
4408 if (!debug_locks_off()) 4519 if (!debug_locks_off())
4409 return 0; 4520 return;
4410 if (debug_locks_silent) 4521 if (debug_locks_silent)
4411 return 0; 4522 return;
4412 4523
4413 pr_warn("\n"); 4524 pr_warn("\n");
4414 pr_warn("=================================\n"); 4525 pr_warn("=================================\n");
@@ -4426,8 +4537,6 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
4426 4537
4427 pr_warn("\nstack backtrace:\n"); 4538 pr_warn("\nstack backtrace:\n");
4428 dump_stack(); 4539 dump_stack();
4429
4430 return 0;
4431} 4540}
4432 4541
4433static void 4542static void
@@ -4572,9 +4681,7 @@ void lockdep_reset(void)
4572 int i; 4681 int i;
4573 4682
4574 raw_local_irq_save(flags); 4683 raw_local_irq_save(flags);
4575 current->curr_chain_key = 0; 4684 lockdep_init_task(current);
4576 current->lockdep_depth = 0;
4577 current->lockdep_recursion = 0;
4578 memset(current->held_locks, 0, MAX_LOCK_DEPTH*sizeof(struct held_lock)); 4685 memset(current->held_locks, 0, MAX_LOCK_DEPTH*sizeof(struct held_lock));
4579 nr_hardirq_chains = 0; 4686 nr_hardirq_chains = 0;
4580 nr_softirq_chains = 0; 4687 nr_softirq_chains = 0;
@@ -4614,9 +4721,9 @@ static void remove_class_from_lock_chain(struct pending_free *pf,
4614 return; 4721 return;
4615 4722
4616recalc: 4723recalc:
4617 chain_key = 0; 4724 chain_key = INITIAL_CHAIN_KEY;
4618 for (i = chain->base; i < chain->base + chain->depth; i++) 4725 for (i = chain->base; i < chain->base + chain->depth; i++)
4619 chain_key = iterate_chain_key(chain_key, chain_hlocks[i] + 1); 4726 chain_key = iterate_chain_key(chain_key, chain_hlocks[i]);
4620 if (chain->depth && chain->chain_key == chain_key) 4727 if (chain->depth && chain->chain_key == chain_key)
4621 return; 4728 return;
4622 /* Overwrite the chain key for concurrent RCU readers. */ 4729 /* Overwrite the chain key for concurrent RCU readers. */
@@ -4690,6 +4797,7 @@ static void zap_class(struct pending_free *pf, struct lock_class *class)
4690 WRITE_ONCE(class->key, NULL); 4797 WRITE_ONCE(class->key, NULL);
4691 WRITE_ONCE(class->name, NULL); 4798 WRITE_ONCE(class->name, NULL);
4692 nr_lock_classes--; 4799 nr_lock_classes--;
4800 __clear_bit(class - lock_classes, lock_classes_in_use);
4693 } else { 4801 } else {
4694 WARN_ONCE(true, "%s() failed for class %s\n", __func__, 4802 WARN_ONCE(true, "%s() failed for class %s\n", __func__,
4695 class->name); 4803 class->name);
@@ -5035,6 +5143,7 @@ void __init lockdep_init(void)
5035 5143
5036 printk(" memory used by lock dependency info: %zu kB\n", 5144 printk(" memory used by lock dependency info: %zu kB\n",
5037 (sizeof(lock_classes) + 5145 (sizeof(lock_classes) +
5146 sizeof(lock_classes_in_use) +
5038 sizeof(classhash_table) + 5147 sizeof(classhash_table) +
5039 sizeof(list_entries) + 5148 sizeof(list_entries) +
5040 sizeof(list_entries_in_use) + 5149 sizeof(list_entries_in_use) +
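
To make the new depth accounting concrete: reacquire_held_locks() now reports, through its extra *merged argument, how many held_lock entries were coalesced while the stack was replayed, and its callers compare against depth - merged rather than depth. Below is a minimal standalone C sketch of just that arithmetic, not lockdep's real data structures (entry and rebuild are invented names for illustration); it shows why the recorded depth may legitimately shrink by exactly the number of merges.

#include <assert.h>
#include <stdio.h>

struct entry { int class; int read; int references; };

/*
 * Replay the held-lock stack, coalescing an entry into its predecessor
 * when class and read state match, and report how many entries were
 * merged away.  Returns the new depth.
 */
static int rebuild(struct entry *stack, int depth, int *merged)
{
	int i, out = 0;

	*merged = 0;
	for (i = 0; i < depth; i++) {
		if (out && stack[out - 1].class == stack[i].class &&
		    stack[out - 1].read == stack[i].read) {
			stack[out - 1].references += stack[i].references;
			(*merged)++;
			continue;
		}
		stack[out++] = stack[i];
	}
	return out;
}

int main(void)
{
	/* Two entries of the same class (e.g. after __lock_set_class()). */
	struct entry stack[] = {
		{ .class = 1, .read = 0, .references = 1 },
		{ .class = 1, .read = 0, .references = 1 },
		{ .class = 2, .read = 1, .references = 1 },
	};
	int merged, depth = rebuild(stack, 3, &merged);

	/* Mirrors DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - merged). */
	assert(depth == 3 - merged);
	printf("depth %d, merged %d\n", depth, merged);
	return 0;
}

Run as an ordinary user-space program this prints "depth 2, merged 1": the stack lost one entry, and that is exactly the slack the reworked DEBUG_LOCKS_WARN_ON() checks now allow.
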
diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h
index 150ec3f0c5b5..cc83568d5012 100644
--- a/kernel/locking/lockdep_internals.h
+++ b/kernel/locking/lockdep_internals.h
@@ -131,7 +131,6 @@ extern unsigned int nr_hardirq_chains;
131extern unsigned int nr_softirq_chains; 131extern unsigned int nr_softirq_chains;
132extern unsigned int nr_process_chains; 132extern unsigned int nr_process_chains;
133extern unsigned int max_lockdep_depth; 133extern unsigned int max_lockdep_depth;
134extern unsigned int max_recursion_depth;
135 134
136extern unsigned int max_bfs_queue_depth; 135extern unsigned int max_bfs_queue_depth;
137 136
@@ -160,25 +159,22 @@ lockdep_count_backward_deps(struct lock_class *class)
160 * and we want to avoid too much cache bouncing. 159 * and we want to avoid too much cache bouncing.
161 */ 160 */
162struct lockdep_stats { 161struct lockdep_stats {
163 int chain_lookup_hits; 162 unsigned long chain_lookup_hits;
164 int chain_lookup_misses; 163 unsigned int chain_lookup_misses;
165 int hardirqs_on_events; 164 unsigned long hardirqs_on_events;
166 int hardirqs_off_events; 165 unsigned long hardirqs_off_events;
167 int redundant_hardirqs_on; 166 unsigned long redundant_hardirqs_on;
168 int redundant_hardirqs_off; 167 unsigned long redundant_hardirqs_off;
169 int softirqs_on_events; 168 unsigned long softirqs_on_events;
170 int softirqs_off_events; 169 unsigned long softirqs_off_events;
171 int redundant_softirqs_on; 170 unsigned long redundant_softirqs_on;
172 int redundant_softirqs_off; 171 unsigned long redundant_softirqs_off;
173 int nr_unused_locks; 172 int nr_unused_locks;
174 int nr_redundant_checks; 173 unsigned int nr_redundant_checks;
175 int nr_redundant; 174 unsigned int nr_redundant;
176 int nr_cyclic_checks; 175 unsigned int nr_cyclic_checks;
177 int nr_cyclic_check_recursions; 176 unsigned int nr_find_usage_forwards_checks;
178 int nr_find_usage_forwards_checks; 177 unsigned int nr_find_usage_backwards_checks;
179 int nr_find_usage_forwards_recursions;
180 int nr_find_usage_backwards_checks;
181 int nr_find_usage_backwards_recursions;
182 178
183 /* 179 /*
184 * Per lock class locking operation stat counts 180 * Per lock class locking operation stat counts
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 80a463d31a8d..c513031cd7e3 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -975,7 +975,7 @@ static int __init lock_torture_init(void)
975 goto unwind; 975 goto unwind;
976 } 976 }
977 if (stutter > 0) { 977 if (stutter > 0) {
978 firsterr = torture_stutter_init(stutter); 978 firsterr = torture_stutter_init(stutter, stutter);
979 if (firsterr) 979 if (firsterr)
980 goto unwind; 980 goto unwind;
981 } 981 }
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index db578783dd36..0c601ae072b3 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * kernel/locking/mutex.c 3 * kernel/locking/mutex.c
3 * 4 *
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index f17dad99eec8..364d38a0c444 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1#include <linux/atomic.h> 2#include <linux/atomic.h>
2#include <linux/rwsem.h> 3#include <linux/rwsem.h>
3#include <linux/percpu.h> 4#include <linux/percpu.h>
@@ -17,7 +18,7 @@ int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
17 return -ENOMEM; 18 return -ENOMEM;
18 19
19 /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */ 20 /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
20 rcu_sync_init(&sem->rss, RCU_SCHED_SYNC); 21 rcu_sync_init(&sem->rss);
21 __init_rwsem(&sem->rw_sem, name, rwsem_key); 22 __init_rwsem(&sem->rw_sem, name, rwsem_key);
22 rcuwait_init(&sem->writer); 23 rcuwait_init(&sem->writer);
23 sem->readers_block = 0; 24 sem->readers_block = 0;
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index c7471c3fb798..fe9ca92faa2a 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -1,16 +1,7 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
1/* 2/*
2 * Queued read/write locks 3 * Queued read/write locks
3 * 4 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * (C) Copyright 2013-2014 Hewlett-Packard Development Company, L.P. 5 * (C) Copyright 2013-2014 Hewlett-Packard Development Company, L.P.
15 * 6 *
16 * Authors: Waiman Long <waiman.long@hp.com> 7 * Authors: Waiman Long <waiman.long@hp.com>
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index e14b32c69639..2473f10c6956 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -1,16 +1,7 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
1/* 2/*
2 * Queued spinlock 3 * Queued spinlock
3 * 4 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P. 5 * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
15 * (C) Copyright 2013-2014,2018 Red Hat, Inc. 6 * (C) Copyright 2013-2014,2018 Red Hat, Inc.
16 * (C) Copyright 2015 Intel Corp. 7 * (C) Copyright 2015 Intel Corp.
diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h
index 54152670ff24..e625bb410aa2 100644
--- a/kernel/locking/qspinlock_stat.h
+++ b/kernel/locking/qspinlock_stat.h
@@ -1,13 +1,5 @@
1/* SPDX-License-Identifier: GPL-2.0-or-later */
1/* 2/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License, or
5 * (at your option) any later version.
6 *
7 * This program is distributed in the hope that it will be useful,
8 * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 * GNU General Public License for more details.
11 * 3 *
12 * Authors: Waiman Long <longman@redhat.com> 4 * Authors: Waiman Long <longman@redhat.com>
13 */ 5 */
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 978d63a8261c..38fbf9fa7f1b 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * RT-Mutexes: simple blocking mutual exclusion locks with PI support 3 * RT-Mutexes: simple blocking mutual exclusion locks with PI support
3 * 4 *
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
deleted file mode 100644
index 6b3ee9948bf1..000000000000
--- a/kernel/locking/rwsem-xadd.c
+++ /dev/null
@@ -1,729 +0,0 @@
1// SPDX-License-Identifier: GPL-2.0
2/* rwsem.c: R/W semaphores: contention handling functions
3 *
4 * Written by David Howells (dhowells@redhat.com).
5 * Derived from arch/i386/kernel/semaphore.c
6 *
7 * Writer lock-stealing by Alex Shi <alex.shi@intel.com>
8 * and Michel Lespinasse <walken@google.com>
9 *
10 * Optimistic spinning by Tim Chen <tim.c.chen@intel.com>
11 * and Davidlohr Bueso <davidlohr@hp.com>. Based on mutexes.
12 */
13#include <linux/rwsem.h>
14#include <linux/init.h>
15#include <linux/export.h>
16#include <linux/sched/signal.h>
17#include <linux/sched/rt.h>
18#include <linux/sched/wake_q.h>
19#include <linux/sched/debug.h>
20#include <linux/osq_lock.h>
21
22#include "rwsem.h"
23
24/*
25 * Guide to the rw_semaphore's count field for common values.
26 * (32-bit case illustrated, similar for 64-bit)
27 *
28 * 0x0000000X (1) X readers active or attempting lock, no writer waiting
29 * X = #active_readers + #readers attempting to lock
30 * (X*ACTIVE_BIAS)
31 *
32 * 0x00000000 rwsem is unlocked, and no one is waiting for the lock or
33 * attempting to read lock or write lock.
34 *
35 * 0xffff000X (1) X readers active or attempting lock, with waiters for lock
36 * X = #active readers + # readers attempting lock
37 * (X*ACTIVE_BIAS + WAITING_BIAS)
38 * (2) 1 writer attempting lock, no waiters for lock
39 * X-1 = #active readers + #readers attempting lock
40 * ((X-1)*ACTIVE_BIAS + ACTIVE_WRITE_BIAS)
41 * (3) 1 writer active, no waiters for lock
42 * X-1 = #active readers + #readers attempting lock
43 * ((X-1)*ACTIVE_BIAS + ACTIVE_WRITE_BIAS)
44 *
45 * 0xffff0001 (1) 1 reader active or attempting lock, waiters for lock
46 * (WAITING_BIAS + ACTIVE_BIAS)
47 * (2) 1 writer active or attempting lock, no waiters for lock
48 * (ACTIVE_WRITE_BIAS)
49 *
50 * 0xffff0000 (1) There are writers or readers queued but none active
51 * or in the process of attempting lock.
52 * (WAITING_BIAS)
53 * Note: writer can attempt to steal lock for this count by adding
54 * ACTIVE_WRITE_BIAS in cmpxchg and checking the old count
55 *
56 * 0xfffe0001 (1) 1 writer active, or attempting lock. Waiters on queue.
57 * (ACTIVE_WRITE_BIAS + WAITING_BIAS)
58 *
59 * Note: Readers attempt to lock by adding ACTIVE_BIAS in down_read and checking
60 * the count becomes more than 0 for successful lock acquisition,
61 * i.e. the case where there are only readers or nobody has lock.
62 * (1st and 2nd case above).
63 *
64 * Writers attempt to lock by adding ACTIVE_WRITE_BIAS in down_write and
65 * checking the count becomes ACTIVE_WRITE_BIAS for successful lock
66 * acquisition (i.e. nobody else has lock or attempts lock). If
67 * unsuccessful, in rwsem_down_write_failed, we'll check to see if there
68 * are only waiters but none active (5th case above), and attempt to
69 * steal the lock.
70 *
71 */
72
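
As a quick cross-check of the table above, the listed values follow from the 32-bit biases the comment implies (ACTIVE_BIAS = 1, WAITING_BIAS = 0xffff0000); the real constants are defined in rwsem.h, not in this file, so treat the defines below purely as illustration.

#include <stdio.h>

/* Illustrative 32-bit values; the kernel's live in rwsem.h. */
#define RWSEM_ACTIVE_BIAS	0x00000001L
#define RWSEM_WAITING_BIAS	(-0x00010000L)	/* 0xffff0000 as a u32 */
#define RWSEM_ACTIVE_READ_BIAS	RWSEM_ACTIVE_BIAS
#define RWSEM_ACTIVE_WRITE_BIAS	(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)

int main(void)
{
	/* Three readers, nobody queued: 0x00000003 */
	printf("%08x\n", (unsigned int)(3 * RWSEM_ACTIVE_READ_BIAS));
	/* One reader plus queued waiters: 0xffff0001 */
	printf("%08x\n", (unsigned int)(RWSEM_ACTIVE_READ_BIAS + RWSEM_WAITING_BIAS));
	/* One writer holding the lock with waiters queued: 0xfffe0001 */
	printf("%08x\n", (unsigned int)(RWSEM_ACTIVE_WRITE_BIAS + RWSEM_WAITING_BIAS));
	return 0;
}

The three printed values (00000003, ffff0001, fffe0001) match the table's entries for those states.
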
73/*
74 * Initialize an rwsem:
75 */
76void __init_rwsem(struct rw_semaphore *sem, const char *name,
77 struct lock_class_key *key)
78{
79#ifdef CONFIG_DEBUG_LOCK_ALLOC
80 /*
81 * Make sure we are not reinitializing a held semaphore:
82 */
83 debug_check_no_locks_freed((void *)sem, sizeof(*sem));
84 lockdep_init_map(&sem->dep_map, name, key, 0);
85#endif
86 atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE);
87 raw_spin_lock_init(&sem->wait_lock);
88 INIT_LIST_HEAD(&sem->wait_list);
89#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
90 sem->owner = NULL;
91 osq_lock_init(&sem->osq);
92#endif
93}
94
95EXPORT_SYMBOL(__init_rwsem);
96
97enum rwsem_waiter_type {
98 RWSEM_WAITING_FOR_WRITE,
99 RWSEM_WAITING_FOR_READ
100};
101
102struct rwsem_waiter {
103 struct list_head list;
104 struct task_struct *task;
105 enum rwsem_waiter_type type;
106};
107
108enum rwsem_wake_type {
109 RWSEM_WAKE_ANY, /* Wake whatever's at head of wait list */
110 RWSEM_WAKE_READERS, /* Wake readers only */
111 RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */
112};
113
114/*
115 * handle the lock release when processes blocked on it that can now run
116 * - if we come here from up_xxxx(), then:
117 * - the 'active part' of count (&0x0000ffff) reached 0 (but may have changed)
118 * - the 'waiting part' of count (&0xffff0000) is -ve (and will still be so)
119 * - there must be someone on the queue
120 * - the wait_lock must be held by the caller
121 * - tasks are marked for wakeup, the caller must later invoke wake_up_q()
122 * to actually wakeup the blocked task(s) and drop the reference count,
123 * preferably when the wait_lock is released
124 * - woken process blocks are discarded from the list after having task zeroed
125 * - writers are only marked woken if downgrading is false
126 */
127static void __rwsem_mark_wake(struct rw_semaphore *sem,
128 enum rwsem_wake_type wake_type,
129 struct wake_q_head *wake_q)
130{
131 struct rwsem_waiter *waiter, *tmp;
132 long oldcount, woken = 0, adjustment = 0;
133
134 /*
135 * Take a peek at the queue head waiter such that we can determine
136 * the wakeup(s) to perform.
137 */
138 waiter = list_first_entry(&sem->wait_list, struct rwsem_waiter, list);
139
140 if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
141 if (wake_type == RWSEM_WAKE_ANY) {
142 /*
143 * Mark writer at the front of the queue for wakeup.
144 * Until the task is actually awoken later by
145 * the caller, other writers are able to steal it.
146 * Readers, on the other hand, will block as they
147 * will notice the queued writer.
148 */
149 wake_q_add(wake_q, waiter->task);
150 lockevent_inc(rwsem_wake_writer);
151 }
152
153 return;
154 }
155
156 /*
157 * Writers might steal the lock before we grant it to the next reader.
158 * We prefer to do the first reader grant before counting readers
159 * so we can bail out early if a writer stole the lock.
160 */
161 if (wake_type != RWSEM_WAKE_READ_OWNED) {
162 adjustment = RWSEM_ACTIVE_READ_BIAS;
163 try_reader_grant:
164 oldcount = atomic_long_fetch_add(adjustment, &sem->count);
165 if (unlikely(oldcount < RWSEM_WAITING_BIAS)) {
166 /*
167 * If the count is still less than RWSEM_WAITING_BIAS
168 * after removing the adjustment, it is assumed that
169 * a writer has stolen the lock. We have to undo our
170 * reader grant.
171 */
172 if (atomic_long_add_return(-adjustment, &sem->count) <
173 RWSEM_WAITING_BIAS)
174 return;
175
176 /* Last active locker left. Retry waking readers. */
177 goto try_reader_grant;
178 }
179 /*
180 * Set it to reader-owned to give spinners an early
181 * indication that readers now have the lock.
182 */
183 __rwsem_set_reader_owned(sem, waiter->task);
184 }
185
186 /*
187 * Grant an infinite number of read locks to the readers at the front
188 * of the queue. We know that woken will be at least 1 as we accounted
189 * for above. Note we increment the 'active part' of the count by the
190 * number of readers before waking any processes up.
191 */
192 list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) {
193 struct task_struct *tsk;
194
195 if (waiter->type == RWSEM_WAITING_FOR_WRITE)
196 break;
197
198 woken++;
199 tsk = waiter->task;
200
201 get_task_struct(tsk);
202 list_del(&waiter->list);
203 /*
204 * Ensure calling get_task_struct() before setting the reader
205 * waiter to nil such that rwsem_down_read_failed() cannot
206 * race with do_exit() by always holding a reference count
207 * to the task to wakeup.
208 */
209 smp_store_release(&waiter->task, NULL);
210 /*
211 * Ensure issuing the wakeup (either by us or someone else)
212 * after setting the reader waiter to nil.
213 */
214 wake_q_add_safe(wake_q, tsk);
215 }
216
217 adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment;
218 lockevent_cond_inc(rwsem_wake_reader, woken);
219 if (list_empty(&sem->wait_list)) {
220 /* hit end of list above */
221 adjustment -= RWSEM_WAITING_BIAS;
222 }
223
224 if (adjustment)
225 atomic_long_add(adjustment, &sem->count);
226}
227
228/*
229 * This function must be called with the sem->wait_lock held to prevent
230 * race conditions between checking the rwsem wait list and setting the
231 * sem->count accordingly.
232 */
233static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
234{
235 /*
236 * Avoid trying to acquire write lock if count isn't RWSEM_WAITING_BIAS.
237 */
238 if (count != RWSEM_WAITING_BIAS)
239 return false;
240
241 /*
242 * Acquire the lock by trying to set it to ACTIVE_WRITE_BIAS. If there
243 * are other tasks on the wait list, we need to add on WAITING_BIAS.
244 */
245 count = list_is_singular(&sem->wait_list) ?
246 RWSEM_ACTIVE_WRITE_BIAS :
247 RWSEM_ACTIVE_WRITE_BIAS + RWSEM_WAITING_BIAS;
248
249 if (atomic_long_cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS, count)
250 == RWSEM_WAITING_BIAS) {
251 rwsem_set_owner(sem);
252 return true;
253 }
254
255 return false;
256}
257
258#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
259/*
260 * Try to acquire write lock before the writer has been put on wait queue.
261 */
262static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
263{
264 long count = atomic_long_read(&sem->count);
265
266 while (!count || count == RWSEM_WAITING_BIAS) {
267 if (atomic_long_try_cmpxchg_acquire(&sem->count, &count,
268 count + RWSEM_ACTIVE_WRITE_BIAS)) {
269 rwsem_set_owner(sem);
270 lockevent_inc(rwsem_opt_wlock);
271 return true;
272 }
273 }
274 return false;
275}
276
277static inline bool owner_on_cpu(struct task_struct *owner)
278{
279 /*
280 * As lock holder preemption issue, we both skip spinning if
281 * task is not on cpu or its cpu is preempted
282 */
283 return owner->on_cpu && !vcpu_is_preempted(task_cpu(owner));
284}
285
286static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
287{
288 struct task_struct *owner;
289 bool ret = true;
290
291 BUILD_BUG_ON(!rwsem_has_anonymous_owner(RWSEM_OWNER_UNKNOWN));
292
293 if (need_resched())
294 return false;
295
296 rcu_read_lock();
297 owner = READ_ONCE(sem->owner);
298 if (owner) {
299 ret = is_rwsem_owner_spinnable(owner) &&
300 owner_on_cpu(owner);
301 }
302 rcu_read_unlock();
303 return ret;
304}
305
306/*
307 * Return true only if we can still spin on the owner field of the rwsem.
308 */
309static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem)
310{
311 struct task_struct *owner = READ_ONCE(sem->owner);
312
313 if (!is_rwsem_owner_spinnable(owner))
314 return false;
315
316 rcu_read_lock();
317 while (owner && (READ_ONCE(sem->owner) == owner)) {
318 /*
319 * Ensure we emit the owner->on_cpu, dereference _after_
320 * checking sem->owner still matches owner, if that fails,
321 * owner might point to free()d memory, if it still matches,
322 * the rcu_read_lock() ensures the memory stays valid.
323 */
324 barrier();
325
326 /*
327 * abort spinning when need_resched or owner is not running or
328 * owner's cpu is preempted.
329 */
330 if (need_resched() || !owner_on_cpu(owner)) {
331 rcu_read_unlock();
332 return false;
333 }
334
335 cpu_relax();
336 }
337 rcu_read_unlock();
338
339 /*
340 * If there is a new owner or the owner is not set, we continue
341 * spinning.
342 */
343 return is_rwsem_owner_spinnable(READ_ONCE(sem->owner));
344}
345
346static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
347{
348 bool taken = false;
349
350 preempt_disable();
351
352 /* sem->wait_lock should not be held when doing optimistic spinning */
353 if (!rwsem_can_spin_on_owner(sem))
354 goto done;
355
356 if (!osq_lock(&sem->osq))
357 goto done;
358
359 /*
360 * Optimistically spin on the owner field and attempt to acquire the
361 * lock whenever the owner changes. Spinning will be stopped when:
362 * 1) the owning writer isn't running; or
363 * 2) readers own the lock as we can't determine if they are
364 * actively running or not.
365 */
366 while (rwsem_spin_on_owner(sem)) {
367 /*
368 * Try to acquire the lock
369 */
370 if (rwsem_try_write_lock_unqueued(sem)) {
371 taken = true;
372 break;
373 }
374
375 /*
376 * When there's no owner, we might have preempted between the
377 * owner acquiring the lock and setting the owner field. If
378 * we're an RT task that will live-lock because we won't let
379 * the owner complete.
380 */
381 if (!sem->owner && (need_resched() || rt_task(current)))
382 break;
383
384 /*
385 * The cpu_relax() call is a compiler barrier which forces
386 * everything in this loop to be re-loaded. We don't need
387 * memory barriers as we'll eventually observe the right
388 * values at the cost of a few extra spins.
389 */
390 cpu_relax();
391 }
392 osq_unlock(&sem->osq);
393done:
394 preempt_enable();
395 lockevent_cond_inc(rwsem_opt_fail, !taken);
396 return taken;
397}
398
399/*
400 * Return true if the rwsem has active spinner
401 */
402static inline bool rwsem_has_spinner(struct rw_semaphore *sem)
403{
404 return osq_is_locked(&sem->osq);
405}
406
407#else
408static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
409{
410 return false;
411}
412
413static inline bool rwsem_has_spinner(struct rw_semaphore *sem)
414{
415 return false;
416}
417#endif
418
419/*
420 * Wait for the read lock to be granted
421 */
422static inline struct rw_semaphore __sched *
423__rwsem_down_read_failed_common(struct rw_semaphore *sem, int state)
424{
425 long count, adjustment = -RWSEM_ACTIVE_READ_BIAS;
426 struct rwsem_waiter waiter;
427 DEFINE_WAKE_Q(wake_q);
428
429 waiter.task = current;
430 waiter.type = RWSEM_WAITING_FOR_READ;
431
432 raw_spin_lock_irq(&sem->wait_lock);
433 if (list_empty(&sem->wait_list)) {
434 /*
435 * In case the wait queue is empty and the lock isn't owned
436 * by a writer, this reader can exit the slowpath and return
437 * immediately as its RWSEM_ACTIVE_READ_BIAS has already
438 * been set in the count.
439 */
440 if (atomic_long_read(&sem->count) >= 0) {
441 raw_spin_unlock_irq(&sem->wait_lock);
442 rwsem_set_reader_owned(sem);
443 lockevent_inc(rwsem_rlock_fast);
444 return sem;
445 }
446 adjustment += RWSEM_WAITING_BIAS;
447 }
448 list_add_tail(&waiter.list, &sem->wait_list);
449
450 /* we're now waiting on the lock, but no longer actively locking */
451 count = atomic_long_add_return(adjustment, &sem->count);
452
453 /*
454 * If there are no active locks, wake the front queued process(es).
455 *
456 * If there are no writers and we are first in the queue,
457 * wake our own waiter to join the existing active readers !
458 */
459 if (count == RWSEM_WAITING_BIAS ||
460 (count > RWSEM_WAITING_BIAS &&
461 adjustment != -RWSEM_ACTIVE_READ_BIAS))
462 __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
463
464 raw_spin_unlock_irq(&sem->wait_lock);
465 wake_up_q(&wake_q);
466
467 /* wait to be given the lock */
468 while (true) {
469 set_current_state(state);
470 if (!waiter.task)
471 break;
472 if (signal_pending_state(state, current)) {
473 raw_spin_lock_irq(&sem->wait_lock);
474 if (waiter.task)
475 goto out_nolock;
476 raw_spin_unlock_irq(&sem->wait_lock);
477 break;
478 }
479 schedule();
480 lockevent_inc(rwsem_sleep_reader);
481 }
482
483 __set_current_state(TASK_RUNNING);
484 lockevent_inc(rwsem_rlock);
485 return sem;
486out_nolock:
487 list_del(&waiter.list);
488 if (list_empty(&sem->wait_list))
489 atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count);
490 raw_spin_unlock_irq(&sem->wait_lock);
491 __set_current_state(TASK_RUNNING);
492 lockevent_inc(rwsem_rlock_fail);
493 return ERR_PTR(-EINTR);
494}
495
496__visible struct rw_semaphore * __sched
497rwsem_down_read_failed(struct rw_semaphore *sem)
498{
499 return __rwsem_down_read_failed_common(sem, TASK_UNINTERRUPTIBLE);
500}
501EXPORT_SYMBOL(rwsem_down_read_failed);
502
503__visible struct rw_semaphore * __sched
504rwsem_down_read_failed_killable(struct rw_semaphore *sem)
505{
506 return __rwsem_down_read_failed_common(sem, TASK_KILLABLE);
507}
508EXPORT_SYMBOL(rwsem_down_read_failed_killable);
509
510/*
511 * Wait until we successfully acquire the write lock
512 */
513static inline struct rw_semaphore *
514__rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)
515{
516 long count;
517 bool waiting = true; /* any queued threads before us */
518 struct rwsem_waiter waiter;
519 struct rw_semaphore *ret = sem;
520 DEFINE_WAKE_Q(wake_q);
521
522 /* undo write bias from down_write operation, stop active locking */
523 count = atomic_long_sub_return(RWSEM_ACTIVE_WRITE_BIAS, &sem->count);
524
525 /* do optimistic spinning and steal lock if possible */
526 if (rwsem_optimistic_spin(sem))
527 return sem;
528
529 /*
530 * Optimistic spinning failed, proceed to the slowpath
531 * and block until we can acquire the sem.
532 */
533 waiter.task = current;
534 waiter.type = RWSEM_WAITING_FOR_WRITE;
535
536 raw_spin_lock_irq(&sem->wait_lock);
537
538 /* account for this before adding a new element to the list */
539 if (list_empty(&sem->wait_list))
540 waiting = false;
541
542 list_add_tail(&waiter.list, &sem->wait_list);
543
544 /* we're now waiting on the lock, but no longer actively locking */
545 if (waiting) {
546 count = atomic_long_read(&sem->count);
547
548 /*
549 * If there were already threads queued before us and there are
550 * no active writers, the lock must be read owned; so we try to
551 * wake any read locks that were queued ahead of us.
552 */
553 if (count > RWSEM_WAITING_BIAS) {
554 __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q);
555 /*
556 * The wakeup is normally called _after_ the wait_lock
557 * is released, but given that we are proactively waking
558 * readers we can deal with the wake_q overhead as it is
559 * similar to releasing and taking the wait_lock again
560 * for attempting rwsem_try_write_lock().
561 */
562 wake_up_q(&wake_q);
563
564 /*
565 * Reinitialize wake_q after use.
566 */
567 wake_q_init(&wake_q);
568 }
569
570 } else
571 count = atomic_long_add_return(RWSEM_WAITING_BIAS, &sem->count);
572
573 /* wait until we successfully acquire the lock */
574 set_current_state(state);
575 while (true) {
576 if (rwsem_try_write_lock(count, sem))
577 break;
578 raw_spin_unlock_irq(&sem->wait_lock);
579
580 /* Block until there are no active lockers. */
581 do {
582 if (signal_pending_state(state, current))
583 goto out_nolock;
584
585 schedule();
586 lockevent_inc(rwsem_sleep_writer);
587 set_current_state(state);
588 } while ((count = atomic_long_read(&sem->count)) & RWSEM_ACTIVE_MASK);
589
590 raw_spin_lock_irq(&sem->wait_lock);
591 }
592 __set_current_state(TASK_RUNNING);
593 list_del(&waiter.list);
594 raw_spin_unlock_irq(&sem->wait_lock);
595 lockevent_inc(rwsem_wlock);
596
597 return ret;
598
599out_nolock:
600 __set_current_state(TASK_RUNNING);
601 raw_spin_lock_irq(&sem->wait_lock);
602 list_del(&waiter.list);
603 if (list_empty(&sem->wait_list))
604 atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count);
605 else
606 __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
607 raw_spin_unlock_irq(&sem->wait_lock);
608 wake_up_q(&wake_q);
609 lockevent_inc(rwsem_wlock_fail);
610
611 return ERR_PTR(-EINTR);
612}
613
614__visible struct rw_semaphore * __sched
615rwsem_down_write_failed(struct rw_semaphore *sem)
616{
617 return __rwsem_down_write_failed_common(sem, TASK_UNINTERRUPTIBLE);
618}
619EXPORT_SYMBOL(rwsem_down_write_failed);
620
621__visible struct rw_semaphore * __sched
622rwsem_down_write_failed_killable(struct rw_semaphore *sem)
623{
624 return __rwsem_down_write_failed_common(sem, TASK_KILLABLE);
625}
626EXPORT_SYMBOL(rwsem_down_write_failed_killable);
627
628/*
629 * handle waking up a waiter on the semaphore
630 * - up_read/up_write has decremented the active part of count if we come here
631 */
632__visible
633struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
634{
635 unsigned long flags;
636 DEFINE_WAKE_Q(wake_q);
637
638 /*
639 * __rwsem_down_write_failed_common(sem)
640 * rwsem_optimistic_spin(sem)
641 * osq_unlock(sem->osq)
642 * ...
643 * atomic_long_add_return(&sem->count)
644 *
645 * - VS -
646 *
647 * __up_write()
648 * if (atomic_long_sub_return_release(&sem->count) < 0)
649 * rwsem_wake(sem)
650 * osq_is_locked(&sem->osq)
651 *
652 * And __up_write() must observe !osq_is_locked() when it observes the
653 * atomic_long_add_return() in order to not miss a wakeup.
654 *
655 * This boils down to:
656 *
657 * [S.rel] X = 1 [RmW] r0 = (Y += 0)
658 * MB RMB
659 * [RmW] Y += 1 [L] r1 = X
660 *
661 * exists (r0=1 /\ r1=0)
662 */
663 smp_rmb();
664
665 /*
666 * If a spinner is present, it is not necessary to do the wakeup.
667 * Try to do wakeup only if the trylock succeeds to minimize
668 * spinlock contention which may introduce too much delay in the
669 * unlock operation.
670 *
671 * spinning writer up_write/up_read caller
672 * --------------- -----------------------
673 * [S] osq_unlock() [L] osq
674 * MB RMB
675 * [RmW] rwsem_try_write_lock() [RmW] spin_trylock(wait_lock)
676 *
677 * Here, it is important to make sure that there won't be a missed
678 * wakeup while the rwsem is free and the only spinning writer goes
679 * to sleep without taking the rwsem. Even when the spinning writer
680 * is just going to break out of the waiting loop, it will still do
681 * a trylock in rwsem_down_write_failed() before sleeping. IOW, if
682 * rwsem_has_spinner() is true, it will guarantee at least one
683 * trylock attempt on the rwsem later on.
684 */
685 if (rwsem_has_spinner(sem)) {
686 /*
687 * The smp_rmb() here is to make sure that the spinner
688 * state is consulted before reading the wait_lock.
689 */
690 smp_rmb();
691 if (!raw_spin_trylock_irqsave(&sem->wait_lock, flags))
692 return sem;
693 goto locked;
694 }
695 raw_spin_lock_irqsave(&sem->wait_lock, flags);
696locked:
697
698 if (!list_empty(&sem->wait_list))
699 __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
700
701 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
702 wake_up_q(&wake_q);
703
704 return sem;
705}
706EXPORT_SYMBOL(rwsem_wake);
707
708/*
709 * downgrade a write lock into a read lock
710 * - caller incremented waiting part of count and discovered it still negative
711 * - just wake up any readers at the front of the queue
712 */
713__visible
714struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
715{
716 unsigned long flags;
717 DEFINE_WAKE_Q(wake_q);
718
719 raw_spin_lock_irqsave(&sem->wait_lock, flags);
720
721 if (!list_empty(&sem->wait_list))
722 __rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q);
723
724 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
725 wake_up_q(&wake_q);
726
727 return sem;
728}
729EXPORT_SYMBOL(rwsem_downgrade_wake);
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index ccbf18f560ff..37524a47f002 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -3,17 +3,1438 @@
3 * 3 *
4 * Written by David Howells (dhowells@redhat.com). 4 * Written by David Howells (dhowells@redhat.com).
5 * Derived from asm-i386/semaphore.h 5 * Derived from asm-i386/semaphore.h
6 *
7 * Writer lock-stealing by Alex Shi <alex.shi@intel.com>
8 * and Michel Lespinasse <walken@google.com>
9 *
10 * Optimistic spinning by Tim Chen <tim.c.chen@intel.com>
11 * and Davidlohr Bueso <davidlohr@hp.com>. Based on mutexes.
12 *
13 * Rwsem count bit fields re-definition and rwsem rearchitecture by
14 * Waiman Long <longman@redhat.com> and
15 * Peter Zijlstra <peterz@infradead.org>.
6 */ 16 */
7 17
8#include <linux/types.h> 18#include <linux/types.h>
9#include <linux/kernel.h> 19#include <linux/kernel.h>
10#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/sched/rt.h>
22#include <linux/sched/task.h>
11#include <linux/sched/debug.h> 23#include <linux/sched/debug.h>
24#include <linux/sched/wake_q.h>
25#include <linux/sched/signal.h>
26#include <linux/sched/clock.h>
12#include <linux/export.h> 27#include <linux/export.h>
13#include <linux/rwsem.h> 28#include <linux/rwsem.h>
14#include <linux/atomic.h> 29#include <linux/atomic.h>
15 30
16#include "rwsem.h" 31#include "rwsem.h"
32#include "lock_events.h"
33
34/*
35 * The least significant 3 bits of the owner value have the following
36 * meanings when set.
37 * - Bit 0: RWSEM_READER_OWNED - The rwsem is owned by readers
38 * - Bit 1: RWSEM_RD_NONSPINNABLE - Readers cannot spin on this lock.
39 * - Bit 2: RWSEM_WR_NONSPINNABLE - Writers cannot spin on this lock.
40 *
41 * When the rwsem is either owned by an anonymous writer, or it is
42 * reader-owned, but a spinning writer has timed out, both nonspinnable
43 * bits will be set to disable optimistic spinning by readers and writers.
44 * In the latter case, the last unlocking reader should then check the
45 * writer nonspinnable bit and clear it only to give writers preference
46 * to acquire the lock via optimistic spinning, but not readers. Similar
47 * action is also done in the reader slowpath.
48 *
49 * When a writer acquires a rwsem, it puts its task_struct pointer
50 * into the owner field. It is cleared after an unlock.
51 *
52 * When a reader acquires a rwsem, it will also put its task_struct
53 * pointer into the owner field with the RWSEM_READER_OWNED bit set.
54 * On unlock, the owner field will largely be left untouched. So
55 * for a free or reader-owned rwsem, the owner value may contain
56 * information about the last reader that acquires the rwsem.
57 *
58 * That information may be helpful in debugging cases where the system
59 * seems to hang on a reader owned rwsem especially if only one reader
60 * is involved. Ideally we would like to track all the readers that own
61 * a rwsem, but the overhead is simply too big.
62 *
63 * Reader optimistic spinning is helpful when the reader critical section
64 * is short and there aren't that many readers around. It makes readers
65 * relatively more preferred than writers. When a writer times out spinning
66 * on a reader-owned lock and sets the nonspinnable bits, there are two main
67 * reasons for that.
68 *
69 * 1) The reader critical section is long, perhaps the task sleeps after
70 * acquiring the read lock.
71 * 2) There are just too many readers contending the lock causing it to
72 * take a while to service all of them.
73 *
74 * In the former case, a long reader critical section will impede the progress
75 * of writers, which is usually more important for system performance. In
76 * the latter case, reader optimistic spinning tends to make the reader
77 * groups that contain readers that acquire the lock together smaller
78 * leading to more of them. That may hurt performance in some cases. In
79 * other words, the setting of nonspinnable bits indicates that reader
80 * optimistic spinning may not be helpful for those workloads that cause
81 * it.
82 *
83 * Therefore, any writers that had observed the setting of the writer
84 * nonspinnable bit for a given rwsem after they fail to acquire the lock
85 * via optimistic spinning will set the reader nonspinnable bit once they
86 * acquire the write lock. Similarly, readers that observe the setting
87 * of reader nonspinnable bit at slowpath entry will set the reader
88 * nonspinnable bits when they acquire the read lock via the wakeup path.
89 *
90 * Once the reader nonspinnable bit is on, it will only be reset when
91 * a writer is able to acquire the rwsem in the fast path or somehow a
92 * reader or writer in the slowpath doesn't observe the nonspinnable bit.
93 *
94 * This is to discourage reader optimistic spinning on that particular
95 * rwsem and make writers more preferred. This adaptive disabling of reader
96 * optimistic spinning will alleviate the negative side effect of this
97 * feature.
98 */
99#define RWSEM_READER_OWNED (1UL << 0)
100#define RWSEM_RD_NONSPINNABLE (1UL << 1)
101#define RWSEM_WR_NONSPINNABLE (1UL << 2)
102#define RWSEM_NONSPINNABLE (RWSEM_RD_NONSPINNABLE | RWSEM_WR_NONSPINNABLE)
103#define RWSEM_OWNER_FLAGS_MASK (RWSEM_READER_OWNED | RWSEM_NONSPINNABLE)
104
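
A minimal standalone sketch of how these flag bits share the owner word with a task_struct pointer, mirroring the decode done by rwsem_owner_flags() further down; the struct task_struct here is only an aligned stand-in for illustration, not the kernel's.

#include <assert.h>
#include <stdio.h>

#define RWSEM_READER_OWNED	(1UL << 0)
#define RWSEM_RD_NONSPINNABLE	(1UL << 1)
#define RWSEM_WR_NONSPINNABLE	(1UL << 2)
#define RWSEM_NONSPINNABLE	(RWSEM_RD_NONSPINNABLE | RWSEM_WR_NONSPINNABLE)
#define RWSEM_OWNER_FLAGS_MASK	(RWSEM_READER_OWNED | RWSEM_NONSPINNABLE)

/* Stand-in; alignment >= 8 keeps the low three bits of its address free. */
struct task_struct { char comm[16]; };

int main(void)
{
	static _Alignas(8) struct task_struct reader = { "reader" };
	/* A reader-owned rwsem on which writers may no longer spin: */
	unsigned long owner = (unsigned long)&reader |
			      RWSEM_READER_OWNED | RWSEM_WR_NONSPINNABLE;

	/* Decode the word the same way rwsem_owner_flags() does. */
	unsigned long flags = owner & RWSEM_OWNER_FLAGS_MASK;
	struct task_struct *task =
		(struct task_struct *)(owner & ~RWSEM_OWNER_FLAGS_MASK);

	assert(task == &reader);
	assert(flags & RWSEM_READER_OWNED);
	assert(!(flags & RWSEM_RD_NONSPINNABLE));
	printf("task %p, flags %#lx\n", (void *)task, flags);
	return 0;
}

Because the task structure is sufficiently aligned that its low bits are zero, the three flag bits never collide with the pointer, which is what rwsem_owner() and rwsem_owner_flags() rely on.
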
105#ifdef CONFIG_DEBUG_RWSEMS
106# define DEBUG_RWSEMS_WARN_ON(c, sem) do { \
107 if (!debug_locks_silent && \
108 WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\
109 #c, atomic_long_read(&(sem)->count), \
110 atomic_long_read(&(sem)->owner), (long)current, \
111 list_empty(&(sem)->wait_list) ? "" : "not ")) \
112 debug_locks_off(); \
113 } while (0)
114#else
115# define DEBUG_RWSEMS_WARN_ON(c, sem)
116#endif
117
118/*
119 * On 64-bit architectures, the bit definitions of the count are:
120 *
121 * Bit 0 - writer locked bit
122 * Bit 1 - waiters present bit
123 * Bit 2 - lock handoff bit
124 * Bits 3-7 - reserved
125 * Bits 8-62 - 55-bit reader count
126 * Bit 63 - read fail bit
127 *
128 * On 32-bit architectures, the bit definitions of the count are:
129 *
130 * Bit 0 - writer locked bit
131 * Bit 1 - waiters present bit
132 * Bit 2 - lock handoff bit
133 * Bits 3-7 - reserved
134 * Bits 8-30 - 23-bit reader count
135 * Bit 31 - read fail bit
136 *
137 * It is not likely that the most significant bit (read fail bit) will ever
138 * be set. This guard bit is still checked anyway in the down_read() fastpath
139 * just in case we need to use up more of the reader bits for other purpose
140 * in the future.
141 *
142 * atomic_long_fetch_add() is used to obtain reader lock, whereas
143 * atomic_long_cmpxchg() will be used to obtain writer lock.
144 *
145 * There are three places where the lock handoff bit may be set or cleared.
146 * 1) rwsem_mark_wake() for readers.
147 * 2) rwsem_try_write_lock() for writers.
148 * 3) Error path of rwsem_down_write_slowpath().
149 *
150 * For all the above cases, wait_lock will be held. A writer must also
151 * be the first one in the wait_list to be eligible for setting the handoff
152 * bit. So concurrent setting/clearing of handoff bit is not possible.
153 */
154#define RWSEM_WRITER_LOCKED (1UL << 0)
155#define RWSEM_FLAG_WAITERS (1UL << 1)
156#define RWSEM_FLAG_HANDOFF (1UL << 2)
157#define RWSEM_FLAG_READFAIL (1UL << (BITS_PER_LONG - 1))
158
159#define RWSEM_READER_SHIFT 8
160#define RWSEM_READER_BIAS (1UL << RWSEM_READER_SHIFT)
161#define RWSEM_READER_MASK (~(RWSEM_READER_BIAS - 1))
162#define RWSEM_WRITER_MASK RWSEM_WRITER_LOCKED
163#define RWSEM_LOCK_MASK (RWSEM_WRITER_MASK|RWSEM_READER_MASK)
164#define RWSEM_READ_FAILED_MASK (RWSEM_WRITER_MASK|RWSEM_FLAG_WAITERS|\
165 RWSEM_FLAG_HANDOFF|RWSEM_FLAG_READFAIL)
166
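
A few representative count words built from just the definitions above (standalone illustration; readers are counted from bit 8 and the flags occupy bits 0-2, as the comment describes):

#include <stdio.h>

#define RWSEM_WRITER_LOCKED	(1UL << 0)
#define RWSEM_FLAG_WAITERS	(1UL << 1)
#define RWSEM_FLAG_HANDOFF	(1UL << 2)
#define RWSEM_READER_SHIFT	8
#define RWSEM_READER_BIAS	(1UL << RWSEM_READER_SHIFT)

int main(void)
{
	/* One writer holding the lock, waiters queued: 0x3 */
	printf("%#lx\n", RWSEM_WRITER_LOCKED | RWSEM_FLAG_WAITERS);
	/* Three readers, no waiters: 0x300 */
	printf("%#lx\n", 3 * RWSEM_READER_BIAS);
	/* One reader, waiters queued, handoff requested: 0x106 */
	printf("%#lx\n", RWSEM_READER_BIAS | RWSEM_FLAG_WAITERS | RWSEM_FLAG_HANDOFF);
	return 0;
}

Readers add or subtract RWSEM_READER_BIAS, so the reader count occupies the bits above RWSEM_READER_SHIFT while the low flag bits can still be tested independently.
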
167/*
168 * All writes to owner are protected by WRITE_ONCE() to make sure that
169 * store tearing can't happen as optimistic spinners may read and use
170 * the owner value concurrently without lock. Read from owner, however,
171 * may not need READ_ONCE() as long as the pointer value is only used
172 * for comparison and isn't being dereferenced.
173 */
174static inline void rwsem_set_owner(struct rw_semaphore *sem)
175{
176 atomic_long_set(&sem->owner, (long)current);
177}
178
179static inline void rwsem_clear_owner(struct rw_semaphore *sem)
180{
181 atomic_long_set(&sem->owner, 0);
182}
183
184/*
185 * Test the flags in the owner field.
186 */
187static inline bool rwsem_test_oflags(struct rw_semaphore *sem, long flags)
188{
189 return atomic_long_read(&sem->owner) & flags;
190}
191
192/*
193 * The task_struct pointer of the last owning reader will be left in
194 * the owner field.
195 *
196 * Note that the owner value just indicates the task has owned the rwsem
197 * previously, it may not be the real owner or one of the real owners
198 * anymore when that field is examined, so take it with a grain of salt.
199 *
200 * The reader non-spinnable bit is preserved.
201 */
202static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem,
203 struct task_struct *owner)
204{
205 unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED |
206 (atomic_long_read(&sem->owner) & RWSEM_RD_NONSPINNABLE);
207
208 atomic_long_set(&sem->owner, val);
209}
210
211static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
212{
213 __rwsem_set_reader_owned(sem, current);
214}
215
216/*
217 * Return true if the rwsem is owned by a reader.
218 */
219static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem)
220{
221#ifdef CONFIG_DEBUG_RWSEMS
222 /*
223 * Check the count to see if it is write-locked.
224 */
225 long count = atomic_long_read(&sem->count);
226
227 if (count & RWSEM_WRITER_MASK)
228 return false;
229#endif
230 return rwsem_test_oflags(sem, RWSEM_READER_OWNED);
231}
232
233#ifdef CONFIG_DEBUG_RWSEMS
234/*
235 * With CONFIG_DEBUG_RWSEMS configured, it will make sure that if there
236 * is a task pointer in owner of a reader-owned rwsem, it will be the
237 * real owner or one of the real owners. The only exception is when the
238 * unlock is done by up_read_non_owner().
239 */
240static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
241{
242 unsigned long val = atomic_long_read(&sem->owner);
243
244 while ((val & ~RWSEM_OWNER_FLAGS_MASK) == (unsigned long)current) {
245 if (atomic_long_try_cmpxchg(&sem->owner, &val,
246 val & RWSEM_OWNER_FLAGS_MASK))
247 return;
248 }
249}
250#else
251static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
252{
253}
254#endif
255
256/*
257 * Set the RWSEM_NONSPINNABLE bits if the RWSEM_READER_OWNED flag
258 * remains set. Otherwise, the operation will be aborted.
259 */
260static inline void rwsem_set_nonspinnable(struct rw_semaphore *sem)
261{
262 unsigned long owner = atomic_long_read(&sem->owner);
263
264 do {
265 if (!(owner & RWSEM_READER_OWNED))
266 break;
267 if (owner & RWSEM_NONSPINNABLE)
268 break;
269 } while (!atomic_long_try_cmpxchg(&sem->owner, &owner,
270 owner | RWSEM_NONSPINNABLE));
271}
272
273static inline bool rwsem_read_trylock(struct rw_semaphore *sem)
274{
275 long cnt = atomic_long_add_return_acquire(RWSEM_READER_BIAS, &sem->count);
276 if (WARN_ON_ONCE(cnt < 0))
277 rwsem_set_nonspinnable(sem);
278 return !(cnt & RWSEM_READ_FAILED_MASK);
279}
280
281/*
282 * Return just the real task structure pointer of the owner
283 */
284static inline struct task_struct *rwsem_owner(struct rw_semaphore *sem)
285{
286 return (struct task_struct *)
287 (atomic_long_read(&sem->owner) & ~RWSEM_OWNER_FLAGS_MASK);
288}
289
290/*
291 * Return the real task structure pointer of the owner and the embedded
292 * flags in the owner. pflags must be non-NULL.
293 */
294static inline struct task_struct *
295rwsem_owner_flags(struct rw_semaphore *sem, unsigned long *pflags)
296{
297 unsigned long owner = atomic_long_read(&sem->owner);
298
299 *pflags = owner & RWSEM_OWNER_FLAGS_MASK;
300 return (struct task_struct *)(owner & ~RWSEM_OWNER_FLAGS_MASK);
301}
302
303/*
304 * Guide to the rw_semaphore's count field.
305 *
306 * When the RWSEM_WRITER_LOCKED bit in count is set, the lock is owned
307 * by a writer.
308 *
309 * The lock is owned by readers when
310 * (1) the RWSEM_WRITER_LOCKED isn't set in count,
311 * (2) some of the reader bits are set in count, and
312 * (3) the owner field has the RWSEM_READER_OWNED bit set.
313 *
314 * Having some reader bits set is not enough to guarantee a reader-owned
315 * lock as the readers may be in the process of backing out from the count
316 * and a writer has just released the lock. So another writer may steal
317 * the lock immediately after that.
318 */
319
320/*
321 * Initialize an rwsem:
322 */
323void __init_rwsem(struct rw_semaphore *sem, const char *name,
324 struct lock_class_key *key)
325{
326#ifdef CONFIG_DEBUG_LOCK_ALLOC
327 /*
328 * Make sure we are not reinitializing a held semaphore:
329 */
330 debug_check_no_locks_freed((void *)sem, sizeof(*sem));
331 lockdep_init_map(&sem->dep_map, name, key, 0);
332#endif
333 atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE);
334 raw_spin_lock_init(&sem->wait_lock);
335 INIT_LIST_HEAD(&sem->wait_list);
336 atomic_long_set(&sem->owner, 0L);
337#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
338 osq_lock_init(&sem->osq);
339#endif
340}
341EXPORT_SYMBOL(__init_rwsem);
342
343enum rwsem_waiter_type {
344 RWSEM_WAITING_FOR_WRITE,
345 RWSEM_WAITING_FOR_READ
346};
347
348struct rwsem_waiter {
349 struct list_head list;
350 struct task_struct *task;
351 enum rwsem_waiter_type type;
352 unsigned long timeout;
353 unsigned long last_rowner;
354};
355#define rwsem_first_waiter(sem) \
356 list_first_entry(&sem->wait_list, struct rwsem_waiter, list)
357
358enum rwsem_wake_type {
359 RWSEM_WAKE_ANY, /* Wake whatever's at head of wait list */
360 RWSEM_WAKE_READERS, /* Wake readers only */
361 RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */
362};
363
364enum writer_wait_state {
365 WRITER_NOT_FIRST, /* Writer is not first in wait list */
366 WRITER_FIRST, /* Writer is first in wait list */
367 WRITER_HANDOFF /* Writer is first & handoff needed */
368};
369
370/*
371 * The typical HZ value is either 250 or 1000. So set the minimum waiting
372 * time to at least 4ms or 1 jiffy (if it is higher than 4ms) in the wait
373 * queue before initiating the handoff protocol.
374 */
375#define RWSEM_WAIT_TIMEOUT DIV_ROUND_UP(HZ, 250)
376
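
A quick check of what RWSEM_WAIT_TIMEOUT works out to for common HZ choices, using the usual DIV_ROUND_UP() definition (reproduced here so the example is standalone; it is not defined in this file):

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	int hz[] = { 100, 250, 300, 1000 };

	for (int i = 0; i < 4; i++) {
		int timeout = DIV_ROUND_UP(hz[i], 250);	/* RWSEM_WAIT_TIMEOUT */
		/* one jiffy is 1000/HZ milliseconds */
		printf("HZ=%4d -> %d jiffies (%d ms)\n",
		       hz[i], timeout, timeout * 1000 / hz[i]);
	}
	return 0;
}

For HZ=250 and HZ=1000 this gives 1 and 4 jiffies respectively, i.e. 4 ms in both cases, and a single 10 ms jiffy for HZ=100, consistent with the "at least 4ms or 1 jiffy" rule stated above.
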
377/*
378 * Magic number to batch-wakeup waiting readers, even when writers are
379 * also present in the queue. This both limits the amount of work the
380 * waking thread must do and also prevents any potential counter overflow,
381 * however unlikely.
382 */
383#define MAX_READERS_WAKEUP 0x100
384
385/*
386 * handle the lock release when processes blocked on it that can now run
387 * - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must
388 * have been set.
389 * - there must be someone on the queue
390 * - the wait_lock must be held by the caller
391 * - tasks are marked for wakeup, the caller must later invoke wake_up_q()
392 * to actually wakeup the blocked task(s) and drop the reference count,
393 * preferably when the wait_lock is released
394 * - woken process blocks are discarded from the list after having task zeroed
395 * - writers are only marked woken if downgrading is false
396 */
397static void rwsem_mark_wake(struct rw_semaphore *sem,
398 enum rwsem_wake_type wake_type,
399 struct wake_q_head *wake_q)
400{
401 struct rwsem_waiter *waiter, *tmp;
402 long oldcount, woken = 0, adjustment = 0;
403 struct list_head wlist;
404
405 lockdep_assert_held(&sem->wait_lock);
406
407 /*
408 * Take a peek at the queue head waiter such that we can determine
409 * the wakeup(s) to perform.
410 */
411 waiter = rwsem_first_waiter(sem);
412
413 if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
414 if (wake_type == RWSEM_WAKE_ANY) {
415 /*
416 * Mark writer at the front of the queue for wakeup.
417 * Until the task is actually awoken later by
418 * the caller, other writers are able to steal it.
419 * Readers, on the other hand, will block as they
420 * will notice the queued writer.
421 */
422 wake_q_add(wake_q, waiter->task);
423 lockevent_inc(rwsem_wake_writer);
424 }
425
426 return;
427 }
428
429 /*
430 * No reader wakeup if there are too many of them already.
431 */
432 if (unlikely(atomic_long_read(&sem->count) < 0))
433 return;
434
435 /*
436 * Writers might steal the lock before we grant it to the next reader.
437 * We prefer to do the first reader grant before counting readers
438 * so we can bail out early if a writer stole the lock.
439 */
440 if (wake_type != RWSEM_WAKE_READ_OWNED) {
441 struct task_struct *owner;
442
443 adjustment = RWSEM_READER_BIAS;
444 oldcount = atomic_long_fetch_add(adjustment, &sem->count);
445 if (unlikely(oldcount & RWSEM_WRITER_MASK)) {
446 /*
447 * When we've been waiting "too" long (for writers
448 * to give up the lock), request a HANDOFF to
449 * force the issue.
450 */
451 if (!(oldcount & RWSEM_FLAG_HANDOFF) &&
452 time_after(jiffies, waiter->timeout)) {
453 adjustment -= RWSEM_FLAG_HANDOFF;
454 lockevent_inc(rwsem_rlock_handoff);
455 }
456
457 atomic_long_add(-adjustment, &sem->count);
458 return;
459 }
460 /*
461 * Set it to reader-owned to give spinners an early
462 * indication that readers now have the lock.
463 * The reader nonspinnable bit seen at slowpath entry of
464 * the reader is copied over.
465 */
466 owner = waiter->task;
467 if (waiter->last_rowner & RWSEM_RD_NONSPINNABLE) {
468 owner = (void *)((unsigned long)owner | RWSEM_RD_NONSPINNABLE);
469 lockevent_inc(rwsem_opt_norspin);
470 }
471 __rwsem_set_reader_owned(sem, owner);
472 }
473
474 /*
475 * Grant up to MAX_READERS_WAKEUP read locks to all the readers in the
476 * queue. We know that the woken will be at least 1 as we accounted
477 * for above. Note we increment the 'active part' of the count by the
478 * number of readers before waking any processes up.
479 *
480 * This is an adaptation of the phase-fair R/W locks where at the
481 * reader phase (first waiter is a reader), all readers are eligible
482 * to acquire the lock at the same time irrespective of their order
483 * in the queue. The writers acquire the lock according to their
484 * order in the queue.
485 *
486 * We have to do wakeup in 2 passes to prevent the possibility that
487 * the reader count may be decremented before it is incremented. It
488 * is because the to-be-woken waiter may not have slept yet. So it
489 * may see waiter->task got cleared, finish its critical section and
490 * do an unlock before the reader count increment.
491 *
492 * 1) Collect the read-waiters in a separate list, count them and
493 * fully increment the reader count in rwsem.
494 * 2) For each waiter in the new list, clear waiter->task and
495 * put them into wake_q to be woken up later.
496 */
497 INIT_LIST_HEAD(&wlist);
498 list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) {
499 if (waiter->type == RWSEM_WAITING_FOR_WRITE)
500 continue;
501
502 woken++;
503 list_move_tail(&waiter->list, &wlist);
504
505 /*
506 * Limit # of readers that can be woken up per wakeup call.
507 */
508 if (woken >= MAX_READERS_WAKEUP)
509 break;
510 }
511
512 adjustment = woken * RWSEM_READER_BIAS - adjustment;
513 lockevent_cond_inc(rwsem_wake_reader, woken);
514 if (list_empty(&sem->wait_list)) {
515 /* hit end of list above */
516 adjustment -= RWSEM_FLAG_WAITERS;
517 }
518
519 /*
520 * When we've woken a reader, we no longer need to force writers
521 * to give up the lock and we can clear HANDOFF.
522 */
523 if (woken && (atomic_long_read(&sem->count) & RWSEM_FLAG_HANDOFF))
524 adjustment -= RWSEM_FLAG_HANDOFF;
525
526 if (adjustment)
527 atomic_long_add(adjustment, &sem->count);
528
529 /* 2nd pass */
530 list_for_each_entry_safe(waiter, tmp, &wlist, list) {
531 struct task_struct *tsk;
532
533 tsk = waiter->task;
534 get_task_struct(tsk);
535
536 /*
537 * Ensure calling get_task_struct() before setting the reader
538 * waiter to nil such that rwsem_down_read_slowpath() cannot
539 * race with do_exit() by always holding a reference count
540 * to the task to wakeup.
541 */
542 smp_store_release(&waiter->task, NULL);
543 /*
544 * Ensure issuing the wakeup (either by us or someone else)
545 * after setting the reader waiter to nil.
546 */
547 wake_q_add_safe(wake_q, tsk);
548 }
549}
550
551/*
552 * This function must be called with the sem->wait_lock held to prevent
553 * race conditions between checking the rwsem wait list and setting the
554 * sem->count accordingly.
555 *
556 * If wstate is WRITER_HANDOFF, it will make sure that either the handoff
557 * bit is set or the lock is acquired with handoff bit cleared.
558 */
559static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
560 enum writer_wait_state wstate)
561{
562 long count, new;
563
564 lockdep_assert_held(&sem->wait_lock);
565
566 count = atomic_long_read(&sem->count);
567 do {
568 bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF);
569
570 if (has_handoff && wstate == WRITER_NOT_FIRST)
571 return false;
572
573 new = count;
574
575 if (count & RWSEM_LOCK_MASK) {
576 if (has_handoff || (wstate != WRITER_HANDOFF))
577 return false;
578
579 new |= RWSEM_FLAG_HANDOFF;
580 } else {
581 new |= RWSEM_WRITER_LOCKED;
582 new &= ~RWSEM_FLAG_HANDOFF;
583
584 if (list_is_singular(&sem->wait_list))
585 new &= ~RWSEM_FLAG_WAITERS;
586 }
587 } while (!atomic_long_try_cmpxchg_acquire(&sem->count, &count, new));
588
589 /*
590 * We have either acquired the lock with handoff bit cleared or
591 * set the handoff bit.
592 */
593 if (new & RWSEM_FLAG_HANDOFF)
594 return false;
595
596 rwsem_set_owner(sem);
597 return true;
598}
599
600#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
601/*
602 * Try to acquire read lock before the reader is put on wait queue.
603 * Lock acquisition isn't allowed if the rwsem is locked or a writer handoff
604 * is ongoing.
605 */
606static inline bool rwsem_try_read_lock_unqueued(struct rw_semaphore *sem)
607{
608 long count = atomic_long_read(&sem->count);
609
610 if (count & (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))
611 return false;
612
613 count = atomic_long_fetch_add_acquire(RWSEM_READER_BIAS, &sem->count);
614 if (!(count & (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) {
615 rwsem_set_reader_owned(sem);
616 lockevent_inc(rwsem_opt_rlock);
617 return true;
618 }
619
620 /* Back out the change */
621 atomic_long_add(-RWSEM_READER_BIAS, &sem->count);
622 return false;
623}
624
625/*
626 * Try to acquire write lock before the writer has been put on wait queue.
627 */
628static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
629{
630 long count = atomic_long_read(&sem->count);
631
632 while (!(count & (RWSEM_LOCK_MASK|RWSEM_FLAG_HANDOFF))) {
633 if (atomic_long_try_cmpxchg_acquire(&sem->count, &count,
634 count | RWSEM_WRITER_LOCKED)) {
635 rwsem_set_owner(sem);
636 lockevent_inc(rwsem_opt_wlock);
637 return true;
638 }
639 }
640 return false;
641}
642
643static inline bool owner_on_cpu(struct task_struct *owner)
644{
645 /*
646	 * Due to the lock holder preemption issue, we skip spinning if the
647	 * task is not running on a CPU or its CPU has been preempted.
648 */
649 return owner->on_cpu && !vcpu_is_preempted(task_cpu(owner));
650}
651
652static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem,
653 unsigned long nonspinnable)
654{
655 struct task_struct *owner;
656 unsigned long flags;
657 bool ret = true;
658
659 BUILD_BUG_ON(!(RWSEM_OWNER_UNKNOWN & RWSEM_NONSPINNABLE));
660
661 if (need_resched()) {
662 lockevent_inc(rwsem_opt_fail);
663 return false;
664 }
665
666 preempt_disable();
667 rcu_read_lock();
668 owner = rwsem_owner_flags(sem, &flags);
669 if ((flags & nonspinnable) || (owner && !owner_on_cpu(owner)))
670 ret = false;
671 rcu_read_unlock();
672 preempt_enable();
673
674 lockevent_cond_inc(rwsem_opt_fail, !ret);
675 return ret;
676}
677
678/*
679 * The rwsem_spin_on_owner() function returns the following 4 values
680 * depending on the lock owner state.
681 * OWNER_NULL : owner is currently NULL
682 * OWNER_WRITER: when owner changes and is a writer
683 * OWNER_READER: when owner changes and the new owner may be a reader.
684 * OWNER_NONSPINNABLE:
685 * when optimistic spinning has to stop because either the
686 * owner stops running, is unknown, or its timeslice has
687 * been used up.
688 */
689enum owner_state {
690 OWNER_NULL = 1 << 0,
691 OWNER_WRITER = 1 << 1,
692 OWNER_READER = 1 << 2,
693 OWNER_NONSPINNABLE = 1 << 3,
694};
695#define OWNER_SPINNABLE (OWNER_NULL | OWNER_WRITER | OWNER_READER)
696
697static inline enum owner_state
698rwsem_owner_state(struct task_struct *owner, unsigned long flags, unsigned long nonspinnable)
699{
700 if (flags & nonspinnable)
701 return OWNER_NONSPINNABLE;
702
703 if (flags & RWSEM_READER_OWNED)
704 return OWNER_READER;
705
706 return owner ? OWNER_WRITER : OWNER_NULL;
707}
708
709static noinline enum owner_state
710rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable)
711{
712 struct task_struct *new, *owner;
713 unsigned long flags, new_flags;
714 enum owner_state state;
715
716 owner = rwsem_owner_flags(sem, &flags);
717 state = rwsem_owner_state(owner, flags, nonspinnable);
718 if (state != OWNER_WRITER)
719 return state;
720
721 rcu_read_lock();
722 for (;;) {
723 if (atomic_long_read(&sem->count) & RWSEM_FLAG_HANDOFF) {
724 state = OWNER_NONSPINNABLE;
725 break;
726 }
727
728 new = rwsem_owner_flags(sem, &new_flags);
729 if ((new != owner) || (new_flags != flags)) {
730 state = rwsem_owner_state(new, new_flags, nonspinnable);
731 break;
732 }
733
734 /*
735		 * Ensure we emit the owner->on_cpu dereference _after_
736		 * checking that sem->owner still matches owner. If that fails,
737		 * owner might point to free()d memory; if it still matches,
738		 * the rcu_read_lock() ensures the memory stays valid.
739 */
740 barrier();
741
742 if (need_resched() || !owner_on_cpu(owner)) {
743 state = OWNER_NONSPINNABLE;
744 break;
745 }
746
747 cpu_relax();
748 }
749 rcu_read_unlock();
750
751 return state;
752}
753
754/*
755 * Calculate reader-owned rwsem spinning threshold for writer
756 *
757 * The more readers own the rwsem, the longer it will take for them to
758 * wind down and free the rwsem. So the empirical formula used to
759 * determine the actual spinning time limit here is:
760 *
761 * Spinning threshold = (10 + nr_readers/2)us
762 *
763 * The limit is capped to a maximum of 25us (30 readers). This is just
764 * a heuristic and is subject to change in the future.
765 */
766static inline u64 rwsem_rspin_threshold(struct rw_semaphore *sem)
767{
768 long count = atomic_long_read(&sem->count);
769 int readers = count >> RWSEM_READER_SHIFT;
770 u64 delta;
771
772 if (readers > 30)
773 readers = 30;
774 delta = (20 + readers) * NSEC_PER_USEC / 2;
775
776 return sched_clock() + delta;
777}
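As a quick check of the formula in the comment above, here is a standalone userspace sketch (not part of the patch) that mirrors the delta computation, assuming NSEC_PER_USEC == 1000 as in the kernel:

#include <stdio.h>

#define NSEC_PER_USEC 1000ULL		/* assumed, matches the kernel value */

/* Mirror of the threshold arithmetic: (20 + readers) / 2 us == (10 + readers/2) us */
static unsigned long long rspin_delta_ns(int readers)
{
	if (readers > 30)		/* cap: 30 or more readers -> 25 us */
		readers = 30;
	return (20 + readers) * NSEC_PER_USEC / 2;
}

int main(void)
{
	int r;

	for (r = 1; r <= 40; r += 13)	/* 1, 14, 27, 40 readers */
		printf("%2d readers -> %llu ns\n", r, rspin_delta_ns(r));
	return 0;			/* prints 10500, 17000, 23500, 25000 */
}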
778
779static bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock)
780{
781 bool taken = false;
782 int prev_owner_state = OWNER_NULL;
783 int loop = 0;
784 u64 rspin_threshold = 0;
785 unsigned long nonspinnable = wlock ? RWSEM_WR_NONSPINNABLE
786 : RWSEM_RD_NONSPINNABLE;
787
788 preempt_disable();
789
790 /* sem->wait_lock should not be held when doing optimistic spinning */
791 if (!osq_lock(&sem->osq))
792 goto done;
793
794 /*
795 * Optimistically spin on the owner field and attempt to acquire the
796 * lock whenever the owner changes. Spinning will be stopped when:
797 * 1) the owning writer isn't running; or
798 * 2) readers own the lock and spinning time has exceeded limit.
799 */
800 for (;;) {
801 enum owner_state owner_state;
802
803 owner_state = rwsem_spin_on_owner(sem, nonspinnable);
804 if (!(owner_state & OWNER_SPINNABLE))
805 break;
806
807 /*
808 * Try to acquire the lock
809 */
810 taken = wlock ? rwsem_try_write_lock_unqueued(sem)
811 : rwsem_try_read_lock_unqueued(sem);
812
813 if (taken)
814 break;
815
816 /*
817 * Time-based reader-owned rwsem optimistic spinning
818 */
819 if (wlock && (owner_state == OWNER_READER)) {
820 /*
821			 * Re-initialize rspin_threshold every time the
822			 * owner state changes from non-reader to reader.
823 * This allows a writer to steal the lock in between
824 * 2 reader phases and have the threshold reset at
825 * the beginning of the 2nd reader phase.
826 */
827 if (prev_owner_state != OWNER_READER) {
828 if (rwsem_test_oflags(sem, nonspinnable))
829 break;
830 rspin_threshold = rwsem_rspin_threshold(sem);
831 loop = 0;
832 }
833
834 /*
835 * Check time threshold once every 16 iterations to
836 * avoid calling sched_clock() too frequently so
837 * as to reduce the average latency between the times
838 * when the lock becomes free and when the spinner
839 * is ready to do a trylock.
840 */
841 else if (!(++loop & 0xf) && (sched_clock() > rspin_threshold)) {
842 rwsem_set_nonspinnable(sem);
843 lockevent_inc(rwsem_opt_nospin);
844 break;
845 }
846 }
847
848 /*
849		 * An RT task cannot do optimistic spinning if it cannot
850		 * be sure the lock holder is running, or a live-lock may
851		 * happen if the current task and the lock holder happen
852		 * to run on the same CPU. However, aborting optimistic
853		 * spinning as soon as a NULL owner is detected may miss an
854		 * opportunity where spinning could continue without causing
855		 * a problem.
856 *
857 * There are 2 possible cases where an RT task may be able
858 * to continue spinning.
859 *
860 * 1) The lock owner is in the process of releasing the
861 * lock, sem->owner is cleared but the lock has not
862 * been released yet.
863		 * 2) The lock was free and the owner cleared, but another
864		 *    task just came in and acquired the lock before
865		 *    we could get it. The new owner may be a spinnable
866		 *    writer.
867 *
868		 * To take advantage of the two scenarios listed above, the RT
869 * task is made to retry one more time to see if it can
870 * acquire the lock or continue spinning on the new owning
871 * writer. Of course, if the time lag is long enough or the
872 * new owner is not a writer or spinnable, the RT task will
873 * quit spinning.
874 *
875 * If the owner is a writer, the need_resched() check is
876		 * done inside rwsem_spin_on_owner(). If the owner is not
877		 * a writer, the need_resched() check needs to be done here.
878 */
879 if (owner_state != OWNER_WRITER) {
880 if (need_resched())
881 break;
882 if (rt_task(current) &&
883 (prev_owner_state != OWNER_WRITER))
884 break;
885 }
886 prev_owner_state = owner_state;
887
888 /*
889 * The cpu_relax() call is a compiler barrier which forces
890 * everything in this loop to be re-loaded. We don't need
891 * memory barriers as we'll eventually observe the right
892 * values at the cost of a few extra spins.
893 */
894 cpu_relax();
895 }
896 osq_unlock(&sem->osq);
897done:
898 preempt_enable();
899 lockevent_cond_inc(rwsem_opt_fail, !taken);
900 return taken;
901}
902
903/*
904 * Clear the owner's RWSEM_WR_NONSPINNABLE bit if it is set. This should
905 * only be called when the reader count reaches 0.
906 *
907 * This gives writers a better chance to acquire the rwsem before
908 * readers when the rwsem has been held by readers for a relatively long
909 * period of time. A race can happen in which an optimistic spinner has
910 * just stolen the rwsem and set the owner, but clearing the
911 * RWSEM_WR_NONSPINNABLE bit then will do no harm anyway.
912 */
913static inline void clear_wr_nonspinnable(struct rw_semaphore *sem)
914{
915 if (rwsem_test_oflags(sem, RWSEM_WR_NONSPINNABLE))
916 atomic_long_andnot(RWSEM_WR_NONSPINNABLE, &sem->owner);
917}
918
919/*
920 * This function is called when the reader fails to acquire the lock via
921 * optimistic spinning. In this case we will still attempt a trylock if
922 * comparing the current rwsem state with the state at slowpath entry
923 * indicates that the reader is still in a valid reader phase.
924 * This happens when the following conditions are true:
925 *
926 * 1) The lock is currently reader owned, and
927 * 2) The lock was previously not reader-owned or the last read owner has changed.
928 *
929 * In the former case, we have transitioned from a writer phase to a
930 * reader-phase while spinning. In the latter case, it means the reader
931 * phase hasn't ended when we entered the optimistic spinning loop. In
932 * both cases, the reader is eligible to acquire the lock. This is the
933 * secondary path where a read lock is acquired optimistically.
934 *
935 * The reader non-spinnable bit wasn't set at the time of entry, or it
936 * would not be here at all.
937 */
938static inline bool rwsem_reader_phase_trylock(struct rw_semaphore *sem,
939 unsigned long last_rowner)
940{
941 unsigned long owner = atomic_long_read(&sem->owner);
942
943 if (!(owner & RWSEM_READER_OWNED))
944 return false;
945
946 if (((owner ^ last_rowner) & ~RWSEM_OWNER_FLAGS_MASK) &&
947 rwsem_try_read_lock_unqueued(sem)) {
948 lockevent_inc(rwsem_opt_rlock2);
949 lockevent_add(rwsem_opt_fail, -1);
950 return true;
951 }
952 return false;
953}
954#else
955static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem,
956 unsigned long nonspinnable)
957{
958 return false;
959}
960
961static inline bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock)
962{
963 return false;
964}
965
966static inline void clear_wr_nonspinnable(struct rw_semaphore *sem) { }
967
968static inline bool rwsem_reader_phase_trylock(struct rw_semaphore *sem,
969 unsigned long last_rowner)
970{
971 return false;
972}
973#endif
974
975/*
976 * Wait for the read lock to be granted
977 */
978static struct rw_semaphore __sched *
979rwsem_down_read_slowpath(struct rw_semaphore *sem, int state)
980{
981 long count, adjustment = -RWSEM_READER_BIAS;
982 struct rwsem_waiter waiter;
983 DEFINE_WAKE_Q(wake_q);
984 bool wake = false;
985
986 /*
987 * Save the current read-owner of rwsem, if available, and the
988 * reader nonspinnable bit.
989 */
990 waiter.last_rowner = atomic_long_read(&sem->owner);
991 if (!(waiter.last_rowner & RWSEM_READER_OWNED))
992 waiter.last_rowner &= RWSEM_RD_NONSPINNABLE;
993
994 if (!rwsem_can_spin_on_owner(sem, RWSEM_RD_NONSPINNABLE))
995 goto queue;
996
997 /*
998 * Undo read bias from down_read() and do optimistic spinning.
999 */
1000 atomic_long_add(-RWSEM_READER_BIAS, &sem->count);
1001 adjustment = 0;
1002 if (rwsem_optimistic_spin(sem, false)) {
1003 /*
1004 * Wake up other readers in the wait list if the front
1005 * waiter is a reader.
1006 */
1007 if ((atomic_long_read(&sem->count) & RWSEM_FLAG_WAITERS)) {
1008 raw_spin_lock_irq(&sem->wait_lock);
1009 if (!list_empty(&sem->wait_list))
1010 rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED,
1011 &wake_q);
1012 raw_spin_unlock_irq(&sem->wait_lock);
1013 wake_up_q(&wake_q);
1014 }
1015 return sem;
1016 } else if (rwsem_reader_phase_trylock(sem, waiter.last_rowner)) {
1017 return sem;
1018 }
1019
1020queue:
1021 waiter.task = current;
1022 waiter.type = RWSEM_WAITING_FOR_READ;
1023 waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT;
1024
1025 raw_spin_lock_irq(&sem->wait_lock);
1026 if (list_empty(&sem->wait_list)) {
1027 /*
1028		 * In case the wait queue is empty, the lock isn't owned
1029		 * by a writer and the handoff bit isn't set, this reader can
1030 * exit the slowpath and return immediately as its
1031 * RWSEM_READER_BIAS has already been set in the count.
1032 */
1033 if (adjustment && !(atomic_long_read(&sem->count) &
1034 (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) {
1035 raw_spin_unlock_irq(&sem->wait_lock);
1036 rwsem_set_reader_owned(sem);
1037 lockevent_inc(rwsem_rlock_fast);
1038 return sem;
1039 }
1040 adjustment += RWSEM_FLAG_WAITERS;
1041 }
1042 list_add_tail(&waiter.list, &sem->wait_list);
1043
1044 /* we're now waiting on the lock, but no longer actively locking */
1045 if (adjustment)
1046 count = atomic_long_add_return(adjustment, &sem->count);
1047 else
1048 count = atomic_long_read(&sem->count);
1049
1050 /*
1051 * If there are no active locks, wake the front queued process(es).
1052 *
1053 * If there are no writers and we are first in the queue,
1054	 * wake our own waiter to join the existing active readers!
1055 */
1056 if (!(count & RWSEM_LOCK_MASK)) {
1057 clear_wr_nonspinnable(sem);
1058 wake = true;
1059 }
1060 if (wake || (!(count & RWSEM_WRITER_MASK) &&
1061 (adjustment & RWSEM_FLAG_WAITERS)))
1062 rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
1063
1064 raw_spin_unlock_irq(&sem->wait_lock);
1065 wake_up_q(&wake_q);
1066
1067 /* wait to be given the lock */
1068 while (true) {
1069 set_current_state(state);
1070 if (!waiter.task)
1071 break;
1072 if (signal_pending_state(state, current)) {
1073 raw_spin_lock_irq(&sem->wait_lock);
1074 if (waiter.task)
1075 goto out_nolock;
1076 raw_spin_unlock_irq(&sem->wait_lock);
1077 break;
1078 }
1079 schedule();
1080 lockevent_inc(rwsem_sleep_reader);
1081 }
1082
1083 __set_current_state(TASK_RUNNING);
1084 lockevent_inc(rwsem_rlock);
1085 return sem;
1086out_nolock:
1087 list_del(&waiter.list);
1088 if (list_empty(&sem->wait_list)) {
1089 atomic_long_andnot(RWSEM_FLAG_WAITERS|RWSEM_FLAG_HANDOFF,
1090 &sem->count);
1091 }
1092 raw_spin_unlock_irq(&sem->wait_lock);
1093 __set_current_state(TASK_RUNNING);
1094 lockevent_inc(rwsem_rlock_fail);
1095 return ERR_PTR(-EINTR);
1096}
1097
1098/*
1099 * This function is called by a write lock owner. So the owner value
1100 * won't get changed by others.
1101 */
1102static inline void rwsem_disable_reader_optspin(struct rw_semaphore *sem,
1103 bool disable)
1104{
1105 if (unlikely(disable)) {
1106 atomic_long_or(RWSEM_RD_NONSPINNABLE, &sem->owner);
1107 lockevent_inc(rwsem_opt_norspin);
1108 }
1109}
1110
1111/*
1112 * Wait until we successfully acquire the write lock
1113 */
1114static struct rw_semaphore *
1115rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
1116{
1117 long count;
1118 bool disable_rspin;
1119 enum writer_wait_state wstate;
1120 struct rwsem_waiter waiter;
1121 struct rw_semaphore *ret = sem;
1122 DEFINE_WAKE_Q(wake_q);
1123
1124 /* do optimistic spinning and steal lock if possible */
1125 if (rwsem_can_spin_on_owner(sem, RWSEM_WR_NONSPINNABLE) &&
1126 rwsem_optimistic_spin(sem, true))
1127 return sem;
1128
1129 /*
1130	 * Disable reader optimistic spinning for this rwsem after
1131	 * acquiring the write lock when any of the nonspinnable
1132	 * bits is observed to be set.
1133 */
1134 disable_rspin = atomic_long_read(&sem->owner) & RWSEM_NONSPINNABLE;
1135
1136 /*
1137 * Optimistic spinning failed, proceed to the slowpath
1138 * and block until we can acquire the sem.
1139 */
1140 waiter.task = current;
1141 waiter.type = RWSEM_WAITING_FOR_WRITE;
1142 waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT;
1143
1144 raw_spin_lock_irq(&sem->wait_lock);
1145
1146 /* account for this before adding a new element to the list */
1147 wstate = list_empty(&sem->wait_list) ? WRITER_FIRST : WRITER_NOT_FIRST;
1148
1149 list_add_tail(&waiter.list, &sem->wait_list);
1150
1151 /* we're now waiting on the lock */
1152 if (wstate == WRITER_NOT_FIRST) {
1153 count = atomic_long_read(&sem->count);
1154
1155 /*
1156 * If there were already threads queued before us and:
1157		 * 1) there are no active locks, wake the front
1158 * queued process(es) as the handoff bit might be set.
1159 * 2) there are no active writers and some readers, the lock
1160 * must be read owned; so we try to wake any read lock
1161 * waiters that were queued ahead of us.
1162 */
1163 if (count & RWSEM_WRITER_MASK)
1164 goto wait;
1165
1166 rwsem_mark_wake(sem, (count & RWSEM_READER_MASK)
1167 ? RWSEM_WAKE_READERS
1168 : RWSEM_WAKE_ANY, &wake_q);
1169
1170 if (!wake_q_empty(&wake_q)) {
1171 /*
1172 * We want to minimize wait_lock hold time especially
1173 * when a large number of readers are to be woken up.
1174 */
1175 raw_spin_unlock_irq(&sem->wait_lock);
1176 wake_up_q(&wake_q);
1177 wake_q_init(&wake_q); /* Used again, reinit */
1178 raw_spin_lock_irq(&sem->wait_lock);
1179 }
1180 } else {
1181 atomic_long_or(RWSEM_FLAG_WAITERS, &sem->count);
1182 }
1183
1184wait:
1185 /* wait until we successfully acquire the lock */
1186 set_current_state(state);
1187 while (true) {
1188 if (rwsem_try_write_lock(sem, wstate))
1189 break;
1190
1191 raw_spin_unlock_irq(&sem->wait_lock);
1192
1193 /* Block until there are no active lockers. */
1194 for (;;) {
1195 if (signal_pending_state(state, current))
1196 goto out_nolock;
1197
1198 schedule();
1199 lockevent_inc(rwsem_sleep_writer);
1200 set_current_state(state);
1201 /*
1202 * If HANDOFF bit is set, unconditionally do
1203 * a trylock.
1204 */
1205 if (wstate == WRITER_HANDOFF)
1206 break;
1207
1208 if ((wstate == WRITER_NOT_FIRST) &&
1209 (rwsem_first_waiter(sem) == &waiter))
1210 wstate = WRITER_FIRST;
1211
1212 count = atomic_long_read(&sem->count);
1213 if (!(count & RWSEM_LOCK_MASK))
1214 break;
1215
1216 /*
1217 * The setting of the handoff bit is deferred
1218 * until rwsem_try_write_lock() is called.
1219 */
1220 if ((wstate == WRITER_FIRST) && (rt_task(current) ||
1221 time_after(jiffies, waiter.timeout))) {
1222 wstate = WRITER_HANDOFF;
1223 lockevent_inc(rwsem_wlock_handoff);
1224 break;
1225 }
1226 }
1227
1228 raw_spin_lock_irq(&sem->wait_lock);
1229 }
1230 __set_current_state(TASK_RUNNING);
1231 list_del(&waiter.list);
1232 rwsem_disable_reader_optspin(sem, disable_rspin);
1233 raw_spin_unlock_irq(&sem->wait_lock);
1234 lockevent_inc(rwsem_wlock);
1235
1236 return ret;
1237
1238out_nolock:
1239 __set_current_state(TASK_RUNNING);
1240 raw_spin_lock_irq(&sem->wait_lock);
1241 list_del(&waiter.list);
1242
1243 if (unlikely(wstate == WRITER_HANDOFF))
1244 atomic_long_add(-RWSEM_FLAG_HANDOFF, &sem->count);
1245
1246 if (list_empty(&sem->wait_list))
1247 atomic_long_andnot(RWSEM_FLAG_WAITERS, &sem->count);
1248 else
1249 rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
1250 raw_spin_unlock_irq(&sem->wait_lock);
1251 wake_up_q(&wake_q);
1252 lockevent_inc(rwsem_wlock_fail);
1253
1254 return ERR_PTR(-EINTR);
1255}
1256
1257/*
1258 * handle waking up a waiter on the semaphore
1259 * - up_read/up_write has decremented the active part of count if we come here
1260 */
1261static struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem, long count)
1262{
1263 unsigned long flags;
1264 DEFINE_WAKE_Q(wake_q);
1265
1266 raw_spin_lock_irqsave(&sem->wait_lock, flags);
1267
1268 if (!list_empty(&sem->wait_list))
1269 rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
1270
1271 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
1272 wake_up_q(&wake_q);
1273
1274 return sem;
1275}
1276
1277/*
1278 * downgrade a write lock into a read lock
1279 * - the caller has replaced its writer bit in the count with a reader bias and found the waiters bit set
1280 * - just wake up any readers at the front of the queue
1281 */
1282static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
1283{
1284 unsigned long flags;
1285 DEFINE_WAKE_Q(wake_q);
1286
1287 raw_spin_lock_irqsave(&sem->wait_lock, flags);
1288
1289 if (!list_empty(&sem->wait_list))
1290 rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q);
1291
1292 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
1293 wake_up_q(&wake_q);
1294
1295 return sem;
1296}
1297
1298/*
1299 * lock for reading
1300 */
1301inline void __down_read(struct rw_semaphore *sem)
1302{
1303 if (!rwsem_read_trylock(sem)) {
1304 rwsem_down_read_slowpath(sem, TASK_UNINTERRUPTIBLE);
1305 DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
1306 } else {
1307 rwsem_set_reader_owned(sem);
1308 }
1309}
1310
1311static inline int __down_read_killable(struct rw_semaphore *sem)
1312{
1313 if (!rwsem_read_trylock(sem)) {
1314 if (IS_ERR(rwsem_down_read_slowpath(sem, TASK_KILLABLE)))
1315 return -EINTR;
1316 DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
1317 } else {
1318 rwsem_set_reader_owned(sem);
1319 }
1320 return 0;
1321}
1322
1323static inline int __down_read_trylock(struct rw_semaphore *sem)
1324{
1325 /*
1326 * Optimize for the case when the rwsem is not locked at all.
1327 */
1328 long tmp = RWSEM_UNLOCKED_VALUE;
1329
1330 do {
1331 if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
1332 tmp + RWSEM_READER_BIAS)) {
1333 rwsem_set_reader_owned(sem);
1334 return 1;
1335 }
1336 } while (!(tmp & RWSEM_READ_FAILED_MASK));
1337 return 0;
1338}
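For context, a hedged sketch of how a caller typically uses the trylock path: down_read_trylock() returns nonzero on success, so the caller can skip the work instead of sleeping. The mydev structure and function name are hypothetical:

#include <linux/errno.h>
#include <linux/rwsem.h>

struct mydev {				/* hypothetical structure, for illustration only */
	struct rw_semaphore lock;
	int stats;
};

/* Opportunistic read: return -EBUSY instead of sleeping when the lock is contended. */
static int mydev_read_stats_nonblock(struct mydev *dev)
{
	int val;

	if (!down_read_trylock(&dev->lock))
		return -EBUSY;		/* writer active, waiters queued or handoff pending */
	val = dev->stats;
	up_read(&dev->lock);
	return val;
}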
1339
1340/*
1341 * lock for writing
1342 */
1343static inline void __down_write(struct rw_semaphore *sem)
1344{
1345 long tmp = RWSEM_UNLOCKED_VALUE;
1346
1347 if (unlikely(!atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
1348 RWSEM_WRITER_LOCKED)))
1349 rwsem_down_write_slowpath(sem, TASK_UNINTERRUPTIBLE);
1350 else
1351 rwsem_set_owner(sem);
1352}
1353
1354static inline int __down_write_killable(struct rw_semaphore *sem)
1355{
1356 long tmp = RWSEM_UNLOCKED_VALUE;
1357
1358 if (unlikely(!atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
1359 RWSEM_WRITER_LOCKED))) {
1360 if (IS_ERR(rwsem_down_write_slowpath(sem, TASK_KILLABLE)))
1361 return -EINTR;
1362 } else {
1363 rwsem_set_owner(sem);
1364 }
1365 return 0;
1366}
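A minimal usage sketch for the killable write path (illustrative, not from the patch): the caller must handle -EINTR, which is returned when a fatal signal ends the wait in rwsem_down_write_slowpath():

#include <linux/errno.h>
#include <linux/rwsem.h>

/* Hypothetical helper: update a value under the write lock, but remain killable. */
static int update_killable(struct rw_semaphore *sem, int *val, int new_val)
{
	if (down_write_killable(sem))
		return -EINTR;		/* fatal signal received while waiting */
	*val = new_val;
	up_write(sem);
	return 0;
}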
1367
1368static inline int __down_write_trylock(struct rw_semaphore *sem)
1369{
1370 long tmp = RWSEM_UNLOCKED_VALUE;
1371
1372 if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
1373 RWSEM_WRITER_LOCKED)) {
1374 rwsem_set_owner(sem);
1375 return true;
1376 }
1377 return false;
1378}
1379
1380/*
1381 * unlock after reading
1382 */
1383inline void __up_read(struct rw_semaphore *sem)
1384{
1385 long tmp;
1386
1387 DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
1388 rwsem_clear_reader_owned(sem);
1389 tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count);
1390 DEBUG_RWSEMS_WARN_ON(tmp < 0, sem);
1391 if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) ==
1392 RWSEM_FLAG_WAITERS)) {
1393 clear_wr_nonspinnable(sem);
1394 rwsem_wake(sem, tmp);
1395 }
1396}
1397
1398/*
1399 * unlock after writing
1400 */
1401static inline void __up_write(struct rw_semaphore *sem)
1402{
1403 long tmp;
1404
1405 /*
1406 * sem->owner may differ from current if the ownership is transferred
1407 * to an anonymous writer by setting the RWSEM_NONSPINNABLE bits.
1408 */
1409 DEBUG_RWSEMS_WARN_ON((rwsem_owner(sem) != current) &&
1410 !rwsem_test_oflags(sem, RWSEM_NONSPINNABLE), sem);
1411 rwsem_clear_owner(sem);
1412 tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count);
1413 if (unlikely(tmp & RWSEM_FLAG_WAITERS))
1414 rwsem_wake(sem, tmp);
1415}
1416
1417/*
1418 * downgrade write lock to read lock
1419 */
1420static inline void __downgrade_write(struct rw_semaphore *sem)
1421{
1422 long tmp;
1423
1424 /*
1425 * When downgrading from exclusive to shared ownership,
1426 * anything inside the write-locked region cannot leak
1427 * into the read side. In contrast, anything in the
1428 * read-locked region is ok to be re-ordered into the
1429 * write side. As such, rely on RELEASE semantics.
1430 */
1431 DEBUG_RWSEMS_WARN_ON(rwsem_owner(sem) != current, sem);
1432 tmp = atomic_long_fetch_add_release(
1433 -RWSEM_WRITER_LOCKED+RWSEM_READER_BIAS, &sem->count);
1434 rwsem_set_reader_owned(sem);
1435 if (tmp & RWSEM_FLAG_WAITERS)
1436 rwsem_downgrade_wake(sem);
1437}
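And a hedged sketch of the downgrade pattern this helper serves: publish under the write lock, downgrade so queued readers are woken, then keep reading under the now-shared lock. The function and list parameters are illustrative only:

#include <linux/list.h>
#include <linux/rwsem.h>

/* Illustrative only: add an entry exclusively, then downgrade and keep reading. */
static void publish_and_scan(struct rw_semaphore *sem, struct list_head *head,
			     struct list_head *new_entry)
{
	down_write(sem);
	list_add_tail(new_entry, head);	/* exclusive section */
	downgrade_write(sem);		/* queued readers are woken, we keep a read lock */

	/* read-side work may continue here concurrently with other readers */
	up_read(sem);
}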
17 1438
18/* 1439/*
19 * lock for reading 1440 * lock for reading
@@ -25,7 +1446,6 @@ void __sched down_read(struct rw_semaphore *sem)
25 1446
26 LOCK_CONTENDED(sem, __down_read_trylock, __down_read); 1447 LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
27} 1448}
28
29EXPORT_SYMBOL(down_read); 1449EXPORT_SYMBOL(down_read);
30 1450
31int __sched down_read_killable(struct rw_semaphore *sem) 1451int __sched down_read_killable(struct rw_semaphore *sem)
@@ -40,7 +1460,6 @@ int __sched down_read_killable(struct rw_semaphore *sem)
40 1460
41 return 0; 1461 return 0;
42} 1462}
43
44EXPORT_SYMBOL(down_read_killable); 1463EXPORT_SYMBOL(down_read_killable);
45 1464
46/* 1465/*
@@ -54,7 +1473,6 @@ int down_read_trylock(struct rw_semaphore *sem)
54 rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_); 1473 rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_);
55 return ret; 1474 return ret;
56} 1475}
57
58EXPORT_SYMBOL(down_read_trylock); 1476EXPORT_SYMBOL(down_read_trylock);
59 1477
60/* 1478/*
@@ -64,10 +1482,8 @@ void __sched down_write(struct rw_semaphore *sem)
64{ 1482{
65 might_sleep(); 1483 might_sleep();
66 rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); 1484 rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
67
68 LOCK_CONTENDED(sem, __down_write_trylock, __down_write); 1485 LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
69} 1486}
70
71EXPORT_SYMBOL(down_write); 1487EXPORT_SYMBOL(down_write);
72 1488
73/* 1489/*
@@ -78,14 +1494,14 @@ int __sched down_write_killable(struct rw_semaphore *sem)
78 might_sleep(); 1494 might_sleep();
79 rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); 1495 rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
80 1496
81 if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, __down_write_killable)) { 1497 if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock,
1498 __down_write_killable)) {
82 rwsem_release(&sem->dep_map, 1, _RET_IP_); 1499 rwsem_release(&sem->dep_map, 1, _RET_IP_);
83 return -EINTR; 1500 return -EINTR;
84 } 1501 }
85 1502
86 return 0; 1503 return 0;
87} 1504}
88
89EXPORT_SYMBOL(down_write_killable); 1505EXPORT_SYMBOL(down_write_killable);
90 1506
91/* 1507/*
@@ -100,7 +1516,6 @@ int down_write_trylock(struct rw_semaphore *sem)
100 1516
101 return ret; 1517 return ret;
102} 1518}
103
104EXPORT_SYMBOL(down_write_trylock); 1519EXPORT_SYMBOL(down_write_trylock);
105 1520
106/* 1521/*
@@ -109,10 +1524,8 @@ EXPORT_SYMBOL(down_write_trylock);
109void up_read(struct rw_semaphore *sem) 1524void up_read(struct rw_semaphore *sem)
110{ 1525{
111 rwsem_release(&sem->dep_map, 1, _RET_IP_); 1526 rwsem_release(&sem->dep_map, 1, _RET_IP_);
112
113 __up_read(sem); 1527 __up_read(sem);
114} 1528}
115
116EXPORT_SYMBOL(up_read); 1529EXPORT_SYMBOL(up_read);
117 1530
118/* 1531/*
@@ -121,10 +1534,8 @@ EXPORT_SYMBOL(up_read);
121void up_write(struct rw_semaphore *sem) 1534void up_write(struct rw_semaphore *sem)
122{ 1535{
123 rwsem_release(&sem->dep_map, 1, _RET_IP_); 1536 rwsem_release(&sem->dep_map, 1, _RET_IP_);
124
125 __up_write(sem); 1537 __up_write(sem);
126} 1538}
127
128EXPORT_SYMBOL(up_write); 1539EXPORT_SYMBOL(up_write);
129 1540
130/* 1541/*
@@ -133,10 +1544,8 @@ EXPORT_SYMBOL(up_write);
133void downgrade_write(struct rw_semaphore *sem) 1544void downgrade_write(struct rw_semaphore *sem)
134{ 1545{
135 lock_downgrade(&sem->dep_map, _RET_IP_); 1546 lock_downgrade(&sem->dep_map, _RET_IP_);
136
137 __downgrade_write(sem); 1547 __downgrade_write(sem);
138} 1548}
139
140EXPORT_SYMBOL(downgrade_write); 1549EXPORT_SYMBOL(downgrade_write);
141 1550
142#ifdef CONFIG_DEBUG_LOCK_ALLOC 1551#ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -145,40 +1554,32 @@ void down_read_nested(struct rw_semaphore *sem, int subclass)
145{ 1554{
146 might_sleep(); 1555 might_sleep();
147 rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); 1556 rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_);
148
149 LOCK_CONTENDED(sem, __down_read_trylock, __down_read); 1557 LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
150} 1558}
151
152EXPORT_SYMBOL(down_read_nested); 1559EXPORT_SYMBOL(down_read_nested);
153 1560
154void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest) 1561void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest)
155{ 1562{
156 might_sleep(); 1563 might_sleep();
157 rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_); 1564 rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_);
158
159 LOCK_CONTENDED(sem, __down_write_trylock, __down_write); 1565 LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
160} 1566}
161
162EXPORT_SYMBOL(_down_write_nest_lock); 1567EXPORT_SYMBOL(_down_write_nest_lock);
163 1568
164void down_read_non_owner(struct rw_semaphore *sem) 1569void down_read_non_owner(struct rw_semaphore *sem)
165{ 1570{
166 might_sleep(); 1571 might_sleep();
167
168 __down_read(sem); 1572 __down_read(sem);
169 __rwsem_set_reader_owned(sem, NULL); 1573 __rwsem_set_reader_owned(sem, NULL);
170} 1574}
171
172EXPORT_SYMBOL(down_read_non_owner); 1575EXPORT_SYMBOL(down_read_non_owner);
173 1576
174void down_write_nested(struct rw_semaphore *sem, int subclass) 1577void down_write_nested(struct rw_semaphore *sem, int subclass)
175{ 1578{
176 might_sleep(); 1579 might_sleep();
177 rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); 1580 rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
178
179 LOCK_CONTENDED(sem, __down_write_trylock, __down_write); 1581 LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
180} 1582}
181
182EXPORT_SYMBOL(down_write_nested); 1583EXPORT_SYMBOL(down_write_nested);
183 1584
184int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass) 1585int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass)
@@ -186,23 +1587,21 @@ int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass)
186 might_sleep(); 1587 might_sleep();
187 rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); 1588 rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
188 1589
189 if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, __down_write_killable)) { 1590 if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock,
1591 __down_write_killable)) {
190 rwsem_release(&sem->dep_map, 1, _RET_IP_); 1592 rwsem_release(&sem->dep_map, 1, _RET_IP_);
191 return -EINTR; 1593 return -EINTR;
192 } 1594 }
193 1595
194 return 0; 1596 return 0;
195} 1597}
196
197EXPORT_SYMBOL(down_write_killable_nested); 1598EXPORT_SYMBOL(down_write_killable_nested);
198 1599
199void up_read_non_owner(struct rw_semaphore *sem) 1600void up_read_non_owner(struct rw_semaphore *sem)
200{ 1601{
201 DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED), 1602 DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
202 sem);
203 __up_read(sem); 1603 __up_read(sem);
204} 1604}
205
206EXPORT_SYMBOL(up_read_non_owner); 1605EXPORT_SYMBOL(up_read_non_owner);
207 1606
208#endif 1607#endif
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
index 64877f5294e3..2534ce49f648 100644
--- a/kernel/locking/rwsem.h
+++ b/kernel/locking/rwsem.h
@@ -1,304 +1,10 @@
1/* SPDX-License-Identifier: GPL-2.0 */ 1/* SPDX-License-Identifier: GPL-2.0 */
2/*
3 * The least significant 2 bits of the owner value has the following
4 * meanings when set.
5 * - RWSEM_READER_OWNED (bit 0): The rwsem is owned by readers
6 * - RWSEM_ANONYMOUSLY_OWNED (bit 1): The rwsem is anonymously owned,
7 * i.e. the owner(s) cannot be readily determined. It can be reader
8 * owned or the owning writer is indeterminate.
9 *
10 * When a writer acquires a rwsem, it puts its task_struct pointer
11 * into the owner field. It is cleared after an unlock.
12 *
13 * When a reader acquires a rwsem, it will also puts its task_struct
14 * pointer into the owner field with both the RWSEM_READER_OWNED and
15 * RWSEM_ANONYMOUSLY_OWNED bits set. On unlock, the owner field will
16 * largely be left untouched. So for a free or reader-owned rwsem,
17 * the owner value may contain information about the last reader that
18 * acquires the rwsem. The anonymous bit is set because that particular
19 * reader may or may not still own the lock.
20 *
21 * That information may be helpful in debugging cases where the system
22 * seems to hang on a reader owned rwsem especially if only one reader
23 * is involved. Ideally we would like to track all the readers that own
24 * a rwsem, but the overhead is simply too big.
25 */
26#include "lock_events.h"
27 2
28#define RWSEM_READER_OWNED (1UL << 0) 3#ifndef __INTERNAL_RWSEM_H
29#define RWSEM_ANONYMOUSLY_OWNED (1UL << 1) 4#define __INTERNAL_RWSEM_H
5#include <linux/rwsem.h>
30 6
31#ifdef CONFIG_DEBUG_RWSEMS 7extern void __down_read(struct rw_semaphore *sem);
32# define DEBUG_RWSEMS_WARN_ON(c, sem) do { \ 8extern void __up_read(struct rw_semaphore *sem);
33 if (!debug_locks_silent && \
34 WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\
35 #c, atomic_long_read(&(sem)->count), \
36 (long)((sem)->owner), (long)current, \
37 list_empty(&(sem)->wait_list) ? "" : "not ")) \
38 debug_locks_off(); \
39 } while (0)
40#else
41# define DEBUG_RWSEMS_WARN_ON(c, sem)
42#endif
43 9
44/* 10#endif /* __INTERNAL_RWSEM_H */
45 * R/W semaphores originally for PPC using the stuff in lib/rwsem.c.
46 * Adapted largely from include/asm-i386/rwsem.h
47 * by Paul Mackerras <paulus@samba.org>.
48 */
49
50/*
51 * the semaphore definition
52 */
53#ifdef CONFIG_64BIT
54# define RWSEM_ACTIVE_MASK 0xffffffffL
55#else
56# define RWSEM_ACTIVE_MASK 0x0000ffffL
57#endif
58
59#define RWSEM_ACTIVE_BIAS 0x00000001L
60#define RWSEM_WAITING_BIAS (-RWSEM_ACTIVE_MASK-1)
61#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS
62#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
63
64#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
65/*
66 * All writes to owner are protected by WRITE_ONCE() to make sure that
67 * store tearing can't happen as optimistic spinners may read and use
68 * the owner value concurrently without lock. Read from owner, however,
69 * may not need READ_ONCE() as long as the pointer value is only used
70 * for comparison and isn't being dereferenced.
71 */
72static inline void rwsem_set_owner(struct rw_semaphore *sem)
73{
74 WRITE_ONCE(sem->owner, current);
75}
76
77static inline void rwsem_clear_owner(struct rw_semaphore *sem)
78{
79 WRITE_ONCE(sem->owner, NULL);
80}
81
82/*
83 * The task_struct pointer of the last owning reader will be left in
84 * the owner field.
85 *
86 * Note that the owner value just indicates the task has owned the rwsem
87 * previously, it may not be the real owner or one of the real owners
88 * anymore when that field is examined, so take it with a grain of salt.
89 */
90static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem,
91 struct task_struct *owner)
92{
93 unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED
94 | RWSEM_ANONYMOUSLY_OWNED;
95
96 WRITE_ONCE(sem->owner, (struct task_struct *)val);
97}
98
99static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
100{
101 __rwsem_set_reader_owned(sem, current);
102}
103
104/*
105 * Return true if the a rwsem waiter can spin on the rwsem's owner
106 * and steal the lock, i.e. the lock is not anonymously owned.
107 * N.B. !owner is considered spinnable.
108 */
109static inline bool is_rwsem_owner_spinnable(struct task_struct *owner)
110{
111 return !((unsigned long)owner & RWSEM_ANONYMOUSLY_OWNED);
112}
113
114/*
115 * Return true if rwsem is owned by an anonymous writer or readers.
116 */
117static inline bool rwsem_has_anonymous_owner(struct task_struct *owner)
118{
119 return (unsigned long)owner & RWSEM_ANONYMOUSLY_OWNED;
120}
121
122#ifdef CONFIG_DEBUG_RWSEMS
123/*
124 * With CONFIG_DEBUG_RWSEMS configured, it will make sure that if there
125 * is a task pointer in owner of a reader-owned rwsem, it will be the
126 * real owner or one of the real owners. The only exception is when the
127 * unlock is done by up_read_non_owner().
128 */
129#define rwsem_clear_reader_owned rwsem_clear_reader_owned
130static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
131{
132 unsigned long val = (unsigned long)current | RWSEM_READER_OWNED
133 | RWSEM_ANONYMOUSLY_OWNED;
134 if (READ_ONCE(sem->owner) == (struct task_struct *)val)
135 cmpxchg_relaxed((unsigned long *)&sem->owner, val,
136 RWSEM_READER_OWNED | RWSEM_ANONYMOUSLY_OWNED);
137}
138#endif
139
140#else
141static inline void rwsem_set_owner(struct rw_semaphore *sem)
142{
143}
144
145static inline void rwsem_clear_owner(struct rw_semaphore *sem)
146{
147}
148
149static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem,
150 struct task_struct *owner)
151{
152}
153
154static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
155{
156}
157#endif
158
159#ifndef rwsem_clear_reader_owned
160static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
161{
162}
163#endif
164
165extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem);
166extern struct rw_semaphore *rwsem_down_read_failed_killable(struct rw_semaphore *sem);
167extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
168extern struct rw_semaphore *rwsem_down_write_failed_killable(struct rw_semaphore *sem);
169extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem);
170extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem);
171
172/*
173 * lock for reading
174 */
175static inline void __down_read(struct rw_semaphore *sem)
176{
177 if (unlikely(atomic_long_inc_return_acquire(&sem->count) <= 0)) {
178 rwsem_down_read_failed(sem);
179 DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner &
180 RWSEM_READER_OWNED), sem);
181 } else {
182 rwsem_set_reader_owned(sem);
183 }
184}
185
186static inline int __down_read_killable(struct rw_semaphore *sem)
187{
188 if (unlikely(atomic_long_inc_return_acquire(&sem->count) <= 0)) {
189 if (IS_ERR(rwsem_down_read_failed_killable(sem)))
190 return -EINTR;
191 DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner &
192 RWSEM_READER_OWNED), sem);
193 } else {
194 rwsem_set_reader_owned(sem);
195 }
196 return 0;
197}
198
199static inline int __down_read_trylock(struct rw_semaphore *sem)
200{
201 /*
202 * Optimize for the case when the rwsem is not locked at all.
203 */
204 long tmp = RWSEM_UNLOCKED_VALUE;
205
206 lockevent_inc(rwsem_rtrylock);
207 do {
208 if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
209 tmp + RWSEM_ACTIVE_READ_BIAS)) {
210 rwsem_set_reader_owned(sem);
211 return 1;
212 }
213 } while (tmp >= 0);
214 return 0;
215}
216
217/*
218 * lock for writing
219 */
220static inline void __down_write(struct rw_semaphore *sem)
221{
222 long tmp;
223
224 tmp = atomic_long_add_return_acquire(RWSEM_ACTIVE_WRITE_BIAS,
225 &sem->count);
226 if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS))
227 rwsem_down_write_failed(sem);
228 rwsem_set_owner(sem);
229}
230
231static inline int __down_write_killable(struct rw_semaphore *sem)
232{
233 long tmp;
234
235 tmp = atomic_long_add_return_acquire(RWSEM_ACTIVE_WRITE_BIAS,
236 &sem->count);
237 if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS))
238 if (IS_ERR(rwsem_down_write_failed_killable(sem)))
239 return -EINTR;
240 rwsem_set_owner(sem);
241 return 0;
242}
243
244static inline int __down_write_trylock(struct rw_semaphore *sem)
245{
246 long tmp;
247
248 lockevent_inc(rwsem_wtrylock);
249 tmp = atomic_long_cmpxchg_acquire(&sem->count, RWSEM_UNLOCKED_VALUE,
250 RWSEM_ACTIVE_WRITE_BIAS);
251 if (tmp == RWSEM_UNLOCKED_VALUE) {
252 rwsem_set_owner(sem);
253 return true;
254 }
255 return false;
256}
257
258/*
259 * unlock after reading
260 */
261static inline void __up_read(struct rw_semaphore *sem)
262{
263 long tmp;
264
265 DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED),
266 sem);
267 rwsem_clear_reader_owned(sem);
268 tmp = atomic_long_dec_return_release(&sem->count);
269 if (unlikely(tmp < -1 && (tmp & RWSEM_ACTIVE_MASK) == 0))
270 rwsem_wake(sem);
271}
272
273/*
274 * unlock after writing
275 */
276static inline void __up_write(struct rw_semaphore *sem)
277{
278 DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem);
279 rwsem_clear_owner(sem);
280 if (unlikely(atomic_long_sub_return_release(RWSEM_ACTIVE_WRITE_BIAS,
281 &sem->count) < 0))
282 rwsem_wake(sem);
283}
284
285/*
286 * downgrade write lock to read lock
287 */
288static inline void __downgrade_write(struct rw_semaphore *sem)
289{
290 long tmp;
291
292 /*
293 * When downgrading from exclusive to shared ownership,
294 * anything inside the write-locked region cannot leak
295 * into the read side. In contrast, anything in the
296 * read-locked region is ok to be re-ordered into the
297 * write side. As such, rely on RELEASE semantics.
298 */
299 DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem);
300 tmp = atomic_long_add_return_release(-RWSEM_WAITING_BIAS, &sem->count);
301 rwsem_set_reader_owned(sem);
302 if (tmp < 0)
303 rwsem_downgrade_wake(sem);
304}
diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c
index 561acdd39960..d9dd94defc0a 100644
--- a/kernel/locking/semaphore.c
+++ b/kernel/locking/semaphore.c
@@ -1,9 +1,8 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * Copyright (c) 2008 Intel Corporation 3 * Copyright (c) 2008 Intel Corporation
3 * Author: Matthew Wilcox <willy@linux.intel.com> 4 * Author: Matthew Wilcox <willy@linux.intel.com>
4 * 5 *
5 * Distributed under the terms of the GNU GPL, version 2
6 *
7 * This file implements counting semaphores. 6 * This file implements counting semaphores.
8 * A counting semaphore may be acquired 'n' times before sleeping. 7 * A counting semaphore may be acquired 'n' times before sleeping.
9 * See mutex.c for single-acquisition sleeping locks which enforce 8 * See mutex.c for single-acquisition sleeping locks which enforce
diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c
index 65a3b7e55b9f..3e82f449b4ff 100644
--- a/kernel/locking/test-ww_mutex.c
+++ b/kernel/locking/test-ww_mutex.c
@@ -1,19 +1,6 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
1/* 2/*
2 * Module-based API test facility for ww_mutexes 3 * Module-based API test facility for ww_mutexes
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, you can access it online at
16 * http://www.gnu.org/licenses/gpl-2.0.html.
17 */ 4 */
18 5
19#include <linux/kernel.h> 6#include <linux/kernel.h>
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 1490e63f69a9..6e1970719dc2 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -95,6 +95,7 @@ static void devm_memremap_pages_release(void *data)
95 pgmap->kill(pgmap->ref); 95 pgmap->kill(pgmap->ref);
96 for_each_device_pfn(pfn, pgmap) 96 for_each_device_pfn(pfn, pgmap)
97 put_page(pfn_to_page(pfn)); 97 put_page(pfn_to_page(pfn));
98 pgmap->cleanup(pgmap->ref);
98 99
99 /* pages are dead and unused, undo the arch mapping */ 100 /* pages are dead and unused, undo the arch mapping */
100 align_start = res->start & ~(SECTION_SIZE - 1); 101 align_start = res->start & ~(SECTION_SIZE - 1);
@@ -133,8 +134,8 @@ static void devm_memremap_pages_release(void *data)
133 * 2/ The altmap field may optionally be initialized, in which case altmap_valid 134 * 2/ The altmap field may optionally be initialized, in which case altmap_valid
134 * must be set to true 135 * must be set to true
135 * 136 *
136 * 3/ pgmap->ref must be 'live' on entry and will be killed at 137 * 3/ pgmap->ref must be 'live' on entry and will be killed and reaped
137 * devm_memremap_pages_release() time, or if this routine fails. 138 * at devm_memremap_pages_release() time, or if this routine fails.
138 * 139 *
139 * 4/ res is expected to be a host memory range that could feasibly be 140 * 4/ res is expected to be a host memory range that could feasibly be
140 * treated as a "System RAM" range, i.e. not a device mmio range, but 141 * treated as a "System RAM" range, i.e. not a device mmio range, but
@@ -156,8 +157,10 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
156 pgprot_t pgprot = PAGE_KERNEL; 157 pgprot_t pgprot = PAGE_KERNEL;
157 int error, nid, is_ram; 158 int error, nid, is_ram;
158 159
159 if (!pgmap->ref || !pgmap->kill) 160 if (!pgmap->ref || !pgmap->kill || !pgmap->cleanup) {
161 WARN(1, "Missing reference count teardown definition\n");
160 return ERR_PTR(-EINVAL); 162 return ERR_PTR(-EINVAL);
163 }
161 164
162 align_start = res->start & ~(SECTION_SIZE - 1); 165 align_start = res->start & ~(SECTION_SIZE - 1);
163 align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) 166 align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
@@ -168,14 +171,16 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
168 if (conflict_pgmap) { 171 if (conflict_pgmap) {
169 dev_WARN(dev, "Conflicting mapping in same section\n"); 172 dev_WARN(dev, "Conflicting mapping in same section\n");
170 put_dev_pagemap(conflict_pgmap); 173 put_dev_pagemap(conflict_pgmap);
171 return ERR_PTR(-ENOMEM); 174 error = -ENOMEM;
175 goto err_array;
172 } 176 }
173 177
174 conflict_pgmap = get_dev_pagemap(PHYS_PFN(align_end), NULL); 178 conflict_pgmap = get_dev_pagemap(PHYS_PFN(align_end), NULL);
175 if (conflict_pgmap) { 179 if (conflict_pgmap) {
176 dev_WARN(dev, "Conflicting mapping in same section\n"); 180 dev_WARN(dev, "Conflicting mapping in same section\n");
177 put_dev_pagemap(conflict_pgmap); 181 put_dev_pagemap(conflict_pgmap);
178 return ERR_PTR(-ENOMEM); 182 error = -ENOMEM;
183 goto err_array;
179 } 184 }
180 185
181 is_ram = region_intersects(align_start, align_size, 186 is_ram = region_intersects(align_start, align_size,
@@ -267,10 +272,18 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
267 pgmap_array_delete(res); 272 pgmap_array_delete(res);
268 err_array: 273 err_array:
269 pgmap->kill(pgmap->ref); 274 pgmap->kill(pgmap->ref);
275 pgmap->cleanup(pgmap->ref);
276
270 return ERR_PTR(error); 277 return ERR_PTR(error);
271} 278}
272EXPORT_SYMBOL_GPL(devm_memremap_pages); 279EXPORT_SYMBOL_GPL(devm_memremap_pages);
273 280
281void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap)
282{
283 devm_release_action(dev, devm_memremap_pages_release, pgmap);
284}
285EXPORT_SYMBOL_GPL(devm_memunmap_pages);
286
274unsigned long vmem_altmap_offset(struct vmem_altmap *altmap) 287unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
275{ 288{
276 /* number of pfns from base where pfn_to_page() is valid */ 289 /* number of pfns from base where pfn_to_page() is valid */
diff --git a/kernel/module-internal.h b/kernel/module-internal.h
index 79c9be2dbbe9..33783abc377b 100644
--- a/kernel/module-internal.h
+++ b/kernel/module-internal.h
@@ -1,12 +1,8 @@
1/* SPDX-License-Identifier: GPL-2.0-or-later */
1/* Module internals 2/* Module internals
2 * 3 *
3 * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. 4 * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com) 5 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */ 6 */
11 7
12#include <linux/elf.h> 8#include <linux/elf.h>
@@ -20,7 +16,7 @@ struct load_info {
20 unsigned long len; 16 unsigned long len;
21 Elf_Shdr *sechdrs; 17 Elf_Shdr *sechdrs;
22 char *secstrings, *strtab; 18 char *secstrings, *strtab;
23 unsigned long symoffs, stroffs; 19 unsigned long symoffs, stroffs, init_typeoffs, core_typeoffs;
24 struct _ddebug *debug; 20 struct _ddebug *debug;
25 unsigned int num_debug; 21 unsigned int num_debug;
26 bool sig_ok; 22 bool sig_ok;
diff --git a/kernel/module.c b/kernel/module.c
index a9e1e7f2c224..a2cee14a83f3 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1,20 +1,8 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
1/* 2/*
2 Copyright (C) 2002 Richard Henderson 3 Copyright (C) 2002 Richard Henderson
3 Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM. 4 Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM.
4 5
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18*/ 6*/
19#include <linux/export.h> 7#include <linux/export.h>
20#include <linux/extable.h> 8#include <linux/extable.h>
@@ -2642,6 +2630,8 @@ static void layout_symtab(struct module *mod, struct load_info *info)
2642 info->symoffs = ALIGN(mod->core_layout.size, symsect->sh_addralign ?: 1); 2630 info->symoffs = ALIGN(mod->core_layout.size, symsect->sh_addralign ?: 1);
2643 info->stroffs = mod->core_layout.size = info->symoffs + ndst * sizeof(Elf_Sym); 2631 info->stroffs = mod->core_layout.size = info->symoffs + ndst * sizeof(Elf_Sym);
2644 mod->core_layout.size += strtab_size; 2632 mod->core_layout.size += strtab_size;
2633 info->core_typeoffs = mod->core_layout.size;
2634 mod->core_layout.size += ndst * sizeof(char);
2645 mod->core_layout.size = debug_align(mod->core_layout.size); 2635 mod->core_layout.size = debug_align(mod->core_layout.size);
2646 2636
2647 /* Put string table section at end of init part of module. */ 2637 /* Put string table section at end of init part of module. */
@@ -2655,6 +2645,8 @@ static void layout_symtab(struct module *mod, struct load_info *info)
2655 __alignof__(struct mod_kallsyms)); 2645 __alignof__(struct mod_kallsyms));
2656 info->mod_kallsyms_init_off = mod->init_layout.size; 2646 info->mod_kallsyms_init_off = mod->init_layout.size;
2657 mod->init_layout.size += sizeof(struct mod_kallsyms); 2647 mod->init_layout.size += sizeof(struct mod_kallsyms);
2648 info->init_typeoffs = mod->init_layout.size;
2649 mod->init_layout.size += nsrc * sizeof(char);
2658 mod->init_layout.size = debug_align(mod->init_layout.size); 2650 mod->init_layout.size = debug_align(mod->init_layout.size);
2659} 2651}
2660 2652
@@ -2678,20 +2670,23 @@ static void add_kallsyms(struct module *mod, const struct load_info *info)
2678 mod->kallsyms->num_symtab = symsec->sh_size / sizeof(Elf_Sym); 2670 mod->kallsyms->num_symtab = symsec->sh_size / sizeof(Elf_Sym);
2679 /* Make sure we get permanent strtab: don't use info->strtab. */ 2671 /* Make sure we get permanent strtab: don't use info->strtab. */
2680 mod->kallsyms->strtab = (void *)info->sechdrs[info->index.str].sh_addr; 2672 mod->kallsyms->strtab = (void *)info->sechdrs[info->index.str].sh_addr;
2673 mod->kallsyms->typetab = mod->init_layout.base + info->init_typeoffs;
2681 2674
2682 /* Set types up while we still have access to sections. */ 2675 /*
2683 for (i = 0; i < mod->kallsyms->num_symtab; i++) 2676 * Now populate the cut down core kallsyms for after init
2684 mod->kallsyms->symtab[i].st_size 2677 * and set types up while we still have access to sections.
2685 = elf_type(&mod->kallsyms->symtab[i], info); 2678 */
2686
2687 /* Now populate the cut down core kallsyms for after init. */
2688 mod->core_kallsyms.symtab = dst = mod->core_layout.base + info->symoffs; 2679 mod->core_kallsyms.symtab = dst = mod->core_layout.base + info->symoffs;
2689 mod->core_kallsyms.strtab = s = mod->core_layout.base + info->stroffs; 2680 mod->core_kallsyms.strtab = s = mod->core_layout.base + info->stroffs;
2681 mod->core_kallsyms.typetab = mod->core_layout.base + info->core_typeoffs;
2690 src = mod->kallsyms->symtab; 2682 src = mod->kallsyms->symtab;
2691 for (ndst = i = 0; i < mod->kallsyms->num_symtab; i++) { 2683 for (ndst = i = 0; i < mod->kallsyms->num_symtab; i++) {
2684 mod->kallsyms->typetab[i] = elf_type(src + i, info);
2692 if (i == 0 || is_livepatch_module(mod) || 2685 if (i == 0 || is_livepatch_module(mod) ||
2693 is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum, 2686 is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum,
2694 info->index.pcpu)) { 2687 info->index.pcpu)) {
2688 mod->core_kallsyms.typetab[ndst] =
2689 mod->kallsyms->typetab[i];
2695 dst[ndst] = src[i]; 2690 dst[ndst] = src[i];
2696 dst[ndst++].st_name = s - mod->core_kallsyms.strtab; 2691 dst[ndst++].st_name = s - mod->core_kallsyms.strtab;
2697 s += strlcpy(s, &mod->kallsyms->strtab[src[i].st_name], 2692 s += strlcpy(s, &mod->kallsyms->strtab[src[i].st_name],
@@ -3088,6 +3083,11 @@ static int find_module_sections(struct module *mod, struct load_info *info)
3088 sizeof(*mod->tracepoints_ptrs), 3083 sizeof(*mod->tracepoints_ptrs),
3089 &mod->num_tracepoints); 3084 &mod->num_tracepoints);
3090#endif 3085#endif
3086#ifdef CONFIG_TREE_SRCU
3087 mod->srcu_struct_ptrs = section_objs(info, "___srcu_struct_ptrs",
3088 sizeof(*mod->srcu_struct_ptrs),
3089 &mod->num_srcu_structs);
3090#endif
3091#ifdef CONFIG_BPF_EVENTS 3091#ifdef CONFIG_BPF_EVENTS
3092 mod->bpf_raw_events = section_objs(info, "__bpf_raw_tp_map", 3092 mod->bpf_raw_events = section_objs(info, "__bpf_raw_tp_map",
3093 sizeof(*mod->bpf_raw_events), 3093 sizeof(*mod->bpf_raw_events),
@@ -4091,7 +4091,7 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
4091 const Elf_Sym *sym = &kallsyms->symtab[symnum]; 4091 const Elf_Sym *sym = &kallsyms->symtab[symnum];
4092 4092
4093 *value = kallsyms_symbol_value(sym); 4093 *value = kallsyms_symbol_value(sym);
4094 *type = sym->st_size; 4094 *type = kallsyms->typetab[symnum];
4095 strlcpy(name, kallsyms_symbol_name(kallsyms, symnum), KSYM_NAME_LEN); 4095 strlcpy(name, kallsyms_symbol_name(kallsyms, symnum), KSYM_NAME_LEN);
4096 strlcpy(module_name, mod->name, MODULE_NAME_LEN); 4096 strlcpy(module_name, mod->name, MODULE_NAME_LEN);
4097 *exported = is_exported(name, *value, mod); 4097 *exported = is_exported(name, *value, mod);
diff --git a/kernel/module_signing.c b/kernel/module_signing.c
index 6b9a926fd86b..b10fb1986ca9 100644
--- a/kernel/module_signing.c
+++ b/kernel/module_signing.c
@@ -1,12 +1,8 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
1/* Module signature checker 2/* Module signature checker
2 * 3 *
3 * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. 4 * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com) 5 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */ 6 */
11 7
12#include <linux/kernel.h> 8#include <linux/kernel.h>
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 6196af8a8223..d9f5081d578d 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1#include <linux/kdebug.h> 2#include <linux/kdebug.h>
2#include <linux/kprobes.h> 3#include <linux/kprobes.h>
3#include <linux/export.h> 4#include <linux/export.h>
@@ -22,6 +23,7 @@ static int notifier_chain_register(struct notifier_block **nl,
22 struct notifier_block *n) 23 struct notifier_block *n)
23{ 24{
24 while ((*nl) != NULL) { 25 while ((*nl) != NULL) {
26 WARN_ONCE(((*nl) == n), "double register detected");
25 if (n->priority > (*nl)->priority) 27 if (n->priority > (*nl)->priority)
26 break; 28 break;
27 nl = &((*nl)->next); 29 nl = &((*nl)->next);
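The new WARN_ONCE() catches a notifier_block that is added to the same chain twice. A hypothetical module showing the registration pattern it polices (names are illustrative only):

#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/reboot.h>

static int my_reboot_cb(struct notifier_block *nb, unsigned long action,
                        void *data)
{
        return NOTIFY_DONE;
}

static struct notifier_block my_reboot_nb = {
        .notifier_call = my_reboot_cb,
};

static int __init my_init(void)
{
        /*
         * Registering &my_reboot_nb a second time without unregistering
         * it first is exactly the bug the new WARN_ONCE() flags.
         */
        return register_reboot_notifier(&my_reboot_nb);
}

static void __exit my_exit(void)
{
        unregister_reboot_notifier(&my_reboot_nb);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");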
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index f6c5d330059a..c815f58e6bc0 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -1,13 +1,9 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * Copyright (C) 2006 IBM Corporation 3 * Copyright (C) 2006 IBM Corporation
3 * 4 *
4 * Author: Serge Hallyn <serue@us.ibm.com> 5 * Author: Serge Hallyn <serue@us.ibm.com>
5 * 6 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation, version 2 of the
9 * License.
10 *
11 * Jun 2006 - namespaces support 7 * Jun 2006 - namespaces support
12 * OpenVZ, SWsoft Inc. 8 * OpenVZ, SWsoft Inc.
13 * Pavel Emelianov <xemul@openvz.org> 9 * Pavel Emelianov <xemul@openvz.org>
diff --git a/kernel/panic.c b/kernel/panic.c
index c1fcaad337b7..4d9f55bf7d38 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * linux/kernel/panic.c 3 * linux/kernel/panic.c
3 * 4 *
@@ -51,6 +52,7 @@ EXPORT_SYMBOL_GPL(panic_timeout);
51#define PANIC_PRINT_TIMER_INFO 0x00000004 52#define PANIC_PRINT_TIMER_INFO 0x00000004
52#define PANIC_PRINT_LOCK_INFO 0x00000008 53#define PANIC_PRINT_LOCK_INFO 0x00000008
53#define PANIC_PRINT_FTRACE_INFO 0x00000010 54#define PANIC_PRINT_FTRACE_INFO 0x00000010
55#define PANIC_PRINT_ALL_PRINTK_MSG 0x00000020
54unsigned long panic_print; 56unsigned long panic_print;
55 57
56ATOMIC_NOTIFIER_HEAD(panic_notifier_list); 58ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
@@ -134,6 +136,9 @@ EXPORT_SYMBOL(nmi_panic);
134 136
135static void panic_print_sys_info(void) 137static void panic_print_sys_info(void)
136{ 138{
139 if (panic_print & PANIC_PRINT_ALL_PRINTK_MSG)
140 console_flush_on_panic(CONSOLE_REPLAY_ALL);
141
137 if (panic_print & PANIC_PRINT_TASK_INFO) 142 if (panic_print & PANIC_PRINT_TASK_INFO)
138 show_state(); 143 show_state();
139 144
@@ -277,7 +282,7 @@ void panic(const char *fmt, ...)
277 * panic() is not being callled from OOPS. 282 * panic() is not being callled from OOPS.
278 */ 283 */
279 debug_locks_off(); 284 debug_locks_off();
280 console_flush_on_panic(); 285 console_flush_on_panic(CONSOLE_FLUSH_PENDING);
281 286
282 panic_print_sys_info(); 287 panic_print_sys_info();
283 288
@@ -306,6 +311,8 @@ void panic(const char *fmt, ...)
306 * shutting down. But if there is a chance of 311 * shutting down. But if there is a chance of
307 * rebooting the system it will be rebooted. 312 * rebooting the system it will be rebooted.
308 */ 313 */
314 if (panic_reboot_mode != REBOOT_UNDEFINED)
315 reboot_mode = panic_reboot_mode;
309 emergency_restart(); 316 emergency_restart();
310 } 317 }
311#ifdef __sparc__ 318#ifdef __sparc__
@@ -321,6 +328,9 @@ void panic(const char *fmt, ...)
321 disabled_wait(); 328 disabled_wait();
322#endif 329#endif
323 pr_emerg("---[ end Kernel panic - not syncing: %s ]---\n", buf); 330 pr_emerg("---[ end Kernel panic - not syncing: %s ]---\n", buf);
331
332 /* Do not scroll important messages printed above */
333 suppress_printk = 1;
324 local_irq_enable(); 334 local_irq_enable();
325 for (i = 0; ; i += PANIC_TIMER_STEP) { 335 for (i = 0; ; i += PANIC_TIMER_STEP) {
326 touch_softlockup_watchdog(); 336 touch_softlockup_watchdog();
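The new PANIC_PRINT_ALL_PRINTK_MSG bit reuses the existing panic_print mask, so requesting a full replay of the printk ring buffer on panic should amount to booting with panic_print=0x20 (or writing 0x20 to /proc/sys/kernel/panic_print), optionally OR-ed with whichever of the earlier PANIC_PRINT_* bits are wanted.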
diff --git a/kernel/params.c b/kernel/params.c
index ce89f757e6da..cf448785d058 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -1,19 +1,7 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
1/* Helpers for initial module or kernel cmdline parsing 2/* Helpers for initial module or kernel cmdline parsing
2 Copyright (C) 2001 Rusty Russell. 3 Copyright (C) 2001 Rusty Russell.
3 4
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17*/ 5*/
18#include <linux/kernel.h> 6#include <linux/kernel.h>
19#include <linux/string.h> 7#include <linux/string.h>
diff --git a/kernel/pid.c b/kernel/pid.c
index 20881598bdfa..16263b526560 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * Generic pidhash and scalable, time-bounded PID allocator 3 * Generic pidhash and scalable, time-bounded PID allocator
3 * 4 *
@@ -32,12 +33,13 @@
32#include <linux/init.h> 33#include <linux/init.h>
33#include <linux/rculist.h> 34#include <linux/rculist.h>
34#include <linux/memblock.h> 35#include <linux/memblock.h>
35#include <linux/hash.h>
36#include <linux/pid_namespace.h> 36#include <linux/pid_namespace.h>
37#include <linux/init_task.h> 37#include <linux/init_task.h>
38#include <linux/syscalls.h> 38#include <linux/syscalls.h>
39#include <linux/proc_ns.h> 39#include <linux/proc_ns.h>
40#include <linux/proc_fs.h> 40#include <linux/proc_fs.h>
41#include <linux/anon_inodes.h>
42#include <linux/sched/signal.h>
41#include <linux/sched/task.h> 43#include <linux/sched/task.h>
42#include <linux/idr.h> 44#include <linux/idr.h>
43 45
@@ -214,6 +216,8 @@ struct pid *alloc_pid(struct pid_namespace *ns)
214 for (type = 0; type < PIDTYPE_MAX; ++type) 216 for (type = 0; type < PIDTYPE_MAX; ++type)
215 INIT_HLIST_HEAD(&pid->tasks[type]); 217 INIT_HLIST_HEAD(&pid->tasks[type]);
216 218
219 init_waitqueue_head(&pid->wait_pidfd);
220
217 upid = pid->numbers + ns->level; 221 upid = pid->numbers + ns->level;
218 spin_lock_irq(&pidmap_lock); 222 spin_lock_irq(&pidmap_lock);
219 if (!(ns->pid_allocated & PIDNS_ADDING)) 223 if (!(ns->pid_allocated & PIDNS_ADDING))
@@ -451,6 +455,73 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
451 return idr_get_next(&ns->idr, &nr); 455 return idr_get_next(&ns->idr, &nr);
452} 456}
453 457
458/**
459 * pidfd_create() - Create a new pid file descriptor.
460 *
461 * @pid: struct pid that the pidfd will reference
462 *
463 * This creates a new pid file descriptor with the O_CLOEXEC flag set.
464 *
465 * Note, that this function can only be called after the fd table has
466 * been unshared to avoid leaking the pidfd to the new process.
467 *
468 * Return: On success, a cloexec pidfd is returned.
469 * On error, a negative errno number will be returned.
470 */
471static int pidfd_create(struct pid *pid)
472{
473 int fd;
474
475 fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid),
476 O_RDWR | O_CLOEXEC);
477 if (fd < 0)
478 put_pid(pid);
479
480 return fd;
481}
482
483/**
484 * pidfd_open() - Open new pid file descriptor.
485 *
486 * @pid: pid for which to retrieve a pidfd
487 * @flags: flags to pass
488 *
489 * This creates a new pid file descriptor with the O_CLOEXEC flag set for
490 * the process identified by @pid. Currently, the process identified by
491 * @pid must be a thread-group leader. This restriction currently exists
492 * for all aspects of pidfds including pidfd creation (CLONE_PIDFD cannot
493 * be used with CLONE_THREAD) and pidfd polling (only supports thread group
494 * leaders).
495 *
496 * Return: On success, a cloexec pidfd is returned.
497 * On error, a negative errno number will be returned.
498 */
499SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
500{
501 int fd, ret;
502 struct pid *p;
503
504 if (flags)
505 return -EINVAL;
506
507 if (pid <= 0)
508 return -EINVAL;
509
510 p = find_get_pid(pid);
511 if (!p)
512 return -ESRCH;
513
514 ret = 0;
515 rcu_read_lock();
516 if (!pid_task(p, PIDTYPE_TGID))
517 ret = -EINVAL;
518 rcu_read_unlock();
519
520 fd = ret ?: pidfd_create(p);
521 put_pid(p);
522 return fd;
523}
524
454void __init pid_idr_init(void) 525void __init pid_idr_init(void)
455{ 526{
456 /* Verify no one has done anything silly: */ 527 /* Verify no one has done anything silly: */
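A minimal user-space sketch (not part of this patch) of driving the new syscall; it assumes __NR_pidfd_open has been wired up (434 in the generic syscall table) and relies on the pidfd becoming readable once the target exits, per the pidfd-polling behaviour referenced in the kernel-doc above:

#define _GNU_SOURCE
#include <poll.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_pidfd_open
#define __NR_pidfd_open 434     /* assumed syscall number */
#endif

static int pidfd_open(pid_t pid, unsigned int flags)
{
        return syscall(__NR_pidfd_open, pid, flags);
}

int main(void)
{
        pid_t pid = fork();

        if (pid == 0) {         /* child: exit after a short delay */
                sleep(1);
                _exit(0);
        }

        int fd = pidfd_open(pid, 0);
        if (fd < 0) {
                perror("pidfd_open");
                return 1;
        }

        struct pollfd pfd = { .fd = fd, .events = POLLIN };
        poll(&pfd, 1, -1);      /* readable once the child has exited */
        printf("process %d exited\n", (int)pid);
        close(fd);
        return 0;
}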
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index aa6e72fb7c08..6d726cef241c 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * Pid namespaces 3 * Pid namespaces
3 * 4 *
@@ -325,7 +326,7 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
325 } 326 }
326 327
327 read_lock(&tasklist_lock); 328 read_lock(&tasklist_lock);
328 force_sig(SIGKILL, pid_ns->child_reaper); 329 send_sig(SIGKILL, pid_ns->child_reaper, 1);
329 read_unlock(&tasklist_lock); 330 read_unlock(&tasklist_lock);
330 331
331 do_exit(0); 332 do_exit(0);
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 9bbaaab14b36..ff8592ddedee 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -1,3 +1,4 @@
1# SPDX-License-Identifier: GPL-2.0-only
1config SUSPEND 2config SUSPEND
2 bool "Suspend to RAM and standby" 3 bool "Suspend to RAM and standby"
3 depends on ARCH_SUSPEND_POSSIBLE 4 depends on ARCH_SUSPEND_POSSIBLE
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index 7d66ee68aaaf..0a9326f5f421 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -223,7 +223,7 @@ int em_register_perf_domain(cpumask_t *span, unsigned int nr_states,
223 * All CPUs of a domain must have the same micro-architecture 223 * All CPUs of a domain must have the same micro-architecture
224 * since they all share the same table. 224 * since they all share the same table.
225 */ 225 */
226 cap = arch_scale_cpu_capacity(NULL, cpu); 226 cap = arch_scale_cpu_capacity(cpu);
227 if (prev_cap && prev_cap != cap) { 227 if (prev_cap && prev_cap != cap) {
228 pr_err("CPUs of %*pbl must have the same capacity\n", 228 pr_err("CPUs of %*pbl must have the same capacity\n",
229 cpumask_pr_args(span)); 229 cpumask_pr_args(span));
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index c8c272df7154..cd7434e6000d 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * kernel/power/hibernate.c - Hibernation (a.k.a suspend-to-disk) support. 3 * kernel/power/hibernate.c - Hibernation (a.k.a suspend-to-disk) support.
3 * 4 *
@@ -6,8 +7,6 @@
6 * Copyright (c) 2004 Pavel Machek <pavel@ucw.cz> 7 * Copyright (c) 2004 Pavel Machek <pavel@ucw.cz>
7 * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc. 8 * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc.
8 * Copyright (C) 2012 Bojan Smojver <bojan@rexursive.com> 9 * Copyright (C) 2012 Bojan Smojver <bojan@rexursive.com>
9 *
10 * This file is released under the GPLv2.
11 */ 10 */
12 11
13#define pr_fmt(fmt) "PM: " fmt 12#define pr_fmt(fmt) "PM: " fmt
@@ -129,7 +128,7 @@ static int hibernation_test(int level) { return 0; }
129static int platform_begin(int platform_mode) 128static int platform_begin(int platform_mode)
130{ 129{
131 return (platform_mode && hibernation_ops) ? 130 return (platform_mode && hibernation_ops) ?
132 hibernation_ops->begin() : 0; 131 hibernation_ops->begin(PMSG_FREEZE) : 0;
133} 132}
134 133
135/** 134/**
@@ -257,6 +256,11 @@ void swsusp_show_speed(ktime_t start, ktime_t stop,
257 (kps % 1000) / 10); 256 (kps % 1000) / 10);
258} 257}
259 258
259__weak int arch_resume_nosmt(void)
260{
261 return 0;
262}
263
260/** 264/**
261 * create_image - Create a hibernation image. 265 * create_image - Create a hibernation image.
262 * @platform_mode: Whether or not to use the platform driver. 266 * @platform_mode: Whether or not to use the platform driver.
@@ -324,6 +328,10 @@ static int create_image(int platform_mode)
324 Enable_cpus: 328 Enable_cpus:
325 suspend_enable_secondary_cpus(); 329 suspend_enable_secondary_cpus();
326 330
331 /* Allow architectures to do nosmt-specific post-resume dances */
332 if (!in_suspend)
333 error = arch_resume_nosmt();
334
327 Platform_finish: 335 Platform_finish:
328 platform_finish(platform_mode); 336 platform_finish(platform_mode);
329 337
@@ -542,7 +550,7 @@ int hibernation_platform_enter(void)
542 * hibernation_ops->finish() before saving the image, so we should let 550 * hibernation_ops->finish() before saving the image, so we should let
543 * the firmware know that we're going to enter the sleep state after all 551 * the firmware know that we're going to enter the sleep state after all
544 */ 552 */
545 error = hibernation_ops->begin(); 553 error = hibernation_ops->begin(PMSG_HIBERNATE);
546 if (error) 554 if (error)
547 goto Close; 555 goto Close;
548 556
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 4f43e724f6eb..bdbd605c4215 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -1,11 +1,9 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * kernel/power/main.c - PM subsystem core functionality. 3 * kernel/power/main.c - PM subsystem core functionality.
3 * 4 *
4 * Copyright (c) 2003 Patrick Mochel 5 * Copyright (c) 2003 Patrick Mochel
5 * Copyright (c) 2003 Open Source Development Lab 6 * Copyright (c) 2003 Open Source Development Lab
6 *
7 * This file is released under the GPLv2
8 *
9 */ 7 */
10 8
11#include <linux/export.h> 9#include <linux/export.h>
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 9e58bdc8a562..44bee462ff57 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -75,8 +75,6 @@ static inline void hibernate_reserved_size_init(void) {}
75static inline void hibernate_image_size_init(void) {} 75static inline void hibernate_image_size_init(void) {}
76#endif /* !CONFIG_HIBERNATION */ 76#endif /* !CONFIG_HIBERNATION */
77 77
78extern int pfn_is_nosave(unsigned long);
79
80#define power_attr(_name) \ 78#define power_attr(_name) \
81static struct kobj_attribute _name##_attr = { \ 79static struct kobj_attribute _name##_attr = { \
82 .attr = { \ 80 .attr = { \
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index 7ef6866b521d..6d475281c730 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -1,7 +1,6 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * poweroff.c - sysrq handler to gracefully power down machine. 3 * poweroff.c - sysrq handler to gracefully power down machine.
3 *
4 * This file is released under the GPL v2
5 */ 4 */
6 5
7#include <linux/kernel.h> 6#include <linux/kernel.h>
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 9d22131afc1e..33e3febaba53 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * This module exposes the interface to kernel space for specifying 3 * This module exposes the interface to kernel space for specifying
3 * QoS dependencies. It provides infrastructure for registration of: 4 * QoS dependencies. It provides infrastructure for registration of:
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index bc9558ab1e5b..83105874f255 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * linux/kernel/power/snapshot.c 3 * linux/kernel/power/snapshot.c
3 * 4 *
@@ -5,9 +6,6 @@
5 * 6 *
6 * Copyright (C) 1998-2005 Pavel Machek <pavel@ucw.cz> 7 * Copyright (C) 1998-2005 Pavel Machek <pavel@ucw.cz>
7 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> 8 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
8 *
9 * This file is released under the GPLv2.
10 *
11 */ 9 */
12 10
13#define pr_fmt(fmt) "PM: " fmt 11#define pr_fmt(fmt) "PM: " fmt
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index ef908c134b34..c874a7026e24 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -1,11 +1,10 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * kernel/power/suspend.c - Suspend to RAM and standby functionality. 3 * kernel/power/suspend.c - Suspend to RAM and standby functionality.
3 * 4 *
4 * Copyright (c) 2003 Patrick Mochel 5 * Copyright (c) 2003 Patrick Mochel
5 * Copyright (c) 2003 Open Source Development Lab 6 * Copyright (c) 2003 Open Source Development Lab
6 * Copyright (c) 2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc. 7 * Copyright (c) 2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
7 *
8 * This file is released under the GPLv2.
9 */ 8 */
10 9
11#define pr_fmt(fmt) "PM: " fmt 10#define pr_fmt(fmt) "PM: " fmt
@@ -62,11 +61,17 @@ static DECLARE_SWAIT_QUEUE_HEAD(s2idle_wait_head);
62enum s2idle_states __read_mostly s2idle_state; 61enum s2idle_states __read_mostly s2idle_state;
63static DEFINE_RAW_SPINLOCK(s2idle_lock); 62static DEFINE_RAW_SPINLOCK(s2idle_lock);
64 63
65bool pm_suspend_via_s2idle(void) 64/**
65 * pm_suspend_default_s2idle - Check if suspend-to-idle is the default suspend.
66 *
67 * Return 'true' if suspend-to-idle has been selected as the default system
68 * suspend method.
69 */
70bool pm_suspend_default_s2idle(void)
66{ 71{
67 return mem_sleep_current == PM_SUSPEND_TO_IDLE; 72 return mem_sleep_current == PM_SUSPEND_TO_IDLE;
68} 73}
69EXPORT_SYMBOL_GPL(pm_suspend_via_s2idle); 74EXPORT_SYMBOL_GPL(pm_suspend_default_s2idle);
70 75
71void s2idle_set_ops(const struct platform_s2idle_ops *ops) 76void s2idle_set_ops(const struct platform_s2idle_ops *ops)
72{ 77{
@@ -488,6 +493,9 @@ int suspend_devices_and_enter(suspend_state_t state)
488 493
489 pm_suspend_target_state = state; 494 pm_suspend_target_state = state;
490 495
496 if (state == PM_SUSPEND_TO_IDLE)
497 pm_set_suspend_no_platform();
498
491 error = platform_suspend_begin(state); 499 error = platform_suspend_begin(state);
492 if (error) 500 if (error)
493 goto Close; 501 goto Close;
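Callers of the renamed helper are unchanged apart from the name; a hypothetical user deciding how aggressively to quiesce its hardware might look like this:

#include <linux/suspend.h>

static void my_prepare_for_sleep(void)
{
        if (pm_suspend_default_s2idle()) {
                /* suspend-to-idle: timers and IRQs keep running, stay armed */
        } else {
                /* full S3 path: quiesce the device completely */
        }
}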
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index 6a897e8b2a88..60564b58de07 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -1,9 +1,8 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * kernel/power/suspend_test.c - Suspend to RAM and standby test facility. 3 * kernel/power/suspend_test.c - Suspend to RAM and standby test facility.
3 * 4 *
4 * Copyright (c) 2009 Pavel Machek <pavel@ucw.cz> 5 * Copyright (c) 2009 Pavel Machek <pavel@ucw.cz>
5 *
6 * This file is released under the GPLv2.
7 */ 6 */
8 7
9#include <linux/init.h> 8#include <linux/init.h>
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index d7f6c1a288d3..ca0fcb5ced71 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * linux/kernel/power/swap.c 3 * linux/kernel/power/swap.c
3 * 4 *
@@ -7,9 +8,6 @@
7 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz> 8 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz>
8 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> 9 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
9 * Copyright (C) 2010-2012 Bojan Smojver <bojan@rexursive.com> 10 * Copyright (C) 2010-2012 Bojan Smojver <bojan@rexursive.com>
10 *
11 * This file is released under the GPLv2.
12 *
13 */ 11 */
14 12
15#define pr_fmt(fmt) "PM: " fmt 13#define pr_fmt(fmt) "PM: " fmt
@@ -976,12 +974,11 @@ static int get_swap_reader(struct swap_map_handle *handle,
976 last = handle->maps = NULL; 974 last = handle->maps = NULL;
977 offset = swsusp_header->image; 975 offset = swsusp_header->image;
978 while (offset) { 976 while (offset) {
979 tmp = kmalloc(sizeof(*handle->maps), GFP_KERNEL); 977 tmp = kzalloc(sizeof(*handle->maps), GFP_KERNEL);
980 if (!tmp) { 978 if (!tmp) {
981 release_swap_reader(handle); 979 release_swap_reader(handle);
982 return -ENOMEM; 980 return -ENOMEM;
983 } 981 }
984 memset(tmp, 0, sizeof(*tmp));
985 if (!handle->maps) 982 if (!handle->maps)
986 handle->maps = tmp; 983 handle->maps = tmp;
987 if (last) 984 if (last)
diff --git a/kernel/power/user.c b/kernel/power/user.c
index cb24e840a3e6..77438954cc2b 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -1,12 +1,10 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * linux/kernel/power/user.c 3 * linux/kernel/power/user.c
3 * 4 *
4 * This file provides the user space interface for software suspend/resume. 5 * This file provides the user space interface for software suspend/resume.
5 * 6 *
6 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> 7 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
7 *
8 * This file is released under the GPLv2.
9 *
10 */ 8 */
11 9
12#include <linux/suspend.h> 10#include <linux/suspend.h>
diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile
index 4a2ffc39eb95..4d052fc6bcde 100644
--- a/kernel/printk/Makefile
+++ b/kernel/printk/Makefile
@@ -1,3 +1,4 @@
1# SPDX-License-Identifier: GPL-2.0-only
1obj-y = printk.o 2obj-y = printk.o
2obj-$(CONFIG_PRINTK) += printk_safe.o 3obj-$(CONFIG_PRINTK) += printk_safe.o
3obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o 4obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index 0f1898820cba..c8e6ab689d42 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -1,18 +1,6 @@
1/* SPDX-License-Identifier: GPL-2.0-or-later */
1/* 2/*
2 * internal.h - printk internal definitions 3 * internal.h - printk internal definitions
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version 2
7 * of the License, or (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, see <http://www.gnu.org/licenses/>.
16 */ 4 */
17#include <linux/percpu.h> 5#include <linux/percpu.h>
18 6
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 02ca827b8fac..1888f6a3b694 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * linux/kernel/printk.c 3 * linux/kernel/printk.c
3 * 4 *
@@ -86,6 +87,12 @@ static DEFINE_SEMAPHORE(console_sem);
86struct console *console_drivers; 87struct console *console_drivers;
87EXPORT_SYMBOL_GPL(console_drivers); 88EXPORT_SYMBOL_GPL(console_drivers);
88 89
90/*
91 * System may need to suppress printk message under certain
92 * circumstances, like after kernel panic happens.
93 */
94int __read_mostly suppress_printk;
95
89#ifdef CONFIG_LOCKDEP 96#ifdef CONFIG_LOCKDEP
90static struct lockdep_map console_lock_dep_map = { 97static struct lockdep_map console_lock_dep_map = {
91 .name = "console_lock" 98 .name = "console_lock"
@@ -1943,6 +1950,10 @@ asmlinkage int vprintk_emit(int facility, int level,
1943 unsigned long flags; 1950 unsigned long flags;
1944 u64 curr_log_seq; 1951 u64 curr_log_seq;
1945 1952
1953 /* Suppress unimportant messages after panic happens */
1954 if (unlikely(suppress_printk))
1955 return 0;
1956
1946 if (level == LOGLEVEL_SCHED) { 1957 if (level == LOGLEVEL_SCHED) {
1947 level = LOGLEVEL_DEFAULT; 1958 level = LOGLEVEL_DEFAULT;
1948 in_sched = true; 1959 in_sched = true;
@@ -2525,10 +2536,11 @@ void console_unblank(void)
2525 2536
2526/** 2537/**
2527 * console_flush_on_panic - flush console content on panic 2538 * console_flush_on_panic - flush console content on panic
2539 * @mode: flush all messages in buffer or just the pending ones
2528 * 2540 *
2529 * Immediately output all pending messages no matter what. 2541 * Immediately output all pending messages no matter what.
2530 */ 2542 */
2531void console_flush_on_panic(void) 2543void console_flush_on_panic(enum con_flush_mode mode)
2532{ 2544{
2533 /* 2545 /*
2534 * If someone else is holding the console lock, trylock will fail 2546 * If someone else is holding the console lock, trylock will fail
@@ -2539,6 +2551,15 @@ void console_flush_on_panic(void)
2539 */ 2551 */
2540 console_trylock(); 2552 console_trylock();
2541 console_may_schedule = 0; 2553 console_may_schedule = 0;
2554
2555 if (mode == CONSOLE_REPLAY_ALL) {
2556 unsigned long flags;
2557
2558 logbuf_lock_irqsave(flags);
2559 console_seq = log_first_seq;
2560 console_idx = log_first_idx;
2561 logbuf_unlock_irqrestore(flags);
2562 }
2542 console_unlock(); 2563 console_unlock();
2543} 2564}
2544 2565
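The new @mode argument lets panic() choose between flushing only what is still pending (the old behaviour) and rewinding console_seq/console_idx to replay the entire log buffer. The companion include/linux/console.h change is expected to declare the selector roughly as:

/* Assumed shape of the new selector in <linux/console.h>. */
enum con_flush_mode {
        CONSOLE_FLUSH_PENDING,
        CONSOLE_REPLAY_ALL,
};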
diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
index 0913b4d385de..b4045e782743 100644
--- a/kernel/printk/printk_safe.c
+++ b/kernel/printk/printk_safe.c
@@ -1,18 +1,6 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
1/* 2/*
2 * printk_safe.c - Safe printk for printk-deadlock-prone contexts 3 * printk_safe.c - Safe printk for printk-deadlock-prone contexts
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version 2
7 * of the License, or (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, see <http://www.gnu.org/licenses/>.
16 */ 4 */
17 5
18#include <linux/preempt.h> 6#include <linux/preempt.h>
diff --git a/kernel/profile.c b/kernel/profile.c
index 9c08a2c7cb1d..af7c94bf5fa1 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * linux/kernel/profile.c 3 * linux/kernel/profile.c
3 * Simple profiling. Manages a direct-mapped profile hit count buffer, 4 * Simple profiling. Manages a direct-mapped profile hit count buffer,
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 6f357f4fc859..83a531cea2f3 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * linux/kernel/ptrace.c 3 * linux/kernel/ptrace.c
3 * 4 *
@@ -78,9 +79,7 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent,
78 */ 79 */
79static void ptrace_link(struct task_struct *child, struct task_struct *new_parent) 80static void ptrace_link(struct task_struct *child, struct task_struct *new_parent)
80{ 81{
81 rcu_read_lock(); 82 __ptrace_link(child, new_parent, current_cred());
82 __ptrace_link(child, new_parent, __task_cred(new_parent));
83 rcu_read_unlock();
84} 83}
85 84
86/** 85/**
@@ -117,6 +116,9 @@ void __ptrace_unlink(struct task_struct *child)
117 BUG_ON(!child->ptrace); 116 BUG_ON(!child->ptrace);
118 117
119 clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); 118 clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
119#ifdef TIF_SYSCALL_EMU
120 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
121#endif
120 122
121 child->parent = child->real_parent; 123 child->parent = child->real_parent;
122 list_del_init(&child->ptrace_entry); 124 list_del_init(&child->ptrace_entry);
@@ -323,6 +325,16 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
323 return -EPERM; 325 return -EPERM;
324ok: 326ok:
325 rcu_read_unlock(); 327 rcu_read_unlock();
328 /*
329 * If a task drops privileges and becomes nondumpable (through a syscall
330 * like setresuid()) while we are trying to access it, we must ensure
331 * that the dumpability is read after the credentials; otherwise,
332 * we may be able to attach to a task that we shouldn't be able to
333 * attach to (as if the task had dropped privileges without becoming
334 * nondumpable).
335 * Pairs with a write barrier in commit_creds().
336 */
337 smp_rmb();
326 mm = task->mm; 338 mm = task->mm;
327 if (mm && 339 if (mm &&
328 ((get_dumpable(mm) != SUID_DUMP_USER) && 340 ((get_dumpable(mm) != SUID_DUMP_USER) &&
@@ -704,6 +716,10 @@ static int ptrace_peek_siginfo(struct task_struct *child,
704 if (arg.nr < 0) 716 if (arg.nr < 0)
705 return -EINVAL; 717 return -EINVAL;
706 718
719 /* Ensure arg.off fits in an unsigned long */
720 if (arg.off > ULONG_MAX)
721 return 0;
722
707 if (arg.flags & PTRACE_PEEKSIGINFO_SHARED) 723 if (arg.flags & PTRACE_PEEKSIGINFO_SHARED)
708 pending = &child->signal->shared_pending; 724 pending = &child->signal->shared_pending;
709 else 725 else
@@ -711,18 +727,20 @@ static int ptrace_peek_siginfo(struct task_struct *child,
711 727
712 for (i = 0; i < arg.nr; ) { 728 for (i = 0; i < arg.nr; ) {
713 kernel_siginfo_t info; 729 kernel_siginfo_t info;
714 s32 off = arg.off + i; 730 unsigned long off = arg.off + i;
731 bool found = false;
715 732
716 spin_lock_irq(&child->sighand->siglock); 733 spin_lock_irq(&child->sighand->siglock);
717 list_for_each_entry(q, &pending->list, list) { 734 list_for_each_entry(q, &pending->list, list) {
718 if (!off--) { 735 if (!off--) {
736 found = true;
719 copy_siginfo(&info, &q->info); 737 copy_siginfo(&info, &q->info);
720 break; 738 break;
721 } 739 }
722 } 740 }
723 spin_unlock_irq(&child->sighand->siglock); 741 spin_unlock_irq(&child->sighand->siglock);
724 742
725 if (off >= 0) /* beyond the end of the list */ 743 if (!found) /* beyond the end of the list */
726 break; 744 break;
727 745
728#ifdef CONFIG_COMPAT 746#ifdef CONFIG_COMPAT
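The ptrace_peek_siginfo() change guards against user-supplied offsets that do not survive the old 32-bit signed arithmetic. A small user-space illustration (not part of the patch) of the truncation that the unsigned off plus the ULONG_MAX check now avoid:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t user_off = 0x100000000ULL;             /* offset as supplied in arg.off */
        int32_t old_off = (int32_t)(uint32_t)user_off;  /* what "s32 off" used to see: 0 */

        printf("requested offset %" PRIu64 ", 32-bit view %" PRId32 "\n",
               user_off, old_off);
        return 0;
}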
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index 37301430970e..480edf328b51 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -1,3 +1,4 @@
1# SPDX-License-Identifier: GPL-2.0-only
1# 2#
2# RCU-related configuration options 3# RCU-related configuration options
3# 4#
diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug
index 0ec7d1d33a14..5ec3ea4028e2 100644
--- a/kernel/rcu/Kconfig.debug
+++ b/kernel/rcu/Kconfig.debug
@@ -1,3 +1,4 @@
1# SPDX-License-Identifier: GPL-2.0-only
1# 2#
2# RCU-related debugging configuration options 3# RCU-related debugging configuration options
3# 4#
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 4b58c907b4b7..5290b01de534 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -11,11 +11,6 @@
11#define __LINUX_RCU_H 11#define __LINUX_RCU_H
12 12
13#include <trace/events/rcu.h> 13#include <trace/events/rcu.h>
14#ifdef CONFIG_RCU_TRACE
15#define RCU_TRACE(stmt) stmt
16#else /* #ifdef CONFIG_RCU_TRACE */
17#define RCU_TRACE(stmt)
18#endif /* #else #ifdef CONFIG_RCU_TRACE */
19 14
20/* Offset to allow distinguishing irq vs. task-based idle entry/exit. */ 15/* Offset to allow distinguishing irq vs. task-based idle entry/exit. */
21#define DYNTICK_IRQ_NONIDLE ((LONG_MAX / 2) + 1) 16#define DYNTICK_IRQ_NONIDLE ((LONG_MAX / 2) + 1)
@@ -216,12 +211,12 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
216 211
217 rcu_lock_acquire(&rcu_callback_map); 212 rcu_lock_acquire(&rcu_callback_map);
218 if (__is_kfree_rcu_offset(offset)) { 213 if (__is_kfree_rcu_offset(offset)) {
219 RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset);) 214 trace_rcu_invoke_kfree_callback(rn, head, offset);
220 kfree((void *)head - offset); 215 kfree((void *)head - offset);
221 rcu_lock_release(&rcu_callback_map); 216 rcu_lock_release(&rcu_callback_map);
222 return true; 217 return true;
223 } else { 218 } else {
224 RCU_TRACE(trace_rcu_invoke_callback(rn, head);) 219 trace_rcu_invoke_callback(rn, head);
225 f = head->func; 220 f = head->func;
226 WRITE_ONCE(head->func, (rcu_callback_t)0L); 221 WRITE_ONCE(head->func, (rcu_callback_t)0L);
227 f(head); 222 f(head);
@@ -451,6 +446,7 @@ void rcu_request_urgent_qs_task(struct task_struct *t);
451enum rcutorture_type { 446enum rcutorture_type {
452 RCU_FLAVOR, 447 RCU_FLAVOR,
453 RCU_TASKS_FLAVOR, 448 RCU_TASKS_FLAVOR,
449 RCU_TRIVIAL_FLAVOR,
454 SRCU_FLAVOR, 450 SRCU_FLAVOR,
455 INVALID_RCU_FLAVOR 451 INVALID_RCU_FLAVOR
456}; 452};
@@ -484,6 +480,10 @@ void do_trace_rcu_torture_read(const char *rcutorturename,
484#endif 480#endif
485#endif 481#endif
486 482
483#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST)
484long rcutorture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask);
485#endif
486
487#ifdef CONFIG_TINY_SRCU 487#ifdef CONFIG_TINY_SRCU
488 488
489static inline void srcutorture_get_gp_data(enum rcutorture_type test_type, 489static inline void srcutorture_get_gp_data(enum rcutorture_type test_type,
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index efaa5b3f4d3f..fce4e7e6f502 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -299,6 +299,7 @@ struct rcu_torture_ops {
299 int irq_capable; 299 int irq_capable;
300 int can_boost; 300 int can_boost;
301 int extendables; 301 int extendables;
302 int slow_gps;
302 const char *name; 303 const char *name;
303}; 304};
304 305
@@ -667,9 +668,51 @@ static struct rcu_torture_ops tasks_ops = {
667 .fqs = NULL, 668 .fqs = NULL,
668 .stats = NULL, 669 .stats = NULL,
669 .irq_capable = 1, 670 .irq_capable = 1,
671 .slow_gps = 1,
670 .name = "tasks" 672 .name = "tasks"
671}; 673};
672 674
675/*
676 * Definitions for trivial CONFIG_PREEMPT=n-only torture testing.
677 * This implementation does not necessarily work well with CPU hotplug.
678 */
679
680static void synchronize_rcu_trivial(void)
681{
682 int cpu;
683
684 for_each_online_cpu(cpu) {
685 rcutorture_sched_setaffinity(current->pid, cpumask_of(cpu));
686 WARN_ON_ONCE(raw_smp_processor_id() != cpu);
687 }
688}
689
690static int rcu_torture_read_lock_trivial(void) __acquires(RCU)
691{
692 preempt_disable();
693 return 0;
694}
695
696static void rcu_torture_read_unlock_trivial(int idx) __releases(RCU)
697{
698 preempt_enable();
699}
700
701static struct rcu_torture_ops trivial_ops = {
702 .ttype = RCU_TRIVIAL_FLAVOR,
703 .init = rcu_sync_torture_init,
704 .readlock = rcu_torture_read_lock_trivial,
705 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
706 .readunlock = rcu_torture_read_unlock_trivial,
707 .get_gp_seq = rcu_no_completed,
708 .sync = synchronize_rcu_trivial,
709 .exp_sync = synchronize_rcu_trivial,
710 .fqs = NULL,
711 .stats = NULL,
712 .irq_capable = 1,
713 .name = "trivial"
714};
715
673static unsigned long rcutorture_seq_diff(unsigned long new, unsigned long old) 716static unsigned long rcutorture_seq_diff(unsigned long new, unsigned long old)
674{ 717{
675 if (!cur_ops->gp_diff) 718 if (!cur_ops->gp_diff)
@@ -1010,10 +1053,17 @@ rcu_torture_writer(void *arg)
1010 !rcu_gp_is_normal(); 1053 !rcu_gp_is_normal();
1011 } 1054 }
1012 rcu_torture_writer_state = RTWS_STUTTER; 1055 rcu_torture_writer_state = RTWS_STUTTER;
1013 if (stutter_wait("rcu_torture_writer")) 1056 if (stutter_wait("rcu_torture_writer") &&
1057 !READ_ONCE(rcu_fwd_cb_nodelay) &&
1058 !cur_ops->slow_gps &&
1059 !torture_must_stop())
1014 for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) 1060 for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++)
1015 if (list_empty(&rcu_tortures[i].rtort_free)) 1061 if (list_empty(&rcu_tortures[i].rtort_free) &&
1016 WARN_ON_ONCE(1); 1062 rcu_access_pointer(rcu_torture_current) !=
1063 &rcu_tortures[i]) {
1064 rcu_ftrace_dump(DUMP_ALL);
1065 WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count);
1066 }
1017 } while (!torture_must_stop()); 1067 } while (!torture_must_stop());
1018 /* Reset expediting back to unexpedited. */ 1068 /* Reset expediting back to unexpedited. */
1019 if (expediting > 0) 1069 if (expediting > 0)
@@ -1358,8 +1408,9 @@ rcu_torture_stats_print(void)
1358 } 1408 }
1359 1409
1360 pr_alert("%s%s ", torture_type, TORTURE_FLAG); 1410 pr_alert("%s%s ", torture_type, TORTURE_FLAG);
1361 pr_cont("rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", 1411 pr_cont("rtc: %p %s: %lu tfle: %d rta: %d rtaf: %d rtf: %d ",
1362 rcu_torture_current, 1412 rcu_torture_current,
1413 rcu_torture_current ? "ver" : "VER",
1363 rcu_torture_current_version, 1414 rcu_torture_current_version,
1364 list_empty(&rcu_torture_freelist), 1415 list_empty(&rcu_torture_freelist),
1365 atomic_read(&n_rcu_torture_alloc), 1416 atomic_read(&n_rcu_torture_alloc),
@@ -1661,6 +1712,17 @@ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp)
1661 spin_unlock_irqrestore(&rcu_fwd_lock, flags); 1712 spin_unlock_irqrestore(&rcu_fwd_lock, flags);
1662} 1713}
1663 1714
1715// Give the scheduler a chance, even on nohz_full CPUs.
1716static void rcu_torture_fwd_prog_cond_resched(void)
1717{
1718 if (IS_ENABLED(CONFIG_PREEMPT) && IS_ENABLED(CONFIG_NO_HZ_FULL)) {
1719 if (need_resched())
1720 schedule();
1721 } else {
1722 cond_resched();
1723 }
1724}
1725
1664/* 1726/*
1665 * Free all callbacks on the rcu_fwd_cb_head list, either because the 1727 * Free all callbacks on the rcu_fwd_cb_head list, either because the
1666 * test is over or because we hit an OOM event. 1728 * test is over or because we hit an OOM event.
@@ -1674,16 +1736,18 @@ static unsigned long rcu_torture_fwd_prog_cbfree(void)
1674 for (;;) { 1736 for (;;) {
1675 spin_lock_irqsave(&rcu_fwd_lock, flags); 1737 spin_lock_irqsave(&rcu_fwd_lock, flags);
1676 rfcp = rcu_fwd_cb_head; 1738 rfcp = rcu_fwd_cb_head;
1677 if (!rfcp) 1739 if (!rfcp) {
1740 spin_unlock_irqrestore(&rcu_fwd_lock, flags);
1678 break; 1741 break;
1742 }
1679 rcu_fwd_cb_head = rfcp->rfc_next; 1743 rcu_fwd_cb_head = rfcp->rfc_next;
1680 if (!rcu_fwd_cb_head) 1744 if (!rcu_fwd_cb_head)
1681 rcu_fwd_cb_tail = &rcu_fwd_cb_head; 1745 rcu_fwd_cb_tail = &rcu_fwd_cb_head;
1682 spin_unlock_irqrestore(&rcu_fwd_lock, flags); 1746 spin_unlock_irqrestore(&rcu_fwd_lock, flags);
1683 kfree(rfcp); 1747 kfree(rfcp);
1684 freed++; 1748 freed++;
1749 rcu_torture_fwd_prog_cond_resched();
1685 } 1750 }
1686 spin_unlock_irqrestore(&rcu_fwd_lock, flags);
1687 return freed; 1751 return freed;
1688} 1752}
1689 1753
@@ -1707,6 +1771,8 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries)
1707 } 1771 }
1708 1772
1709 /* Tight loop containing cond_resched(). */ 1773 /* Tight loop containing cond_resched(). */
1774 WRITE_ONCE(rcu_fwd_cb_nodelay, true);
1775 cur_ops->sync(); /* Later readers see above write. */
1710 if (selfpropcb) { 1776 if (selfpropcb) {
1711 WRITE_ONCE(fcs.stop, 0); 1777 WRITE_ONCE(fcs.stop, 0);
1712 cur_ops->call(&fcs.rh, rcu_torture_fwd_prog_cb); 1778 cur_ops->call(&fcs.rh, rcu_torture_fwd_prog_cb);
@@ -1724,7 +1790,7 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries)
1724 udelay(10); 1790 udelay(10);
1725 cur_ops->readunlock(idx); 1791 cur_ops->readunlock(idx);
1726 if (!fwd_progress_need_resched || need_resched()) 1792 if (!fwd_progress_need_resched || need_resched())
1727 cond_resched(); 1793 rcu_torture_fwd_prog_cond_resched();
1728 } 1794 }
1729 (*tested_tries)++; 1795 (*tested_tries)++;
1730 if (!time_before(jiffies, stopat) && 1796 if (!time_before(jiffies, stopat) &&
@@ -1745,6 +1811,8 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries)
1745 WARN_ON(READ_ONCE(fcs.stop) != 2); 1811 WARN_ON(READ_ONCE(fcs.stop) != 2);
1746 destroy_rcu_head_on_stack(&fcs.rh); 1812 destroy_rcu_head_on_stack(&fcs.rh);
1747 } 1813 }
1814 schedule_timeout_uninterruptible(HZ / 10); /* Let kthreads recover. */
1815 WRITE_ONCE(rcu_fwd_cb_nodelay, false);
1748} 1816}
1749 1817
1750/* Carry out call_rcu() forward-progress testing. */ 1818/* Carry out call_rcu() forward-progress testing. */
@@ -1765,6 +1833,8 @@ static void rcu_torture_fwd_prog_cr(void)
1765 1833
1766 if (READ_ONCE(rcu_fwd_emergency_stop)) 1834 if (READ_ONCE(rcu_fwd_emergency_stop))
1767 return; /* Get out of the way quickly, no GP wait! */ 1835 return; /* Get out of the way quickly, no GP wait! */
1836 if (!cur_ops->call)
1837 return; /* Can't do call_rcu() fwd prog without ->call. */
1768 1838
1769 /* Loop continuously posting RCU callbacks. */ 1839 /* Loop continuously posting RCU callbacks. */
1770 WRITE_ONCE(rcu_fwd_cb_nodelay, true); 1840 WRITE_ONCE(rcu_fwd_cb_nodelay, true);
@@ -1805,7 +1875,7 @@ static void rcu_torture_fwd_prog_cr(void)
1805 rfcp->rfc_gps = 0; 1875 rfcp->rfc_gps = 0;
1806 } 1876 }
1807 cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr); 1877 cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr);
1808 cond_resched(); 1878 rcu_torture_fwd_prog_cond_resched();
1809 } 1879 }
1810 stoppedat = jiffies; 1880 stoppedat = jiffies;
1811 n_launders_cb_snap = READ_ONCE(n_launders_cb); 1881 n_launders_cb_snap = READ_ONCE(n_launders_cb);
@@ -1814,7 +1884,6 @@ static void rcu_torture_fwd_prog_cr(void)
1814 cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. */ 1884 cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. */
1815 (void)rcu_torture_fwd_prog_cbfree(); 1885 (void)rcu_torture_fwd_prog_cbfree();
1816 1886
1817 WRITE_ONCE(rcu_fwd_cb_nodelay, false);
1818 if (!torture_must_stop() && !READ_ONCE(rcu_fwd_emergency_stop)) { 1887 if (!torture_must_stop() && !READ_ONCE(rcu_fwd_emergency_stop)) {
1819 WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED); 1888 WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED);
1820 pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld\n", 1889 pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld\n",
@@ -1825,6 +1894,8 @@ static void rcu_torture_fwd_prog_cr(void)
1825 n_max_gps, n_max_cbs, cver, gps); 1894 n_max_gps, n_max_cbs, cver, gps);
1826 rcu_torture_fwd_cb_hist(); 1895 rcu_torture_fwd_cb_hist();
1827 } 1896 }
1897 schedule_timeout_uninterruptible(HZ); /* Let CBs drain. */
1898 WRITE_ONCE(rcu_fwd_cb_nodelay, false);
1828} 1899}
1829 1900
1830 1901
@@ -2240,7 +2311,7 @@ rcu_torture_init(void)
2240 int firsterr = 0; 2311 int firsterr = 0;
2241 static struct rcu_torture_ops *torture_ops[] = { 2312 static struct rcu_torture_ops *torture_ops[] = {
2242 &rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, 2313 &rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops,
2243 &busted_srcud_ops, &tasks_ops, 2314 &busted_srcud_ops, &tasks_ops, &trivial_ops,
2244 }; 2315 };
2245 2316
2246 if (!torture_init_begin(torture_type, verbose)) 2317 if (!torture_init_begin(torture_type, verbose))
@@ -2363,7 +2434,10 @@ rcu_torture_init(void)
2363 if (stutter < 0) 2434 if (stutter < 0)
2364 stutter = 0; 2435 stutter = 0;
2365 if (stutter) { 2436 if (stutter) {
2366 firsterr = torture_stutter_init(stutter * HZ); 2437 int t;
2438
2439 t = cur_ops->stall_dur ? cur_ops->stall_dur() : stutter * HZ;
2440 firsterr = torture_stutter_init(stutter * HZ, t);
2367 if (firsterr) 2441 if (firsterr)
2368 goto unwind; 2442 goto unwind;
2369 } 2443 }
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 9b761e546de8..cf0e886314f2 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -831,8 +831,8 @@ static void srcu_leak_callback(struct rcu_head *rhp)
831 * srcu_read_lock(), and srcu_read_unlock() that are all passed the same 831 * srcu_read_lock(), and srcu_read_unlock() that are all passed the same
832 * srcu_struct structure. 832 * srcu_struct structure.
833 */ 833 */
834void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, 834static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
835 rcu_callback_t func, bool do_norm) 835 rcu_callback_t func, bool do_norm)
836{ 836{
837 unsigned long flags; 837 unsigned long flags;
838 int idx; 838 int idx;
@@ -1310,3 +1310,68 @@ void __init srcu_init(void)
1310 queue_work(rcu_gp_wq, &ssp->work.work); 1310 queue_work(rcu_gp_wq, &ssp->work.work);
1311 } 1311 }
1312} 1312}
1313
1314#ifdef CONFIG_MODULES
1315
1316/* Initialize any global-scope srcu_struct structures used by this module. */
1317static int srcu_module_coming(struct module *mod)
1318{
1319 int i;
1320 struct srcu_struct **sspp = mod->srcu_struct_ptrs;
1321 int ret;
1322
1323 for (i = 0; i < mod->num_srcu_structs; i++) {
1324 ret = init_srcu_struct(*(sspp++));
1325 if (WARN_ON_ONCE(ret))
1326 return ret;
1327 }
1328 return 0;
1329}
1330
1331/* Clean up any global-scope srcu_struct structures used by this module. */
1332static void srcu_module_going(struct module *mod)
1333{
1334 int i;
1335 struct srcu_struct **sspp = mod->srcu_struct_ptrs;
1336
1337 for (i = 0; i < mod->num_srcu_structs; i++)
1338 cleanup_srcu_struct(*(sspp++));
1339}
1340
1341/* Handle one module, either coming or going. */
1342static int srcu_module_notify(struct notifier_block *self,
1343 unsigned long val, void *data)
1344{
1345 struct module *mod = data;
1346 int ret = 0;
1347
1348 switch (val) {
1349 case MODULE_STATE_COMING:
1350 ret = srcu_module_coming(mod);
1351 break;
1352 case MODULE_STATE_GOING:
1353 srcu_module_going(mod);
1354 break;
1355 default:
1356 break;
1357 }
1358 return ret;
1359}
1360
1361static struct notifier_block srcu_module_nb = {
1362 .notifier_call = srcu_module_notify,
1363 .priority = 0,
1364};
1365
1366static __init int init_srcu_module_notifier(void)
1367{
1368 int ret;
1369
1370 ret = register_module_notifier(&srcu_module_nb);
1371 if (ret)
1372 pr_warn("Failed to register srcu module notifier\n");
1373 return ret;
1374}
1375late_initcall(init_srcu_module_notifier);
1376
1377#endif /* #ifdef CONFIG_MODULES */
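Together with the ___srcu_struct_ptrs section picked up by find_module_sections() earlier in this series, the notifier lets modules use statically defined SRCU. A sketch of what such a module might look like, assuming the companion srcu.h change records DEFINE_STATIC_SRCU() instances in that section so the MODULE_STATE_COMING hook initializes them and MODULE_STATE_GOING cleans them up:

#include <linux/module.h>
#include <linux/srcu.h>

DEFINE_STATIC_SRCU(my_srcu);    /* assumed to land in ___srcu_struct_ptrs */

static int __init my_init(void)
{
        int idx = srcu_read_lock(&my_srcu);
        /* ... read-side critical section ... */
        srcu_read_unlock(&my_srcu, idx);
        return 0;
}

static void __exit my_exit(void)
{
        /* wait for in-flight readers before the module text goes away */
        synchronize_srcu(&my_srcu);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");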
diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
index a8304d90573f..d4558ab7a07d 100644
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -10,65 +10,18 @@
10#include <linux/rcu_sync.h> 10#include <linux/rcu_sync.h>
11#include <linux/sched.h> 11#include <linux/sched.h>
12 12
13#ifdef CONFIG_PROVE_RCU 13enum { GP_IDLE = 0, GP_ENTER, GP_PASSED, GP_EXIT, GP_REPLAY };
14#define __INIT_HELD(func) .held = func,
15#else
16#define __INIT_HELD(func)
17#endif
18
19static const struct {
20 void (*sync)(void);
21 void (*call)(struct rcu_head *, void (*)(struct rcu_head *));
22 void (*wait)(void);
23#ifdef CONFIG_PROVE_RCU
24 int (*held)(void);
25#endif
26} gp_ops[] = {
27 [RCU_SYNC] = {
28 .sync = synchronize_rcu,
29 .call = call_rcu,
30 .wait = rcu_barrier,
31 __INIT_HELD(rcu_read_lock_held)
32 },
33 [RCU_SCHED_SYNC] = {
34 .sync = synchronize_rcu,
35 .call = call_rcu,
36 .wait = rcu_barrier,
37 __INIT_HELD(rcu_read_lock_sched_held)
38 },
39 [RCU_BH_SYNC] = {
40 .sync = synchronize_rcu,
41 .call = call_rcu,
42 .wait = rcu_barrier,
43 __INIT_HELD(rcu_read_lock_bh_held)
44 },
45};
46
47enum { GP_IDLE = 0, GP_PENDING, GP_PASSED };
48enum { CB_IDLE = 0, CB_PENDING, CB_REPLAY };
49 14
50#define rss_lock gp_wait.lock 15#define rss_lock gp_wait.lock
51 16
52#ifdef CONFIG_PROVE_RCU
53void rcu_sync_lockdep_assert(struct rcu_sync *rsp)
54{
55 RCU_LOCKDEP_WARN(!gp_ops[rsp->gp_type].held(),
56 "suspicious rcu_sync_is_idle() usage");
57}
58
59EXPORT_SYMBOL_GPL(rcu_sync_lockdep_assert);
60#endif
61
62/** 17/**
63 * rcu_sync_init() - Initialize an rcu_sync structure 18 * rcu_sync_init() - Initialize an rcu_sync structure
64 * @rsp: Pointer to rcu_sync structure to be initialized 19 * @rsp: Pointer to rcu_sync structure to be initialized
65 * @type: Flavor of RCU with which to synchronize rcu_sync structure
66 */ 20 */
67void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type) 21void rcu_sync_init(struct rcu_sync *rsp)
68{ 22{
69 memset(rsp, 0, sizeof(*rsp)); 23 memset(rsp, 0, sizeof(*rsp));
70 init_waitqueue_head(&rsp->gp_wait); 24 init_waitqueue_head(&rsp->gp_wait);
71 rsp->gp_type = type;
72} 25}
73 26
74/** 27/**
@@ -86,56 +39,26 @@ void rcu_sync_enter_start(struct rcu_sync *rsp)
86 rsp->gp_state = GP_PASSED; 39 rsp->gp_state = GP_PASSED;
87} 40}
88 41
89/**
90 * rcu_sync_enter() - Force readers onto slowpath
91 * @rsp: Pointer to rcu_sync structure to use for synchronization
92 *
93 * This function is used by updaters who need readers to make use of
94 * a slowpath during the update. After this function returns, all
95 * subsequent calls to rcu_sync_is_idle() will return false, which
96 * tells readers to stay off their fastpaths. A later call to
97 * rcu_sync_exit() re-enables reader slowpaths.
98 *
99 * When called in isolation, rcu_sync_enter() must wait for a grace
100 * period, however, closely spaced calls to rcu_sync_enter() can
101 * optimize away the grace-period wait via a state machine implemented
102 * by rcu_sync_enter(), rcu_sync_exit(), and rcu_sync_func().
103 */
104void rcu_sync_enter(struct rcu_sync *rsp)
105{
106 bool need_wait, need_sync;
107 42
108 spin_lock_irq(&rsp->rss_lock); 43static void rcu_sync_func(struct rcu_head *rhp);
109 need_wait = rsp->gp_count++;
110 need_sync = rsp->gp_state == GP_IDLE;
111 if (need_sync)
112 rsp->gp_state = GP_PENDING;
113 spin_unlock_irq(&rsp->rss_lock);
114 44
115 WARN_ON_ONCE(need_wait && need_sync); 45static void rcu_sync_call(struct rcu_sync *rsp)
116 if (need_sync) { 46{
117 gp_ops[rsp->gp_type].sync(); 47 call_rcu(&rsp->cb_head, rcu_sync_func);
118 rsp->gp_state = GP_PASSED;
119 wake_up_all(&rsp->gp_wait);
120 } else if (need_wait) {
121 wait_event(rsp->gp_wait, rsp->gp_state == GP_PASSED);
122 } else {
123 /*
124 * Possible when there's a pending CB from a rcu_sync_exit().
125 * Nobody has yet been allowed the 'fast' path and thus we can
126 * avoid doing any sync(). The callback will get 'dropped'.
127 */
128 WARN_ON_ONCE(rsp->gp_state != GP_PASSED);
129 }
130} 48}
131 49
132/** 50/**
133 * rcu_sync_func() - Callback function managing reader access to fastpath 51 * rcu_sync_func() - Callback function managing reader access to fastpath
134 * @rhp: Pointer to rcu_head in rcu_sync structure to use for synchronization 52 * @rhp: Pointer to rcu_head in rcu_sync structure to use for synchronization
135 * 53 *
136 * This function is passed to one of the call_rcu() functions by 54 * This function is passed to call_rcu() function by rcu_sync_enter() and
137 * rcu_sync_exit(), so that it is invoked after a grace period following the 55 * rcu_sync_exit(), so that it is invoked after a grace period following the
138 * that invocation of rcu_sync_exit(). It takes action based on events that 56 * that invocation of enter/exit.
57 *
58 * If it is called by rcu_sync_enter() it signals that all the readers were
59 * switched onto slow path.
60 *
61 * If it is called by rcu_sync_exit() it takes action based on events that
139 * have taken place in the meantime, so that closely spaced rcu_sync_enter() 62 * have taken place in the meantime, so that closely spaced rcu_sync_enter()
140 * and rcu_sync_exit() pairs need not wait for a grace period. 63 * and rcu_sync_exit() pairs need not wait for a grace period.
141 * 64 *
@@ -152,35 +75,88 @@ static void rcu_sync_func(struct rcu_head *rhp)
152 struct rcu_sync *rsp = container_of(rhp, struct rcu_sync, cb_head); 75 struct rcu_sync *rsp = container_of(rhp, struct rcu_sync, cb_head);
153 unsigned long flags; 76 unsigned long flags;
154 77
155 WARN_ON_ONCE(rsp->gp_state != GP_PASSED); 78 WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_IDLE);
156 WARN_ON_ONCE(rsp->cb_state == CB_IDLE); 79 WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_PASSED);
157 80
158 spin_lock_irqsave(&rsp->rss_lock, flags); 81 spin_lock_irqsave(&rsp->rss_lock, flags);
159 if (rsp->gp_count) { 82 if (rsp->gp_count) {
160 /* 83 /*
161 * A new rcu_sync_begin() has happened; drop the callback. 84 * We're at least a GP after the GP_IDLE->GP_ENTER transition.
162 */ 85 */
163 rsp->cb_state = CB_IDLE; 86 WRITE_ONCE(rsp->gp_state, GP_PASSED);
164 } else if (rsp->cb_state == CB_REPLAY) { 87 wake_up_locked(&rsp->gp_wait);
88 } else if (rsp->gp_state == GP_REPLAY) {
165 /* 89 /*
166 * A new rcu_sync_exit() has happened; requeue the callback 90 * A new rcu_sync_exit() has happened; requeue the callback to
167 * to catch a later GP. 91 * catch a later GP.
168 */ 92 */
169 rsp->cb_state = CB_PENDING; 93 WRITE_ONCE(rsp->gp_state, GP_EXIT);
170 gp_ops[rsp->gp_type].call(&rsp->cb_head, rcu_sync_func); 94 rcu_sync_call(rsp);
171 } else { 95 } else {
172 /* 96 /*
173 * We're at least a GP after rcu_sync_exit(); eveybody will now 97 * We're at least a GP after the last rcu_sync_exit(); eveybody
174 * have observed the write side critical section. Let 'em rip!. 98 * will now have observed the write side critical section.
99 * Let 'em rip!.
175 */ 100 */
176 rsp->cb_state = CB_IDLE; 101 WRITE_ONCE(rsp->gp_state, GP_IDLE);
177 rsp->gp_state = GP_IDLE;
178 } 102 }
179 spin_unlock_irqrestore(&rsp->rss_lock, flags); 103 spin_unlock_irqrestore(&rsp->rss_lock, flags);
180} 104}
181 105
182/** 106/**
183 * rcu_sync_exit() - Allow readers back onto fast patch after grace period 107 * rcu_sync_enter() - Force readers onto slowpath
108 * @rsp: Pointer to rcu_sync structure to use for synchronization
109 *
110 * This function is used by updaters who need readers to make use of
111 * a slowpath during the update. After this function returns, all
112 * subsequent calls to rcu_sync_is_idle() will return false, which
113 * tells readers to stay off their fastpaths. A later call to
114 * rcu_sync_exit() re-enables reader slowpaths.
115 *
116 * When called in isolation, rcu_sync_enter() must wait for a grace
117 * period, however, closely spaced calls to rcu_sync_enter() can
118 * optimize away the grace-period wait via a state machine implemented
119 * by rcu_sync_enter(), rcu_sync_exit(), and rcu_sync_func().
120 */
121void rcu_sync_enter(struct rcu_sync *rsp)
122{
123 int gp_state;
124
125 spin_lock_irq(&rsp->rss_lock);
126 gp_state = rsp->gp_state;
127 if (gp_state == GP_IDLE) {
128 WRITE_ONCE(rsp->gp_state, GP_ENTER);
129 WARN_ON_ONCE(rsp->gp_count);
130 /*
131 * Note that we could simply do rcu_sync_call(rsp) here and
132 * avoid the "if (gp_state == GP_IDLE)" block below.
133 *
134 * However, synchronize_rcu() can be faster if rcu_expedited
135 * or rcu_blocking_is_gp() is true.
136 *
137 * Another reason is that we can't wait for rcu callback if
138 * we are called at early boot time but this shouldn't happen.
139 */
140 }
141 rsp->gp_count++;
142 spin_unlock_irq(&rsp->rss_lock);
143
144 if (gp_state == GP_IDLE) {
145 /*
146 * See the comment above, this simply does the "synchronous"
147 * call_rcu(rcu_sync_func) which does GP_ENTER -> GP_PASSED.
148 */
149 synchronize_rcu();
150 rcu_sync_func(&rsp->cb_head);
151 /* Not really needed, wait_event() would see GP_PASSED. */
152 return;
153 }
154
155 wait_event(rsp->gp_wait, READ_ONCE(rsp->gp_state) >= GP_PASSED);
156}
157
158/**
159 * rcu_sync_exit() - Allow readers back onto fast path after grace period
184 * @rsp: Pointer to rcu_sync structure to use for synchronization 160 * @rsp: Pointer to rcu_sync structure to use for synchronization
185 * 161 *
186 * This function is used by updaters who have completed, and can therefore 162 * This function is used by updaters who have completed, and can therefore
@@ -191,13 +167,16 @@ static void rcu_sync_func(struct rcu_head *rhp)
191 */ 167 */
192void rcu_sync_exit(struct rcu_sync *rsp) 168void rcu_sync_exit(struct rcu_sync *rsp)
193{ 169{
170 WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_IDLE);
171 WARN_ON_ONCE(READ_ONCE(rsp->gp_count) == 0);
172
194 spin_lock_irq(&rsp->rss_lock); 173 spin_lock_irq(&rsp->rss_lock);
195 if (!--rsp->gp_count) { 174 if (!--rsp->gp_count) {
196 if (rsp->cb_state == CB_IDLE) { 175 if (rsp->gp_state == GP_PASSED) {
197 rsp->cb_state = CB_PENDING; 176 WRITE_ONCE(rsp->gp_state, GP_EXIT);
198 gp_ops[rsp->gp_type].call(&rsp->cb_head, rcu_sync_func); 177 rcu_sync_call(rsp);
199 } else if (rsp->cb_state == CB_PENDING) { 178 } else if (rsp->gp_state == GP_EXIT) {
200 rsp->cb_state = CB_REPLAY; 179 WRITE_ONCE(rsp->gp_state, GP_REPLAY);
201 } 180 }
202 } 181 }
203 spin_unlock_irq(&rsp->rss_lock); 182 spin_unlock_irq(&rsp->rss_lock);
@@ -209,18 +188,19 @@ void rcu_sync_exit(struct rcu_sync *rsp)
209 */ 188 */
210void rcu_sync_dtor(struct rcu_sync *rsp) 189void rcu_sync_dtor(struct rcu_sync *rsp)
211{ 190{
212 int cb_state; 191 int gp_state;
213 192
214 WARN_ON_ONCE(rsp->gp_count); 193 WARN_ON_ONCE(READ_ONCE(rsp->gp_count));
194 WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_PASSED);
215 195
216 spin_lock_irq(&rsp->rss_lock); 196 spin_lock_irq(&rsp->rss_lock);
217 if (rsp->cb_state == CB_REPLAY) 197 if (rsp->gp_state == GP_REPLAY)
218 rsp->cb_state = CB_PENDING; 198 WRITE_ONCE(rsp->gp_state, GP_EXIT);
219 cb_state = rsp->cb_state; 199 gp_state = rsp->gp_state;
220 spin_unlock_irq(&rsp->rss_lock); 200 spin_unlock_irq(&rsp->rss_lock);
221 201
222 if (cb_state != CB_IDLE) { 202 if (gp_state != GP_IDLE) {
223 gp_ops[rsp->gp_type].wait(); 203 rcu_barrier();
224 WARN_ON_ONCE(rsp->cb_state != CB_IDLE); 204 WARN_ON_ONCE(rsp->gp_state != GP_IDLE);
225 } 205 }
226} 206}
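The new GP_IDLE/GP_ENTER/GP_PASSED/GP_EXIT/GP_REPLAY handling above replaces the old cb_state machine. As a rough aid, here is a minimal userspace sketch of those transitions: the grace period is collapsed into a direct call, locking and wakeups are dropped, and the callback side (rcu_sync_func(), not part of this hunk) is an assumption based on the transitions the enter/exit paths rely on.

/*
 * Userspace model only: no locking, no real grace periods, and
 * sync_func() is an assumption standing in for rcu_sync_func().
 */
#include <assert.h>
#include <stdio.h>

enum { GP_IDLE, GP_ENTER, GP_PASSED, GP_EXIT, GP_REPLAY };
static const char * const name[] = { "IDLE", "ENTER", "PASSED", "EXIT", "REPLAY" };

static int gp_state = GP_IDLE;
static int gp_count;

static void sync_func(void)		/* stands in for rcu_sync_func() */
{
	if (gp_count)
		gp_state = GP_PASSED;	/* writers still present */
	else if (gp_state == GP_REPLAY)
		gp_state = GP_EXIT;	/* another GP needed: requeue */
	else
		gp_state = GP_IDLE;	/* readers may use fastpaths again */
}

static void sync_enter(void)		/* mirrors rcu_sync_enter() */
{
	int was_idle = (gp_state == GP_IDLE);

	if (was_idle)
		gp_state = GP_ENTER;
	gp_count++;
	if (was_idle)
		sync_func();		/* the "synchronous" GP + callback */
	assert(gp_state >= GP_PASSED);
}

static void sync_exit(void)		/* mirrors rcu_sync_exit() */
{
	if (!--gp_count) {
		if (gp_state == GP_PASSED)
			gp_state = GP_EXIT;	/* callback now pending */
		else if (gp_state == GP_EXIT)
			gp_state = GP_REPLAY;	/* callback must run again */
	}
}

int main(void)
{
	sync_enter();				/* IDLE -> ENTER -> PASSED */
	sync_exit();				/* PASSED -> EXIT */
	printf("after exit: %s\n", name[gp_state]);
	sync_func();				/* pending callback: EXIT -> IDLE */
	printf("after cb:   %s\n", name[gp_state]);
	return 0;
}

Running the model prints EXIT after the last writer leaves and IDLE once the simulated callback completes, mirroring the kerneldoc comments on rcu_sync_enter()/rcu_sync_exit() above.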
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index b4d88a594785..a14e5fbbea46 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -51,6 +51,12 @@
51#include <linux/tick.h> 51#include <linux/tick.h>
52#include <linux/sysrq.h> 52#include <linux/sysrq.h>
53#include <linux/kprobes.h> 53#include <linux/kprobes.h>
54#include <linux/gfp.h>
55#include <linux/oom.h>
56#include <linux/smpboot.h>
57#include <linux/jiffies.h>
58#include <linux/sched/isolation.h>
59#include "../time/tick-internal.h"
54 60
55#include "tree.h" 61#include "tree.h"
56#include "rcu.h" 62#include "rcu.h"
@@ -92,6 +98,9 @@ struct rcu_state rcu_state = {
92/* Dump rcu_node combining tree at boot to verify correct setup. */ 98/* Dump rcu_node combining tree at boot to verify correct setup. */
93static bool dump_tree; 99static bool dump_tree;
94module_param(dump_tree, bool, 0444); 100module_param(dump_tree, bool, 0444);
101/* By default, use RCU_SOFTIRQ instead of rcuc kthreads. */
102static bool use_softirq = 1;
103module_param(use_softirq, bool, 0444);
95/* Control rcu_node-tree auto-balancing at boot time. */ 104/* Control rcu_node-tree auto-balancing at boot time. */
96static bool rcu_fanout_exact; 105static bool rcu_fanout_exact;
97module_param(rcu_fanout_exact, bool, 0444); 106module_param(rcu_fanout_exact, bool, 0444);
@@ -138,7 +147,6 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf);
138static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf); 147static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf);
139static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); 148static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
140static void invoke_rcu_core(void); 149static void invoke_rcu_core(void);
141static void invoke_rcu_callbacks(struct rcu_data *rdp);
142static void rcu_report_exp_rdp(struct rcu_data *rdp); 150static void rcu_report_exp_rdp(struct rcu_data *rdp);
143static void sync_sched_exp_online_cleanup(int cpu); 151static void sync_sched_exp_online_cleanup(int cpu);
144 152
@@ -368,19 +376,33 @@ static void __maybe_unused rcu_momentary_dyntick_idle(void)
368} 376}
369 377
370/** 378/**
371 * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle 379 * rcu_is_cpu_rrupt_from_idle - see if interrupted from idle
372 * 380 *
373 * If the current CPU is idle or running at a first-level (not nested) 381 * If the current CPU is idle and running at a first-level (not nested)
374 * interrupt from idle, return true. The caller must have at least 382 * interrupt from idle, return true. The caller must have at least
375 * disabled preemption. 383 * disabled preemption.
376 */ 384 */
377static int rcu_is_cpu_rrupt_from_idle(void) 385static int rcu_is_cpu_rrupt_from_idle(void)
378{ 386{
379 return __this_cpu_read(rcu_data.dynticks_nesting) <= 0 && 387 /* Called only from within the scheduling-clock interrupt */
380 __this_cpu_read(rcu_data.dynticks_nmi_nesting) <= 1; 388 lockdep_assert_in_irq();
389
390 /* Check for counter underflows */
391 RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) < 0,
392 "RCU dynticks_nesting counter underflow!");
393 RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) <= 0,
394 "RCU dynticks_nmi_nesting counter underflow/zero!");
395
396 /* Are we at first interrupt nesting level? */
397 if (__this_cpu_read(rcu_data.dynticks_nmi_nesting) != 1)
398 return false;
399
400 /* Does CPU appear to be idle from an RCU standpoint? */
401 return __this_cpu_read(rcu_data.dynticks_nesting) == 0;
381} 402}
382 403
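A quick sanity check of the rewritten predicate: the sketch below evaluates the same two conditions (first-level interrupt, idle nesting value) over a few made-up counter values. It is not kernel code, just the boolean logic extracted for illustration.

#include <stdbool.h>
#include <stdio.h>

/* Mirrors the two checks above: first-level interrupt taken from idle. */
static bool rrupt_from_idle(long nesting, long nmi_nesting)
{
	return nmi_nesting == 1 && nesting == 0;
}

int main(void)
{
	printf("%d\n", rrupt_from_idle(0, 1));	/* 1: idle CPU, first-level irq */
	printf("%d\n", rrupt_from_idle(1, 1));	/* 0: task context interrupted */
	printf("%d\n", rrupt_from_idle(0, 2));	/* 0: nested interrupt from idle */
	return 0;
}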
383#define DEFAULT_RCU_BLIMIT 10 /* Maximum callbacks per rcu_do_batch. */ 404#define DEFAULT_RCU_BLIMIT 10 /* Maximum callbacks per rcu_do_batch ... */
405#define DEFAULT_MAX_RCU_BLIMIT 10000 /* ... even during callback flood. */
384static long blimit = DEFAULT_RCU_BLIMIT; 406static long blimit = DEFAULT_RCU_BLIMIT;
385#define DEFAULT_RCU_QHIMARK 10000 /* If this many pending, ignore blimit. */ 407#define DEFAULT_RCU_QHIMARK 10000 /* If this many pending, ignore blimit. */
386static long qhimark = DEFAULT_RCU_QHIMARK; 408static long qhimark = DEFAULT_RCU_QHIMARK;
@@ -1969,14 +1991,14 @@ rcu_check_quiescent_state(struct rcu_data *rdp)
1969 */ 1991 */
1970int rcutree_dying_cpu(unsigned int cpu) 1992int rcutree_dying_cpu(unsigned int cpu)
1971{ 1993{
1972 RCU_TRACE(bool blkd;) 1994 bool blkd;
1973 RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(&rcu_data);) 1995 struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
1974 RCU_TRACE(struct rcu_node *rnp = rdp->mynode;) 1996 struct rcu_node *rnp = rdp->mynode;
1975 1997
1976 if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) 1998 if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
1977 return 0; 1999 return 0;
1978 2000
1979 RCU_TRACE(blkd = !!(rnp->qsmask & rdp->grpmask);) 2001 blkd = !!(rnp->qsmask & rdp->grpmask);
1980 trace_rcu_grace_period(rcu_state.name, rnp->gp_seq, 2002 trace_rcu_grace_period(rcu_state.name, rnp->gp_seq,
1981 blkd ? TPS("cpuofl") : TPS("cpuofl-bgp")); 2003 blkd ? TPS("cpuofl") : TPS("cpuofl-bgp"));
1982 return 0; 2004 return 0;
@@ -2113,7 +2135,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
2113 2135
2114 /* Reinstate batch limit if we have worked down the excess. */ 2136 /* Reinstate batch limit if we have worked down the excess. */
2115 count = rcu_segcblist_n_cbs(&rdp->cblist); 2137 count = rcu_segcblist_n_cbs(&rdp->cblist);
2116 if (rdp->blimit == LONG_MAX && count <= qlowmark) 2138 if (rdp->blimit >= DEFAULT_MAX_RCU_BLIMIT && count <= qlowmark)
2117 rdp->blimit = blimit; 2139 rdp->blimit = blimit;
2118 2140
2119 /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */ 2141 /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */
@@ -2253,7 +2275,7 @@ void rcu_force_quiescent_state(void)
2253EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); 2275EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
2254 2276
2255/* Perform RCU core processing work for the current CPU. */ 2277/* Perform RCU core processing work for the current CPU. */
2256static __latent_entropy void rcu_core(struct softirq_action *unused) 2278static __latent_entropy void rcu_core(void)
2257{ 2279{
2258 unsigned long flags; 2280 unsigned long flags;
2259 struct rcu_data *rdp = raw_cpu_ptr(&rcu_data); 2281 struct rcu_data *rdp = raw_cpu_ptr(&rcu_data);
@@ -2287,37 +2309,126 @@ static __latent_entropy void rcu_core(struct softirq_action *unused)
2287 rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check()); 2309 rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check());
2288 2310
2289 /* If there are callbacks ready, invoke them. */ 2311 /* If there are callbacks ready, invoke them. */
2290 if (rcu_segcblist_ready_cbs(&rdp->cblist)) 2312 if (rcu_segcblist_ready_cbs(&rdp->cblist) &&
2291 invoke_rcu_callbacks(rdp); 2313 likely(READ_ONCE(rcu_scheduler_fully_active)))
2314 rcu_do_batch(rdp);
2292 2315
2293 /* Do any needed deferred wakeups of rcuo kthreads. */ 2316 /* Do any needed deferred wakeups of rcuo kthreads. */
2294 do_nocb_deferred_wakeup(rdp); 2317 do_nocb_deferred_wakeup(rdp);
2295 trace_rcu_utilization(TPS("End RCU core")); 2318 trace_rcu_utilization(TPS("End RCU core"));
2296} 2319}
2297 2320
2321static void rcu_core_si(struct softirq_action *h)
2322{
2323 rcu_core();
2324}
2325
2326static void rcu_wake_cond(struct task_struct *t, int status)
2327{
2328 /*
2329 * If the thread is yielding, only wake it when this
2330 * is invoked from idle
2331 */
2332 if (t && (status != RCU_KTHREAD_YIELDING || is_idle_task(current)))
2333 wake_up_process(t);
2334}
2335
2336static void invoke_rcu_core_kthread(void)
2337{
2338 struct task_struct *t;
2339 unsigned long flags;
2340
2341 local_irq_save(flags);
2342 __this_cpu_write(rcu_data.rcu_cpu_has_work, 1);
2343 t = __this_cpu_read(rcu_data.rcu_cpu_kthread_task);
2344 if (t != NULL && t != current)
2345 rcu_wake_cond(t, __this_cpu_read(rcu_data.rcu_cpu_kthread_status));
2346 local_irq_restore(flags);
2347}
2348
2298/* 2349/*
2299 * Schedule RCU callback invocation. If the running implementation of RCU 2350 * Wake up this CPU's rcuc kthread to do RCU core processing.
2300 * does not support RCU priority boosting, just do a direct call, otherwise
2301 * wake up the per-CPU kernel kthread. Note that because we are running
2302 * on the current CPU with softirqs disabled, the rcu_cpu_kthread_task
2303 * cannot disappear out from under us.
2304 */ 2351 */
2305static void invoke_rcu_callbacks(struct rcu_data *rdp) 2352static void invoke_rcu_core(void)
2306{ 2353{
2307 if (unlikely(!READ_ONCE(rcu_scheduler_fully_active))) 2354 if (!cpu_online(smp_processor_id()))
2308 return;
2309 if (likely(!rcu_state.boost)) {
2310 rcu_do_batch(rdp);
2311 return; 2355 return;
2356 if (use_softirq)
2357 raise_softirq(RCU_SOFTIRQ);
2358 else
2359 invoke_rcu_core_kthread();
2360}
2361
2362static void rcu_cpu_kthread_park(unsigned int cpu)
2363{
2364 per_cpu(rcu_data.rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
2365}
2366
2367static int rcu_cpu_kthread_should_run(unsigned int cpu)
2368{
2369 return __this_cpu_read(rcu_data.rcu_cpu_has_work);
2370}
2371
2372/*
2373 * Per-CPU kernel thread that invokes RCU callbacks. This replaces
2374 * the RCU softirq used in configurations of RCU that do not support RCU
2375 * priority boosting.
2376 */
2377static void rcu_cpu_kthread(unsigned int cpu)
2378{
2379 unsigned int *statusp = this_cpu_ptr(&rcu_data.rcu_cpu_kthread_status);
2380 char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work);
2381 int spincnt;
2382
2383 for (spincnt = 0; spincnt < 10; spincnt++) {
2384 trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
2385 local_bh_disable();
2386 *statusp = RCU_KTHREAD_RUNNING;
2387 local_irq_disable();
2388 work = *workp;
2389 *workp = 0;
2390 local_irq_enable();
2391 if (work)
2392 rcu_core();
2393 local_bh_enable();
2394 if (*workp == 0) {
2395 trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
2396 *statusp = RCU_KTHREAD_WAITING;
2397 return;
2398 }
2312 } 2399 }
2313 invoke_rcu_callbacks_kthread(); 2400 *statusp = RCU_KTHREAD_YIELDING;
2401 trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
2402 schedule_timeout_interruptible(2);
2403 trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
2404 *statusp = RCU_KTHREAD_WAITING;
2314} 2405}
2315 2406
2316static void invoke_rcu_core(void) 2407static struct smp_hotplug_thread rcu_cpu_thread_spec = {
2408 .store = &rcu_data.rcu_cpu_kthread_task,
2409 .thread_should_run = rcu_cpu_kthread_should_run,
2410 .thread_fn = rcu_cpu_kthread,
2411 .thread_comm = "rcuc/%u",
2412 .setup = rcu_cpu_kthread_setup,
2413 .park = rcu_cpu_kthread_park,
2414};
2415
2416/*
2417 * Spawn per-CPU RCU core processing kthreads.
2418 */
2419static int __init rcu_spawn_core_kthreads(void)
2317{ 2420{
2318 if (cpu_online(smp_processor_id())) 2421 int cpu;
2319 raise_softirq(RCU_SOFTIRQ); 2422
2423 for_each_possible_cpu(cpu)
2424 per_cpu(rcu_data.rcu_cpu_has_work, cpu) = 0;
2425 if (!IS_ENABLED(CONFIG_RCU_BOOST) && use_softirq)
2426 return 0;
2427 WARN_ONCE(smpboot_register_percpu_thread(&rcu_cpu_thread_spec),
2428 "%s: Could not start rcuc kthread, OOM is now expected behavior\n", __func__);
2429 return 0;
2320} 2430}
2431early_initcall(rcu_spawn_core_kthreads);
2321 2432
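With the new use_softirq switch, RCU core processing is driven either by RCU_SOFTIRQ (the default) or by the per-CPU rcuc kthreads spawned above. A hedged userspace sketch of that decision, with the softirq raise and kthread wakeup reduced to prints and the rcu_wake_cond() yielding check folded in:

#include <stdbool.h>
#include <stdio.h>

enum { KTHREAD_WAITING, KTHREAD_YIELDING };

static bool use_softirq = true;		/* models the rcutree.use_softirq boot parameter */

/* Models invoke_rcu_core(): softirq by default, rcuc kthread otherwise. */
static void invoke_core(bool cpu_online, int kthread_status, bool from_idle)
{
	if (!cpu_online)
		return;
	if (use_softirq) {
		printf("raise RCU_SOFTIRQ\n");
		return;
	}
	/* rcu_wake_cond(): don't disturb a yielding kthread unless called from idle */
	if (kthread_status != KTHREAD_YIELDING || from_idle)
		printf("wake rcuc kthread\n");
}

int main(void)
{
	invoke_core(true, KTHREAD_WAITING, false);	/* raise RCU_SOFTIRQ */
	use_softirq = false;
	invoke_core(true, KTHREAD_WAITING, false);	/* wake rcuc kthread */
	invoke_core(true, KTHREAD_YIELDING, false);	/* nothing: kthread is yielding */
	return 0;
}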
2322/* 2433/*
2323 * Handle any core-RCU processing required by a call_rcu() invocation. 2434 * Handle any core-RCU processing required by a call_rcu() invocation.
@@ -2354,7 +2465,7 @@ static void __call_rcu_core(struct rcu_data *rdp, struct rcu_head *head,
2354 rcu_accelerate_cbs_unlocked(rdp->mynode, rdp); 2465 rcu_accelerate_cbs_unlocked(rdp->mynode, rdp);
2355 } else { 2466 } else {
2356 /* Give the grace period a kick. */ 2467 /* Give the grace period a kick. */
2357 rdp->blimit = LONG_MAX; 2468 rdp->blimit = DEFAULT_MAX_RCU_BLIMIT;
2358 if (rcu_state.n_force_qs == rdp->n_force_qs_snap && 2469 if (rcu_state.n_force_qs == rdp->n_force_qs_snap &&
2359 rcu_segcblist_first_pend_cb(&rdp->cblist) != head) 2470 rcu_segcblist_first_pend_cb(&rdp->cblist) != head)
2360 rcu_force_quiescent_state(); 2471 rcu_force_quiescent_state();
@@ -3355,7 +3466,8 @@ void __init rcu_init(void)
3355 rcu_init_one(); 3466 rcu_init_one();
3356 if (dump_tree) 3467 if (dump_tree)
3357 rcu_dump_rcu_node_tree(); 3468 rcu_dump_rcu_node_tree();
3358 open_softirq(RCU_SOFTIRQ, rcu_core); 3469 if (use_softirq)
3470 open_softirq(RCU_SOFTIRQ, rcu_core_si);
3359 3471
3360 /* 3472 /*
3361 * We don't need protection against CPU-hotplug here because 3473 * We don't need protection against CPU-hotplug here because
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index e253d11af3c4..7acaf3a62d39 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -154,13 +154,15 @@ struct rcu_data {
154 bool core_needs_qs; /* Core waits for quiesc state. */ 154 bool core_needs_qs; /* Core waits for quiesc state. */
155 bool beenonline; /* CPU online at least once. */ 155 bool beenonline; /* CPU online at least once. */
156 bool gpwrap; /* Possible ->gp_seq wrap. */ 156 bool gpwrap; /* Possible ->gp_seq wrap. */
157 bool deferred_qs; /* This CPU awaiting a deferred QS? */ 157 bool exp_deferred_qs; /* This CPU awaiting a deferred QS? */
158 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ 158 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
159 unsigned long grpmask; /* Mask to apply to leaf qsmask. */ 159 unsigned long grpmask; /* Mask to apply to leaf qsmask. */
160 unsigned long ticks_this_gp; /* The number of scheduling-clock */ 160 unsigned long ticks_this_gp; /* The number of scheduling-clock */
161 /* ticks this CPU has handled */ 161 /* ticks this CPU has handled */
162 /* during and after the last grace */ 162 /* during and after the last grace */
163 /* period it is aware of. */ 163 /* period it is aware of. */
164 struct irq_work defer_qs_iw; /* Obtain later scheduler attention. */
165 bool defer_qs_iw_pending; /* Scheduler attention pending? */
164 166
165 /* 2) batch handling */ 167 /* 2) batch handling */
166 struct rcu_segcblist cblist; /* Segmented callback list, with */ 168 struct rcu_segcblist cblist; /* Segmented callback list, with */
@@ -407,8 +409,8 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func);
407static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck); 409static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck);
408static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); 410static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
409static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); 411static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
410static void invoke_rcu_callbacks_kthread(void);
411static bool rcu_is_callbacks_kthread(void); 412static bool rcu_is_callbacks_kthread(void);
413static void rcu_cpu_kthread_setup(unsigned int cpu);
412static void __init rcu_spawn_boost_kthreads(void); 414static void __init rcu_spawn_boost_kthreads(void);
413static void rcu_prepare_kthreads(int cpu); 415static void rcu_prepare_kthreads(int cpu);
414static void rcu_cleanup_after_idle(void); 416static void rcu_cleanup_after_idle(void);
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 9c990df880d1..af7e7b9c86af 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -250,7 +250,7 @@ static void rcu_report_exp_cpu_mult(struct rcu_node *rnp,
250 */ 250 */
251static void rcu_report_exp_rdp(struct rcu_data *rdp) 251static void rcu_report_exp_rdp(struct rcu_data *rdp)
252{ 252{
253 WRITE_ONCE(rdp->deferred_qs, false); 253 WRITE_ONCE(rdp->exp_deferred_qs, false);
254 rcu_report_exp_cpu_mult(rdp->mynode, rdp->grpmask, true); 254 rcu_report_exp_cpu_mult(rdp->mynode, rdp->grpmask, true);
255} 255}
256 256
@@ -259,8 +259,7 @@ static bool sync_exp_work_done(unsigned long s)
259{ 259{
260 if (rcu_exp_gp_seq_done(s)) { 260 if (rcu_exp_gp_seq_done(s)) {
261 trace_rcu_exp_grace_period(rcu_state.name, s, TPS("done")); 261 trace_rcu_exp_grace_period(rcu_state.name, s, TPS("done"));
262 /* Ensure test happens before caller kfree(). */ 262 smp_mb(); /* Ensure test happens before caller kfree(). */
263 smp_mb__before_atomic(); /* ^^^ */
264 return true; 263 return true;
265 } 264 }
266 return false; 265 return false;
@@ -384,7 +383,12 @@ retry_ipi:
384 mask_ofl_test |= mask; 383 mask_ofl_test |= mask;
385 continue; 384 continue;
386 } 385 }
386 if (get_cpu() == cpu) {
387 put_cpu();
388 continue;
389 }
387 ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0); 390 ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0);
391 put_cpu();
388 if (!ret) { 392 if (!ret) {
389 mask_ofl_ipi &= ~mask; 393 mask_ofl_ipi &= ~mask;
390 continue; 394 continue;
@@ -611,7 +615,7 @@ static void rcu_exp_handler(void *unused)
611 rcu_dynticks_curr_cpu_in_eqs()) { 615 rcu_dynticks_curr_cpu_in_eqs()) {
612 rcu_report_exp_rdp(rdp); 616 rcu_report_exp_rdp(rdp);
613 } else { 617 } else {
614 rdp->deferred_qs = true; 618 rdp->exp_deferred_qs = true;
615 set_tsk_need_resched(t); 619 set_tsk_need_resched(t);
616 set_preempt_need_resched(); 620 set_preempt_need_resched();
617 } 621 }
@@ -633,7 +637,7 @@ static void rcu_exp_handler(void *unused)
633 if (t->rcu_read_lock_nesting > 0) { 637 if (t->rcu_read_lock_nesting > 0) {
634 raw_spin_lock_irqsave_rcu_node(rnp, flags); 638 raw_spin_lock_irqsave_rcu_node(rnp, flags);
635 if (rnp->expmask & rdp->grpmask) { 639 if (rnp->expmask & rdp->grpmask) {
636 rdp->deferred_qs = true; 640 rdp->exp_deferred_qs = true;
637 t->rcu_read_unlock_special.b.exp_hint = true; 641 t->rcu_read_unlock_special.b.exp_hint = true;
638 } 642 }
639 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 643 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
@@ -656,7 +660,7 @@ static void rcu_exp_handler(void *unused)
656 * 660 *
657 * Otherwise, force a context switch after the CPU enables everything. 661 * Otherwise, force a context switch after the CPU enables everything.
658 */ 662 */
659 rdp->deferred_qs = true; 663 rdp->exp_deferred_qs = true;
660 if (!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)) || 664 if (!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)) ||
661 WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs())) { 665 WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs())) {
662 rcu_preempt_deferred_qs(t); 666 rcu_preempt_deferred_qs(t);
@@ -694,6 +698,16 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp)
694 698
695#else /* #ifdef CONFIG_PREEMPT_RCU */ 699#else /* #ifdef CONFIG_PREEMPT_RCU */
696 700
701/* Request an expedited quiescent state. */
702static void rcu_exp_need_qs(void)
703{
704 __this_cpu_write(rcu_data.cpu_no_qs.b.exp, true);
705 /* Store .exp before .rcu_urgent_qs. */
706 smp_store_release(this_cpu_ptr(&rcu_data.rcu_urgent_qs), true);
707 set_tsk_need_resched(current);
708 set_preempt_need_resched();
709}
710
697/* Invoked on each online non-idle CPU for expedited quiescent state. */ 711/* Invoked on each online non-idle CPU for expedited quiescent state. */
698static void rcu_exp_handler(void *unused) 712static void rcu_exp_handler(void *unused)
699{ 713{
@@ -709,25 +723,38 @@ static void rcu_exp_handler(void *unused)
709 rcu_report_exp_rdp(this_cpu_ptr(&rcu_data)); 723 rcu_report_exp_rdp(this_cpu_ptr(&rcu_data));
710 return; 724 return;
711 } 725 }
712 __this_cpu_write(rcu_data.cpu_no_qs.b.exp, true); 726 rcu_exp_need_qs();
713 /* Store .exp before .rcu_urgent_qs. */
714 smp_store_release(this_cpu_ptr(&rcu_data.rcu_urgent_qs), true);
715 set_tsk_need_resched(current);
716 set_preempt_need_resched();
717} 727}
718 728
719/* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */ 729/* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */
720static void sync_sched_exp_online_cleanup(int cpu) 730static void sync_sched_exp_online_cleanup(int cpu)
721{ 731{
732 unsigned long flags;
733 int my_cpu;
722 struct rcu_data *rdp; 734 struct rcu_data *rdp;
723 int ret; 735 int ret;
724 struct rcu_node *rnp; 736 struct rcu_node *rnp;
725 737
726 rdp = per_cpu_ptr(&rcu_data, cpu); 738 rdp = per_cpu_ptr(&rcu_data, cpu);
727 rnp = rdp->mynode; 739 rnp = rdp->mynode;
728 if (!(READ_ONCE(rnp->expmask) & rdp->grpmask)) 740 my_cpu = get_cpu();
741 /* Quiescent state either not needed or already requested, leave. */
742 if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) ||
743 __this_cpu_read(rcu_data.cpu_no_qs.b.exp)) {
744 put_cpu();
745 return;
746 }
747 /* Quiescent state needed on current CPU, so set it up locally. */
748 if (my_cpu == cpu) {
749 local_irq_save(flags);
750 rcu_exp_need_qs();
751 local_irq_restore(flags);
752 put_cpu();
729 return; 753 return;
754 }
755 /* Quiescent state needed on some other CPU, send IPI. */
730 ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0); 756 ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0);
757 put_cpu();
731 WARN_ON_ONCE(ret); 758 WARN_ON_ONCE(ret);
732} 759}
733 760
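The reworked sync_sched_exp_online_cleanup() now has three outcomes rather than two. A small model of that choice; the predicates are passed in as plain booleans and nothing here is a kernel API:

#include <stdbool.h>
#include <stdio.h>

/* Models the three outcomes of sync_sched_exp_online_cleanup() above. */
static void exp_online_cleanup(bool qs_needed, bool qs_already_requested,
			       int my_cpu, int target_cpu)
{
	if (!qs_needed || qs_already_requested) {
		printf("cpu %d: nothing to do\n", target_cpu);
		return;
	}
	if (my_cpu == target_cpu) {
		printf("cpu %d: request expedited QS locally\n", target_cpu);
		return;
	}
	printf("cpu %d: send IPI to run rcu_exp_handler()\n", target_cpu);
}

int main(void)
{
	exp_online_cleanup(false, false, 0, 1);	/* nothing to do */
	exp_online_cleanup(true, false, 1, 1);	/* current CPU: local request */
	exp_online_cleanup(true, false, 0, 1);	/* other CPU: IPI */
	return 0;
}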
@@ -765,7 +792,6 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp)
765 */ 792 */
766void synchronize_rcu_expedited(void) 793void synchronize_rcu_expedited(void)
767{ 794{
768 struct rcu_data *rdp;
769 struct rcu_exp_work rew; 795 struct rcu_exp_work rew;
770 struct rcu_node *rnp; 796 struct rcu_node *rnp;
771 unsigned long s; 797 unsigned long s;
@@ -802,7 +828,6 @@ void synchronize_rcu_expedited(void)
802 } 828 }
803 829
804 /* Wait for expedited grace period to complete. */ 830 /* Wait for expedited grace period to complete. */
805 rdp = per_cpu_ptr(&rcu_data, raw_smp_processor_id());
806 rnp = rcu_get_root(); 831 rnp = rcu_get_root();
807 wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3], 832 wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3],
808 sync_exp_work_done(s)); 833 sync_exp_work_done(s));
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 1102765f91fd..acb225023ed1 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -11,29 +11,7 @@
11 * Paul E. McKenney <paulmck@linux.ibm.com> 11 * Paul E. McKenney <paulmck@linux.ibm.com>
12 */ 12 */
13 13
14#include <linux/delay.h>
15#include <linux/gfp.h>
16#include <linux/oom.h>
17#include <linux/sched/debug.h>
18#include <linux/smpboot.h>
19#include <linux/sched/isolation.h>
20#include <uapi/linux/sched/types.h>
21#include "../time/tick-internal.h"
22
23#ifdef CONFIG_RCU_BOOST
24#include "../locking/rtmutex_common.h" 14#include "../locking/rtmutex_common.h"
25#else /* #ifdef CONFIG_RCU_BOOST */
26
27/*
28 * Some architectures do not define rt_mutexes, but if !CONFIG_RCU_BOOST,
29 * all uses are in dead code. Provide a definition to keep the compiler
30 * happy, but add WARN_ON_ONCE() to complain if used in the wrong place.
31 * This probably needs to be excluded from -rt builds.
32 */
33#define rt_mutex_owner(a) ({ WARN_ON_ONCE(1); NULL; })
34#define rt_mutex_futex_unlock(x) WARN_ON_ONCE(1)
35
36#endif /* #else #ifdef CONFIG_RCU_BOOST */
37 15
38#ifdef CONFIG_RCU_NOCB_CPU 16#ifdef CONFIG_RCU_NOCB_CPU
39static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */ 17static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
@@ -94,6 +72,8 @@ static void __init rcu_bootup_announce_oddness(void)
94 pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_init_delay); 72 pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_init_delay);
95 if (gp_cleanup_delay) 73 if (gp_cleanup_delay)
 96 pr_info("\tRCU debug GP cleanup slowdown %d jiffies.\n", gp_cleanup_delay); 74 pr_info("\tRCU debug GP cleanup slowdown %d jiffies.\n", gp_cleanup_delay);
75 if (!use_softirq)
76 pr_info("\tRCU_SOFTIRQ processing moved to rcuc kthreads.\n");
97 if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG)) 77 if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG))
98 pr_info("\tRCU debug extended QS entry/exit.\n"); 78 pr_info("\tRCU debug extended QS entry/exit.\n");
99 rcupdate_announce_bootup_oddness(); 79 rcupdate_announce_bootup_oddness();
@@ -257,10 +237,10 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
257 * no need to check for a subsequent expedited GP. (Though we are 237 * no need to check for a subsequent expedited GP. (Though we are
258 * still in a quiescent state in any case.) 238 * still in a quiescent state in any case.)
259 */ 239 */
260 if (blkd_state & RCU_EXP_BLKD && rdp->deferred_qs) 240 if (blkd_state & RCU_EXP_BLKD && rdp->exp_deferred_qs)
261 rcu_report_exp_rdp(rdp); 241 rcu_report_exp_rdp(rdp);
262 else 242 else
263 WARN_ON_ONCE(rdp->deferred_qs); 243 WARN_ON_ONCE(rdp->exp_deferred_qs);
264} 244}
265 245
266/* 246/*
@@ -357,7 +337,7 @@ void rcu_note_context_switch(bool preempt)
357 * means that we continue to block the current grace period. 337 * means that we continue to block the current grace period.
358 */ 338 */
359 rcu_qs(); 339 rcu_qs();
360 if (rdp->deferred_qs) 340 if (rdp->exp_deferred_qs)
361 rcu_report_exp_rdp(rdp); 341 rcu_report_exp_rdp(rdp);
362 trace_rcu_utilization(TPS("End context switch")); 342 trace_rcu_utilization(TPS("End context switch"));
363 barrier(); /* Avoid RCU read-side critical sections leaking up. */ 343 barrier(); /* Avoid RCU read-side critical sections leaking up. */
@@ -471,14 +451,15 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
471 */ 451 */
472 special = t->rcu_read_unlock_special; 452 special = t->rcu_read_unlock_special;
473 rdp = this_cpu_ptr(&rcu_data); 453 rdp = this_cpu_ptr(&rcu_data);
474 if (!special.s && !rdp->deferred_qs) { 454 if (!special.s && !rdp->exp_deferred_qs) {
475 local_irq_restore(flags); 455 local_irq_restore(flags);
476 return; 456 return;
477 } 457 }
458 t->rcu_read_unlock_special.b.deferred_qs = false;
478 if (special.b.need_qs) { 459 if (special.b.need_qs) {
479 rcu_qs(); 460 rcu_qs();
480 t->rcu_read_unlock_special.b.need_qs = false; 461 t->rcu_read_unlock_special.b.need_qs = false;
481 if (!t->rcu_read_unlock_special.s && !rdp->deferred_qs) { 462 if (!t->rcu_read_unlock_special.s && !rdp->exp_deferred_qs) {
482 local_irq_restore(flags); 463 local_irq_restore(flags);
483 return; 464 return;
484 } 465 }
@@ -490,7 +471,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
490 * tasks are handled when removing the task from the 471 * tasks are handled when removing the task from the
491 * blocked-tasks list below. 472 * blocked-tasks list below.
492 */ 473 */
493 if (rdp->deferred_qs) { 474 if (rdp->exp_deferred_qs) {
494 rcu_report_exp_rdp(rdp); 475 rcu_report_exp_rdp(rdp);
495 if (!t->rcu_read_unlock_special.s) { 476 if (!t->rcu_read_unlock_special.s) {
496 local_irq_restore(flags); 477 local_irq_restore(flags);
@@ -579,7 +560,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
579 */ 560 */
580static bool rcu_preempt_need_deferred_qs(struct task_struct *t) 561static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
581{ 562{
582 return (__this_cpu_read(rcu_data.deferred_qs) || 563 return (__this_cpu_read(rcu_data.exp_deferred_qs) ||
583 READ_ONCE(t->rcu_read_unlock_special.s)) && 564 READ_ONCE(t->rcu_read_unlock_special.s)) &&
584 t->rcu_read_lock_nesting <= 0; 565 t->rcu_read_lock_nesting <= 0;
585} 566}
@@ -607,6 +588,17 @@ static void rcu_preempt_deferred_qs(struct task_struct *t)
607} 588}
608 589
609/* 590/*
591 * Minimal handler to give the scheduler a chance to re-evaluate.
592 */
593static void rcu_preempt_deferred_qs_handler(struct irq_work *iwp)
594{
595 struct rcu_data *rdp;
596
597 rdp = container_of(iwp, struct rcu_data, defer_qs_iw);
598 rdp->defer_qs_iw_pending = false;
599}
600
601/*
610 * Handle special cases during rcu_read_unlock(), such as needing to 602 * Handle special cases during rcu_read_unlock(), such as needing to
611 * notify RCU core processing or task having blocked during the RCU 603 * notify RCU core processing or task having blocked during the RCU
612 * read-side critical section. 604 * read-side critical section.
@@ -625,16 +617,41 @@ static void rcu_read_unlock_special(struct task_struct *t)
625 local_irq_save(flags); 617 local_irq_save(flags);
626 irqs_were_disabled = irqs_disabled_flags(flags); 618 irqs_were_disabled = irqs_disabled_flags(flags);
627 if (preempt_bh_were_disabled || irqs_were_disabled) { 619 if (preempt_bh_were_disabled || irqs_were_disabled) {
628 WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, false); 620 bool exp;
629 /* Need to defer quiescent state until everything is enabled. */ 621 struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
630 if (irqs_were_disabled) { 622 struct rcu_node *rnp = rdp->mynode;
631 /* Enabling irqs does not reschedule, so... */ 623
624 t->rcu_read_unlock_special.b.exp_hint = false;
625 exp = (t->rcu_blocked_node && t->rcu_blocked_node->exp_tasks) ||
626 (rdp->grpmask & rnp->expmask) ||
627 tick_nohz_full_cpu(rdp->cpu);
628 // Need to defer quiescent state until everything is enabled.
629 if ((exp || in_irq()) && irqs_were_disabled && use_softirq &&
630 (in_irq() || !t->rcu_read_unlock_special.b.deferred_qs)) {
631 // Using softirq, safe to awaken, and we get
632 // no help from enabling irqs, unlike bh/preempt.
632 raise_softirq_irqoff(RCU_SOFTIRQ); 633 raise_softirq_irqoff(RCU_SOFTIRQ);
634 } else if (exp && irqs_were_disabled && !use_softirq &&
635 !t->rcu_read_unlock_special.b.deferred_qs) {
636 // Safe to awaken and we get no help from enabling
637 // irqs, unlike bh/preempt.
638 invoke_rcu_core();
633 } else { 639 } else {
634 /* Enabling BH or preempt does reschedule, so... */ 640 // Enabling BH or preempt does reschedule, so...
641 // Also if no expediting or NO_HZ_FULL, slow is OK.
635 set_tsk_need_resched(current); 642 set_tsk_need_resched(current);
636 set_preempt_need_resched(); 643 set_preempt_need_resched();
644 if (IS_ENABLED(CONFIG_IRQ_WORK) &&
645 !rdp->defer_qs_iw_pending && exp) {
646 // Get scheduler to re-evaluate and call hooks.
647 // If !IRQ_WORK, FQS scan will eventually IPI.
648 init_irq_work(&rdp->defer_qs_iw,
649 rcu_preempt_deferred_qs_handler);
650 rdp->defer_qs_iw_pending = true;
651 irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu);
652 }
637 } 653 }
654 t->rcu_read_unlock_special.b.deferred_qs = true;
638 local_irq_restore(flags); 655 local_irq_restore(flags);
639 return; 656 return;
640 } 657 }
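The deferral choice above is the subtle part of this hunk. The sketch below reproduces just the boolean logic deciding between raising RCU_SOFTIRQ, waking the rcuc kthread via invoke_rcu_core(), and falling back to need_resched plus (optionally) irq_work; inputs are supplied as plain flags purely for illustration.

#include <stdbool.h>
#include <stdio.h>

/*
 * Models the choice made in rcu_read_unlock_special() when the unlock
 * happens with preemption/BH/irqs disabled.  "exp" stands for "an
 * expedited GP (or nohz_full) is waiting on this CPU/task".
 */
static const char *defer_choice(bool exp, bool in_irq, bool irqs_disabled,
				bool use_softirq, bool deferred_qs)
{
	if ((exp || in_irq) && irqs_disabled && use_softirq &&
	    (in_irq || !deferred_qs))
		return "raise RCU_SOFTIRQ";
	if (exp && irqs_disabled && !use_softirq && !deferred_qs)
		return "invoke_rcu_core() -> wake rcuc";
	return "set need_resched (+ irq_work if expedited)";
}

int main(void)
{
	printf("%s\n", defer_choice(true, true, true, true, false));
	printf("%s\n", defer_choice(true, false, true, false, false));
	printf("%s\n", defer_choice(false, false, false, true, false));
	return 0;
}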
@@ -760,7 +777,7 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
760 i = 0; 777 i = 0;
761 list_for_each(lhp, &rnp->blkd_tasks) { 778 list_for_each(lhp, &rnp->blkd_tasks) {
762 pr_cont(" %p", lhp); 779 pr_cont(" %p", lhp);
763 if (++i >= 10) 780 if (++i >= ncheck)
764 break; 781 break;
765 } 782 }
766 pr_cont("\n"); 783 pr_cont("\n");
@@ -944,18 +961,21 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
944 961
945#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ 962#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
946 963
964/*
965 * If boosting, set rcuc kthreads to realtime priority.
966 */
967static void rcu_cpu_kthread_setup(unsigned int cpu)
968{
947#ifdef CONFIG_RCU_BOOST 969#ifdef CONFIG_RCU_BOOST
970 struct sched_param sp;
948 971
949static void rcu_wake_cond(struct task_struct *t, int status) 972 sp.sched_priority = kthread_prio;
950{ 973 sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
951 /* 974#endif /* #ifdef CONFIG_RCU_BOOST */
952 * If the thread is yielding, only wake it when this
953 * is invoked from idle
954 */
955 if (status != RCU_KTHREAD_YIELDING || is_idle_task(current))
956 wake_up_process(t);
957} 975}
958 976
977#ifdef CONFIG_RCU_BOOST
978
959/* 979/*
960 * Carry out RCU priority boosting on the task indicated by ->exp_tasks 980 * Carry out RCU priority boosting on the task indicated by ->exp_tasks
961 * or ->boost_tasks, advancing the pointer to the next task in the 981 * or ->boost_tasks, advancing the pointer to the next task in the
@@ -1091,23 +1111,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1091} 1111}
1092 1112
1093/* 1113/*
1094 * Wake up the per-CPU kthread to invoke RCU callbacks.
1095 */
1096static void invoke_rcu_callbacks_kthread(void)
1097{
1098 unsigned long flags;
1099
1100 local_irq_save(flags);
1101 __this_cpu_write(rcu_data.rcu_cpu_has_work, 1);
1102 if (__this_cpu_read(rcu_data.rcu_cpu_kthread_task) != NULL &&
1103 current != __this_cpu_read(rcu_data.rcu_cpu_kthread_task)) {
1104 rcu_wake_cond(__this_cpu_read(rcu_data.rcu_cpu_kthread_task),
1105 __this_cpu_read(rcu_data.rcu_cpu_kthread_status));
1106 }
1107 local_irq_restore(flags);
1108}
1109
1110/*
1111 * Is the current CPU running the RCU-callbacks kthread? 1114 * Is the current CPU running the RCU-callbacks kthread?
1112 * Caller must have preemption disabled. 1115 * Caller must have preemption disabled.
1113 */ 1116 */
@@ -1160,59 +1163,6 @@ static int rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
1160 return 0; 1163 return 0;
1161} 1164}
1162 1165
1163static void rcu_cpu_kthread_setup(unsigned int cpu)
1164{
1165 struct sched_param sp;
1166
1167 sp.sched_priority = kthread_prio;
1168 sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
1169}
1170
1171static void rcu_cpu_kthread_park(unsigned int cpu)
1172{
1173 per_cpu(rcu_data.rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
1174}
1175
1176static int rcu_cpu_kthread_should_run(unsigned int cpu)
1177{
1178 return __this_cpu_read(rcu_data.rcu_cpu_has_work);
1179}
1180
1181/*
1182 * Per-CPU kernel thread that invokes RCU callbacks. This replaces
1183 * the RCU softirq used in configurations of RCU that do not support RCU
1184 * priority boosting.
1185 */
1186static void rcu_cpu_kthread(unsigned int cpu)
1187{
1188 unsigned int *statusp = this_cpu_ptr(&rcu_data.rcu_cpu_kthread_status);
1189 char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work);
1190 int spincnt;
1191
1192 for (spincnt = 0; spincnt < 10; spincnt++) {
1193 trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
1194 local_bh_disable();
1195 *statusp = RCU_KTHREAD_RUNNING;
1196 local_irq_disable();
1197 work = *workp;
1198 *workp = 0;
1199 local_irq_enable();
1200 if (work)
1201 rcu_do_batch(this_cpu_ptr(&rcu_data));
1202 local_bh_enable();
1203 if (*workp == 0) {
1204 trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
1205 *statusp = RCU_KTHREAD_WAITING;
1206 return;
1207 }
1208 }
1209 *statusp = RCU_KTHREAD_YIELDING;
1210 trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
1211 schedule_timeout_interruptible(2);
1212 trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
1213 *statusp = RCU_KTHREAD_WAITING;
1214}
1215
1216/* 1166/*
1217 * Set the per-rcu_node kthread's affinity to cover all CPUs that are 1167 * Set the per-rcu_node kthread's affinity to cover all CPUs that are
1218 * served by the rcu_node in question. The CPU hotplug lock is still 1168 * served by the rcu_node in question. The CPU hotplug lock is still
@@ -1243,27 +1193,13 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1243 free_cpumask_var(cm); 1193 free_cpumask_var(cm);
1244} 1194}
1245 1195
1246static struct smp_hotplug_thread rcu_cpu_thread_spec = {
1247 .store = &rcu_data.rcu_cpu_kthread_task,
1248 .thread_should_run = rcu_cpu_kthread_should_run,
1249 .thread_fn = rcu_cpu_kthread,
1250 .thread_comm = "rcuc/%u",
1251 .setup = rcu_cpu_kthread_setup,
1252 .park = rcu_cpu_kthread_park,
1253};
1254
1255/* 1196/*
1256 * Spawn boost kthreads -- called as soon as the scheduler is running. 1197 * Spawn boost kthreads -- called as soon as the scheduler is running.
1257 */ 1198 */
1258static void __init rcu_spawn_boost_kthreads(void) 1199static void __init rcu_spawn_boost_kthreads(void)
1259{ 1200{
1260 struct rcu_node *rnp; 1201 struct rcu_node *rnp;
1261 int cpu;
1262 1202
1263 for_each_possible_cpu(cpu)
1264 per_cpu(rcu_data.rcu_cpu_has_work, cpu) = 0;
1265 if (WARN_ONCE(smpboot_register_percpu_thread(&rcu_cpu_thread_spec), "%s: Could not start rcub kthread, OOM is now expected behavior\n", __func__))
1266 return;
1267 rcu_for_each_leaf_node(rnp) 1203 rcu_for_each_leaf_node(rnp)
1268 (void)rcu_spawn_one_boost_kthread(rnp); 1204 (void)rcu_spawn_one_boost_kthread(rnp);
1269} 1205}
@@ -1286,11 +1222,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1286 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 1222 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1287} 1223}
1288 1224
1289static void invoke_rcu_callbacks_kthread(void)
1290{
1291 WARN_ON_ONCE(1);
1292}
1293
1294static bool rcu_is_callbacks_kthread(void) 1225static bool rcu_is_callbacks_kthread(void)
1295{ 1226{
1296 return false; 1227 return false;
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index f65a73a97323..065183391f75 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -630,7 +630,9 @@ static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp,
630 time_before(j, rcu_state.gp_req_activity + gpssdelay) || 630 time_before(j, rcu_state.gp_req_activity + gpssdelay) ||
631 time_before(j, rcu_state.gp_activity + gpssdelay) || 631 time_before(j, rcu_state.gp_activity + gpssdelay) ||
632 atomic_xchg(&warned, 1)) { 632 atomic_xchg(&warned, 1)) {
633 raw_spin_unlock_rcu_node(rnp_root); /* irqs remain disabled. */ 633 if (rnp_root != rnp)
634 /* irqs remain disabled. */
635 raw_spin_unlock_rcu_node(rnp_root);
634 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 636 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
635 return; 637 return;
636 } 638 }
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index c3bf44ba42e5..61df2bf08563 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -423,6 +423,19 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read);
423 do { } while (0) 423 do { } while (0)
424#endif 424#endif
425 425
426#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST)
427/* Get rcutorture access to sched_setaffinity(). */
428long rcutorture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
429{
430 int ret;
431
432 ret = sched_setaffinity(pid, in_mask);
433 WARN_ONCE(ret, "%s: sched_setaffinity() returned %d\n", __func__, ret);
434 return ret;
435}
436EXPORT_SYMBOL_GPL(rcutorture_sched_setaffinity);
437#endif
438
426#ifdef CONFIG_RCU_STALL_COMMON 439#ifdef CONFIG_RCU_STALL_COMMON
427int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ 440int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
428EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress); 441EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress);
diff --git a/kernel/reboot.c b/kernel/reboot.c
index e1b79b6a2735..c4d472b7f1b4 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * linux/kernel/reboot.c 3 * linux/kernel/reboot.c
3 * 4 *
@@ -31,6 +32,7 @@ EXPORT_SYMBOL(cad_pid);
31#define DEFAULT_REBOOT_MODE 32#define DEFAULT_REBOOT_MODE
32#endif 33#endif
33enum reboot_mode reboot_mode DEFAULT_REBOOT_MODE; 34enum reboot_mode reboot_mode DEFAULT_REBOOT_MODE;
35enum reboot_mode panic_reboot_mode = REBOOT_UNDEFINED;
34 36
35/* 37/*
36 * This variable is used privately to keep track of whether or not 38 * This variable is used privately to keep track of whether or not
@@ -519,6 +521,8 @@ EXPORT_SYMBOL_GPL(orderly_reboot);
519static int __init reboot_setup(char *str) 521static int __init reboot_setup(char *str)
520{ 522{
521 for (;;) { 523 for (;;) {
524 enum reboot_mode *mode;
525
522 /* 526 /*
523 * Having anything passed on the command line via 527 * Having anything passed on the command line via
524 * reboot= will cause us to disable DMI checking 528 * reboot= will cause us to disable DMI checking
@@ -526,17 +530,24 @@ static int __init reboot_setup(char *str)
526 */ 530 */
527 reboot_default = 0; 531 reboot_default = 0;
528 532
533 if (!strncmp(str, "panic_", 6)) {
534 mode = &panic_reboot_mode;
535 str += 6;
536 } else {
537 mode = &reboot_mode;
538 }
539
529 switch (*str) { 540 switch (*str) {
530 case 'w': 541 case 'w':
531 reboot_mode = REBOOT_WARM; 542 *mode = REBOOT_WARM;
532 break; 543 break;
533 544
534 case 'c': 545 case 'c':
535 reboot_mode = REBOOT_COLD; 546 *mode = REBOOT_COLD;
536 break; 547 break;
537 548
538 case 'h': 549 case 'h':
539 reboot_mode = REBOOT_HARD; 550 *mode = REBOOT_HARD;
540 break; 551 break;
541 552
542 case 's': 553 case 's':
@@ -553,11 +564,11 @@ static int __init reboot_setup(char *str)
553 if (rc) 564 if (rc)
554 return rc; 565 return rc;
555 } else 566 } else
556 reboot_mode = REBOOT_SOFT; 567 *mode = REBOOT_SOFT;
557 break; 568 break;
558 } 569 }
559 case 'g': 570 case 'g':
560 reboot_mode = REBOOT_GPIO; 571 *mode = REBOOT_GPIO;
561 break; 572 break;
562 573
563 case 'b': 574 case 'b':
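To see the new panic_ prefix in action, here is a standalone sketch of the parsing added to reboot_setup() above, with the modes reduced to strings and only the w/c/h cases modeled (the real parser also handles 's', 'g', 'b' and comma-separated lists).

#include <stdio.h>
#include <string.h>

static const char *reboot_mode = "undefined";
static const char *panic_reboot_mode = "undefined";

/* Models the panic_ prefix handling added to reboot_setup() above. */
static void parse_one(const char *str)
{
	const char **mode;

	if (!strncmp(str, "panic_", 6)) {
		mode = &panic_reboot_mode;
		str += 6;
	} else {
		mode = &reboot_mode;
	}

	switch (*str) {
	case 'w': *mode = "warm"; break;
	case 'c': *mode = "cold"; break;
	case 'h': *mode = "hard"; break;
	}
}

int main(void)
{
	parse_one("warm");		/* reboot=warm */
	parse_one("panic_cold");	/* reboot=panic_cold */
	printf("reboot_mode=%s panic_reboot_mode=%s\n",
	       reboot_mode, panic_reboot_mode);
	return 0;
}

So reboot=warm keeps its old meaning, while reboot=panic_cold only affects the mode used when rebooting after a panic.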
diff --git a/kernel/resource.c b/kernel/resource.c
index 8c15f846e8ef..158f04ec1d4f 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * linux/kernel/resource.c 3 * linux/kernel/resource.c
3 * 4 *
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 9424ee90589e..27c48eb7de40 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -277,7 +277,7 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
277 277
278error: 278error:
279 sig = ksig ? ksig->sig : 0; 279 sig = ksig ? ksig->sig : 0;
280 force_sigsegv(sig, t); 280 force_sigsegv(sig);
281} 281}
282 282
283#ifdef CONFIG_DEBUG_RSEQ 283#ifdef CONFIG_DEBUG_RSEQ
@@ -296,7 +296,7 @@ void rseq_syscall(struct pt_regs *regs)
296 return; 296 return;
297 if (!access_ok(t->rseq, sizeof(*t->rseq)) || 297 if (!access_ok(t->rseq, sizeof(*t->rseq)) ||
298 rseq_get_rseq_cs(t, &rseq_cs) || in_rseq_cs(ip, &rseq_cs)) 298 rseq_get_rseq_cs(t, &rseq_cs) || in_rseq_cs(ip, &rseq_cs))
299 force_sig(SIGSEGV, t); 299 force_sig(SIGSEGV);
300} 300}
301 301
302#endif 302#endif
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
index 2d4ff5353ded..2067080bb235 100644
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -259,7 +259,6 @@ out:
259} 259}
260#endif /* CONFIG_PROC_FS */ 260#endif /* CONFIG_PROC_FS */
261 261
262#ifdef CONFIG_SCHED_DEBUG
263int autogroup_path(struct task_group *tg, char *buf, int buflen) 262int autogroup_path(struct task_group *tg, char *buf, int buflen)
264{ 263{
265 if (!task_group_is_autogroup(tg)) 264 if (!task_group_is_autogroup(tg))
@@ -267,4 +266,3 @@ int autogroup_path(struct task_group *tg, char *buf, int buflen)
267 266
268 return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); 267 return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
269} 268}
270#endif
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index e3e3b979f9bd..1152259a4ca0 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * sched_clock() for unstable CPU clocks 3 * sched_clock() for unstable CPU clocks
3 * 4 *
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 102dfcf0a29a..fa43ce3962e7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * kernel/sched/core.c 3 * kernel/sched/core.c
3 * 4 *
@@ -22,6 +23,17 @@
22#define CREATE_TRACE_POINTS 23#define CREATE_TRACE_POINTS
23#include <trace/events/sched.h> 24#include <trace/events/sched.h>
24 25
26/*
 27 * Export tracepoints that act as bare tracehooks (i.e., have no trace event
28 * associated with them) to allow external modules to probe them.
29 */
30EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_cfs_tp);
31EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
32EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
33EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
34EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
35EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
36
25DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 37DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
26 38
27#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_JUMP_LABEL) 39#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_JUMP_LABEL)
@@ -760,6 +772,401 @@ static void set_load_weight(struct task_struct *p, bool update_load)
760 } 772 }
761} 773}
762 774
775#ifdef CONFIG_UCLAMP_TASK
776/* Max allowed minimum utilization */
777unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
778
779/* Max allowed maximum utilization */
780unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
781
 782/* All clamps are required to be less than or equal to these values */
783static struct uclamp_se uclamp_default[UCLAMP_CNT];
784
785/* Integer rounded range for each bucket */
786#define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)
787
788#define for_each_clamp_id(clamp_id) \
789 for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++)
790
791static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
792{
793 return clamp_value / UCLAMP_BUCKET_DELTA;
794}
795
796static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value)
797{
798 return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value);
799}
800
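The bucketing arithmetic is easier to follow with numbers. The sketch below assumes SCHED_CAPACITY_SCALE of 1024 and a bucket count of 5 (both assumptions made for the example, not taken from this patch) and reproduces uclamp_bucket_id()/uclamp_bucket_base_value():

#include <stdio.h>

#define SCHED_CAPACITY_SCALE	1024	/* assumed for the example */
#define UCLAMP_BUCKETS		5	/* assumed bucket count */
/* DIV_ROUND_CLOSEST(1024, 5) == 205 */
#define UCLAMP_BUCKET_DELTA \
	((SCHED_CAPACITY_SCALE + UCLAMP_BUCKETS / 2) / UCLAMP_BUCKETS)

static unsigned int bucket_id(unsigned int clamp_value)
{
	return clamp_value / UCLAMP_BUCKET_DELTA;
}

int main(void)
{
	unsigned int v[] = { 0, 204, 205, 512, 1024 };

	for (unsigned int i = 0; i < sizeof(v) / sizeof(v[0]); i++)
		printf("clamp %4u -> bucket %u (base %u)\n",
		       v[i], bucket_id(v[i]),
		       UCLAMP_BUCKET_DELTA * bucket_id(v[i]));
	return 0;
}

With these assumed values the bucket delta is 205, so clamp 204 still lands in bucket 0 while 205 starts bucket 1 and 1024 ends up in the top bucket (4).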
801static inline unsigned int uclamp_none(int clamp_id)
802{
803 if (clamp_id == UCLAMP_MIN)
804 return 0;
805 return SCHED_CAPACITY_SCALE;
806}
807
808static inline void uclamp_se_set(struct uclamp_se *uc_se,
809 unsigned int value, bool user_defined)
810{
811 uc_se->value = value;
812 uc_se->bucket_id = uclamp_bucket_id(value);
813 uc_se->user_defined = user_defined;
814}
815
816static inline unsigned int
817uclamp_idle_value(struct rq *rq, unsigned int clamp_id,
818 unsigned int clamp_value)
819{
820 /*
821 * Avoid blocked utilization pushing up the frequency when we go
822 * idle (which drops the max-clamp) by retaining the last known
823 * max-clamp.
824 */
825 if (clamp_id == UCLAMP_MAX) {
826 rq->uclamp_flags |= UCLAMP_FLAG_IDLE;
827 return clamp_value;
828 }
829
830 return uclamp_none(UCLAMP_MIN);
831}
832
833static inline void uclamp_idle_reset(struct rq *rq, unsigned int clamp_id,
834 unsigned int clamp_value)
835{
836 /* Reset max-clamp retention only on idle exit */
837 if (!(rq->uclamp_flags & UCLAMP_FLAG_IDLE))
838 return;
839
840 WRITE_ONCE(rq->uclamp[clamp_id].value, clamp_value);
841}
842
843static inline
844unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id,
845 unsigned int clamp_value)
846{
847 struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
848 int bucket_id = UCLAMP_BUCKETS - 1;
849
850 /*
851 * Since both min and max clamps are max aggregated, find the
 852 * topmost bucket with tasks in it.
853 */
854 for ( ; bucket_id >= 0; bucket_id--) {
855 if (!bucket[bucket_id].tasks)
856 continue;
857 return bucket[bucket_id].value;
858 }
859
860 /* No tasks -- default clamp values */
861 return uclamp_idle_value(rq, clamp_id, clamp_value);
862}
863
864/*
865 * The effective clamp bucket index of a task depends on, by increasing
866 * priority:
867 * - the task specific clamp value, when explicitly requested from userspace
868 * - the system default clamp value, defined by the sysadmin
869 */
870static inline struct uclamp_se
871uclamp_eff_get(struct task_struct *p, unsigned int clamp_id)
872{
873 struct uclamp_se uc_req = p->uclamp_req[clamp_id];
874 struct uclamp_se uc_max = uclamp_default[clamp_id];
875
876 /* System default restrictions always apply */
877 if (unlikely(uc_req.value > uc_max.value))
878 return uc_max;
879
880 return uc_req;
881}
882
883unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id)
884{
885 struct uclamp_se uc_eff;
886
887 /* Task currently refcounted: use back-annotated (effective) value */
888 if (p->uclamp[clamp_id].active)
889 return p->uclamp[clamp_id].value;
890
891 uc_eff = uclamp_eff_get(p, clamp_id);
892
893 return uc_eff.value;
894}
895
896/*
897 * When a task is enqueued on a rq, the clamp bucket currently defined by the
898 * task's uclamp::bucket_id is refcounted on that rq. This also immediately
899 * updates the rq's clamp value if required.
900 *
 901 * Tasks can have a task-specific value requested from user-space; track
 902 * within each bucket the maximum value for the tasks refcounted in it.
 903 * This "local max aggregation" makes it possible to track the exact "requested"
 904 * value for each bucket when all of its RUNNABLE tasks require the same clamp.
905 */
906static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
907 unsigned int clamp_id)
908{
909 struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
910 struct uclamp_se *uc_se = &p->uclamp[clamp_id];
911 struct uclamp_bucket *bucket;
912
913 lockdep_assert_held(&rq->lock);
914
915 /* Update task effective clamp */
916 p->uclamp[clamp_id] = uclamp_eff_get(p, clamp_id);
917
918 bucket = &uc_rq->bucket[uc_se->bucket_id];
919 bucket->tasks++;
920 uc_se->active = true;
921
922 uclamp_idle_reset(rq, clamp_id, uc_se->value);
923
924 /*
925 * Local max aggregation: rq buckets always track the max
926 * "requested" clamp value of its RUNNABLE tasks.
927 */
928 if (bucket->tasks == 1 || uc_se->value > bucket->value)
929 bucket->value = uc_se->value;
930
931 if (uc_se->value > READ_ONCE(uc_rq->value))
932 WRITE_ONCE(uc_rq->value, uc_se->value);
933}
934
935/*
936 * When a task is dequeued from a rq, the clamp bucket refcounted by the task
937 * is released. If this is the last task reference counting the rq's max
938 * active clamp value, then the rq's clamp value is updated.
939 *
940 * Both refcounted tasks and rq's cached clamp values are expected to be
941 * always valid. If it's detected they are not, as defensive programming,
942 * enforce the expected state and warn.
943 */
944static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
945 unsigned int clamp_id)
946{
947 struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
948 struct uclamp_se *uc_se = &p->uclamp[clamp_id];
949 struct uclamp_bucket *bucket;
950 unsigned int bkt_clamp;
951 unsigned int rq_clamp;
952
953 lockdep_assert_held(&rq->lock);
954
955 bucket = &uc_rq->bucket[uc_se->bucket_id];
956 SCHED_WARN_ON(!bucket->tasks);
957 if (likely(bucket->tasks))
958 bucket->tasks--;
959 uc_se->active = false;
960
961 /*
 962 * Keep "local max aggregation" simple and accept that some RUNNABLE
 963 * tasks in the same bucket may (possibly) be overboosted.
964 * The rq clamp bucket value is reset to its base value whenever
965 * there are no more RUNNABLE tasks refcounting it.
966 */
967 if (likely(bucket->tasks))
968 return;
969
970 rq_clamp = READ_ONCE(uc_rq->value);
971 /*
972 * Defensive programming: this should never happen. If it happens,
973 * e.g. due to future modification, warn and fixup the expected value.
974 */
975 SCHED_WARN_ON(bucket->value > rq_clamp);
976 if (bucket->value >= rq_clamp) {
977 bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value);
978 WRITE_ONCE(uc_rq->value, bkt_clamp);
979 }
980}
981
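When the last task in the highest bucket is dequeued, the rq-wide clamp must fall back to the next non-empty bucket, as described in the comments above. A userspace model with made-up bucket contents:

#include <stdio.h>

#define NBUCKETS 5

struct bucket { unsigned int tasks, value; };

/* Models uclamp_rq_max_value(): the highest non-empty bucket wins. */
static unsigned int rq_max_value(const struct bucket *b, unsigned int idle_value)
{
	for (int i = NBUCKETS - 1; i >= 0; i--)
		if (b[i].tasks)
			return b[i].value;
	return idle_value;	/* no tasks: fall back to the default/idle value */
}

int main(void)
{
	struct bucket b[NBUCKETS] = {
		[1] = { .tasks = 2, .value = 300 },
		[4] = { .tasks = 1, .value = 900 },
	};
	unsigned int rq_clamp = 900;

	/* Dequeue the only task in bucket 4: the rq clamp must drop. */
	b[4].tasks--;
	if (b[4].value >= rq_clamp)
		rq_clamp = rq_max_value(b, 0);
	printf("rq clamp after dequeue: %u\n", rq_clamp);	/* prints 300 */
	return 0;
}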
982static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
983{
984 unsigned int clamp_id;
985
986 if (unlikely(!p->sched_class->uclamp_enabled))
987 return;
988
989 for_each_clamp_id(clamp_id)
990 uclamp_rq_inc_id(rq, p, clamp_id);
991
992 /* Reset clamp idle holding when there is one RUNNABLE task */
993 if (rq->uclamp_flags & UCLAMP_FLAG_IDLE)
994 rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
995}
996
997static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
998{
999 unsigned int clamp_id;
1000
1001 if (unlikely(!p->sched_class->uclamp_enabled))
1002 return;
1003
1004 for_each_clamp_id(clamp_id)
1005 uclamp_rq_dec_id(rq, p, clamp_id);
1006}
1007
1008int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
1009 void __user *buffer, size_t *lenp,
1010 loff_t *ppos)
1011{
1012 int old_min, old_max;
1013 static DEFINE_MUTEX(mutex);
1014 int result;
1015
1016 mutex_lock(&mutex);
1017 old_min = sysctl_sched_uclamp_util_min;
1018 old_max = sysctl_sched_uclamp_util_max;
1019
1020 result = proc_dointvec(table, write, buffer, lenp, ppos);
1021 if (result)
1022 goto undo;
1023 if (!write)
1024 goto done;
1025
1026 if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max ||
1027 sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE) {
1028 result = -EINVAL;
1029 goto undo;
1030 }
1031
1032 if (old_min != sysctl_sched_uclamp_util_min) {
1033 uclamp_se_set(&uclamp_default[UCLAMP_MIN],
1034 sysctl_sched_uclamp_util_min, false);
1035 }
1036 if (old_max != sysctl_sched_uclamp_util_max) {
1037 uclamp_se_set(&uclamp_default[UCLAMP_MAX],
1038 sysctl_sched_uclamp_util_max, false);
1039 }
1040
1041 /*
 1042 * Updating all the RUNNABLE tasks is expensive; keep it simple and just
 1043 * do a lazy update at each task's next enqueue.
1044 */
1045 goto done;
1046
1047undo:
1048 sysctl_sched_uclamp_util_min = old_min;
1049 sysctl_sched_uclamp_util_max = old_max;
1050done:
1051 mutex_unlock(&mutex);
1052
1053 return result;
1054}
1055
1056static int uclamp_validate(struct task_struct *p,
1057 const struct sched_attr *attr)
1058{
1059 unsigned int lower_bound = p->uclamp_req[UCLAMP_MIN].value;
1060 unsigned int upper_bound = p->uclamp_req[UCLAMP_MAX].value;
1061
1062 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN)
1063 lower_bound = attr->sched_util_min;
1064 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX)
1065 upper_bound = attr->sched_util_max;
1066
1067 if (lower_bound > upper_bound)
1068 return -EINVAL;
1069 if (upper_bound > SCHED_CAPACITY_SCALE)
1070 return -EINVAL;
1071
1072 return 0;
1073}
1074
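uclamp_validate() simply enforces min <= max <= SCHED_CAPACITY_SCALE on the (possibly partially updated) request. A simplified sketch that skips the SCHED_FLAG_UTIL_CLAMP_MIN/MAX selection and takes both bounds directly:

#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024	/* assumed for the example */

/* Models uclamp_validate(): the requested min must not exceed the requested max. */
static int validate(unsigned int util_min, unsigned int util_max)
{
	if (util_min > util_max)
		return -1;		/* -EINVAL in the kernel */
	if (util_max > SCHED_CAPACITY_SCALE)
		return -1;
	return 0;
}

int main(void)
{
	printf("%d\n", validate(200, 800));	/*  0: accepted */
	printf("%d\n", validate(900, 800));	/* -1: min > max */
	printf("%d\n", validate(0, 2048));	/* -1: above capacity scale */
	return 0;
}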
1075static void __setscheduler_uclamp(struct task_struct *p,
1076 const struct sched_attr *attr)
1077{
1078 unsigned int clamp_id;
1079
1080 /*
1081 * On scheduling class change, reset to default clamps for tasks
1082 * without a task-specific value.
1083 */
1084 for_each_clamp_id(clamp_id) {
1085 struct uclamp_se *uc_se = &p->uclamp_req[clamp_id];
1086 unsigned int clamp_value = uclamp_none(clamp_id);
1087
1088 /* Keep using defined clamps across class changes */
1089 if (uc_se->user_defined)
1090 continue;
1091
1092 /* By default, RT tasks always get 100% boost */
1093 if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
1094 clamp_value = uclamp_none(UCLAMP_MAX);
1095
1096 uclamp_se_set(uc_se, clamp_value, false);
1097 }
1098
1099 if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
1100 return;
1101
1102 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
1103 uclamp_se_set(&p->uclamp_req[UCLAMP_MIN],
1104 attr->sched_util_min, true);
1105 }
1106
1107 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
1108 uclamp_se_set(&p->uclamp_req[UCLAMP_MAX],
1109 attr->sched_util_max, true);
1110 }
1111}
1112
1113static void uclamp_fork(struct task_struct *p)
1114{
1115 unsigned int clamp_id;
1116
1117 for_each_clamp_id(clamp_id)
1118 p->uclamp[clamp_id].active = false;
1119
1120 if (likely(!p->sched_reset_on_fork))
1121 return;
1122
1123 for_each_clamp_id(clamp_id) {
1124 unsigned int clamp_value = uclamp_none(clamp_id);
1125
1126 /* By default, RT tasks always get 100% boost */
1127 if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
1128 clamp_value = uclamp_none(UCLAMP_MAX);
1129
1130 uclamp_se_set(&p->uclamp_req[clamp_id], clamp_value, false);
1131 }
1132}
1133
1134static void __init init_uclamp(void)
1135{
1136 struct uclamp_se uc_max = {};
1137 unsigned int clamp_id;
1138 int cpu;
1139
1140 for_each_possible_cpu(cpu) {
1141 memset(&cpu_rq(cpu)->uclamp, 0, sizeof(struct uclamp_rq));
1142 cpu_rq(cpu)->uclamp_flags = 0;
1143 }
1144
1145 for_each_clamp_id(clamp_id) {
1146 uclamp_se_set(&init_task.uclamp_req[clamp_id],
1147 uclamp_none(clamp_id), false);
1148 }
1149
1150 /* System defaults allow max clamp values for both indexes */
1151 uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false);
1152 for_each_clamp_id(clamp_id)
1153 uclamp_default[clamp_id] = uc_max;
1154}
1155
1156#else /* CONFIG_UCLAMP_TASK */
1157static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { }
1158static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { }
1159static inline int uclamp_validate(struct task_struct *p,
1160 const struct sched_attr *attr)
1161{
1162 return -EOPNOTSUPP;
1163}
1164static void __setscheduler_uclamp(struct task_struct *p,
1165 const struct sched_attr *attr) { }
1166static inline void uclamp_fork(struct task_struct *p) { }
1167static inline void init_uclamp(void) { }
1168#endif /* CONFIG_UCLAMP_TASK */
1169
763static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 1170static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
764{ 1171{
765 if (!(flags & ENQUEUE_NOCLOCK)) 1172 if (!(flags & ENQUEUE_NOCLOCK))
@@ -770,6 +1177,7 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
770 psi_enqueue(p, flags & ENQUEUE_WAKEUP); 1177 psi_enqueue(p, flags & ENQUEUE_WAKEUP);
771 } 1178 }
772 1179
1180 uclamp_rq_inc(rq, p);
773 p->sched_class->enqueue_task(rq, p, flags); 1181 p->sched_class->enqueue_task(rq, p, flags);
774} 1182}
775 1183
@@ -783,6 +1191,7 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
783 psi_dequeue(p, flags & DEQUEUE_SLEEP); 1191 psi_dequeue(p, flags & DEQUEUE_SLEEP);
784 } 1192 }
785 1193
1194 uclamp_rq_dec(rq, p);
786 p->sched_class->dequeue_task(rq, p, flags); 1195 p->sched_class->dequeue_task(rq, p, flags);
787} 1196}
788 1197
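uclamp_rq_inc()/uclamp_rq_dec() are hooked into the common enqueue/dequeue path so the runqueue always knows the clamp values requested by its runnable tasks. A simplified standalone model of that bookkeeping is sketched below — per-value reference counts plus a "max of the runnable requests" query. This assumes max-aggregation across runnable tasks; the real implementation groups values into a small number of buckets and runs under rq->lock, both of which this toy version deliberately omits.

#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024          /* assumed utilization scale */

/* Toy runqueue-side state: how many runnable tasks currently request each
 * clamp value. Every value gets its own counter here to keep the sketch
 * short; the kernel uses a handful of buckets instead. */
static unsigned int nr_tasks_at[SCHED_CAPACITY_SCALE + 1];

static void rq_clamp_inc(unsigned int value) { nr_tasks_at[value]++; }
static void rq_clamp_dec(unsigned int value) { nr_tasks_at[value]--; }

/* Effective runqueue clamp: the highest value requested by any runnable
 * task, falling back to @none when nothing is enqueued. */
static unsigned int rq_clamp_max(unsigned int none)
{
	for (int v = SCHED_CAPACITY_SCALE; v >= 0; v--)
		if (nr_tasks_at[v])
			return (unsigned int)v;
	return none;
}

int main(void)
{
	rq_clamp_inc(300);                                /* task A enqueued, boost request 300 */
	rq_clamp_inc(700);                                /* task B enqueued, boost request 700 */
	printf("rq min clamp = %u\n", rq_clamp_max(0));   /* 700 */
	rq_clamp_dec(700);                                /* task B dequeued */
	printf("rq min clamp = %u\n", rq_clamp_max(0));   /* 300 */
	rq_clamp_dec(300);
	printf("rq min clamp = %u\n", rq_clamp_max(0));   /* 0: no runnable task left */
	return 0;
}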
@@ -929,7 +1338,7 @@ static inline bool is_per_cpu_kthread(struct task_struct *p)
929 */ 1338 */
930static inline bool is_cpu_allowed(struct task_struct *p, int cpu) 1339static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
931{ 1340{
932 if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) 1341 if (!cpumask_test_cpu(cpu, p->cpus_ptr))
933 return false; 1342 return false;
934 1343
935 if (is_per_cpu_kthread(p)) 1344 if (is_per_cpu_kthread(p))
@@ -1024,7 +1433,7 @@ static int migration_cpu_stop(void *data)
1024 local_irq_disable(); 1433 local_irq_disable();
1025 /* 1434 /*
1026 * We need to explicitly wake pending tasks before running 1435 * We need to explicitly wake pending tasks before running
1027 * __migrate_task() such that we will not miss enforcing cpus_allowed 1436 * __migrate_task() such that we will not miss enforcing cpus_ptr
1028 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. 1437 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
1029 */ 1438 */
1030 sched_ttwu_pending(); 1439 sched_ttwu_pending();
@@ -1055,7 +1464,7 @@ static int migration_cpu_stop(void *data)
1055 */ 1464 */
1056void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) 1465void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
1057{ 1466{
1058 cpumask_copy(&p->cpus_allowed, new_mask); 1467 cpumask_copy(&p->cpus_mask, new_mask);
1059 p->nr_cpus_allowed = cpumask_weight(new_mask); 1468 p->nr_cpus_allowed = cpumask_weight(new_mask);
1060} 1469}
1061 1470
@@ -1125,7 +1534,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
1125 goto out; 1534 goto out;
1126 } 1535 }
1127 1536
1128 if (cpumask_equal(&p->cpus_allowed, new_mask)) 1537 if (cpumask_equal(p->cpus_ptr, new_mask))
1129 goto out; 1538 goto out;
1130 1539
1131 if (!cpumask_intersects(new_mask, cpu_valid_mask)) { 1540 if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
@@ -1285,10 +1694,10 @@ static int migrate_swap_stop(void *data)
1285 if (task_cpu(arg->src_task) != arg->src_cpu) 1694 if (task_cpu(arg->src_task) != arg->src_cpu)
1286 goto unlock; 1695 goto unlock;
1287 1696
1288 if (!cpumask_test_cpu(arg->dst_cpu, &arg->src_task->cpus_allowed)) 1697 if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr))
1289 goto unlock; 1698 goto unlock;
1290 1699
1291 if (!cpumask_test_cpu(arg->src_cpu, &arg->dst_task->cpus_allowed)) 1700 if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr))
1292 goto unlock; 1701 goto unlock;
1293 1702
1294 __migrate_swap_task(arg->src_task, arg->dst_cpu); 1703 __migrate_swap_task(arg->src_task, arg->dst_cpu);
@@ -1330,10 +1739,10 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p,
1330 if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu)) 1739 if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
1331 goto out; 1740 goto out;
1332 1741
1333 if (!cpumask_test_cpu(arg.dst_cpu, &arg.src_task->cpus_allowed)) 1742 if (!cpumask_test_cpu(arg.dst_cpu, arg.src_task->cpus_ptr))
1334 goto out; 1743 goto out;
1335 1744
1336 if (!cpumask_test_cpu(arg.src_cpu, &arg.dst_task->cpus_allowed)) 1745 if (!cpumask_test_cpu(arg.src_cpu, arg.dst_task->cpus_ptr))
1337 goto out; 1746 goto out;
1338 1747
1339 trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu); 1748 trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
@@ -1478,7 +1887,7 @@ void kick_process(struct task_struct *p)
1478EXPORT_SYMBOL_GPL(kick_process); 1887EXPORT_SYMBOL_GPL(kick_process);
1479 1888
1480/* 1889/*
1481 * ->cpus_allowed is protected by both rq->lock and p->pi_lock 1890 * ->cpus_ptr is protected by both rq->lock and p->pi_lock
1482 * 1891 *
1483 * A few notes on cpu_active vs cpu_online: 1892 * A few notes on cpu_active vs cpu_online:
1484 * 1893 *
@@ -1518,14 +1927,14 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
1518 for_each_cpu(dest_cpu, nodemask) { 1927 for_each_cpu(dest_cpu, nodemask) {
1519 if (!cpu_active(dest_cpu)) 1928 if (!cpu_active(dest_cpu))
1520 continue; 1929 continue;
1521 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 1930 if (cpumask_test_cpu(dest_cpu, p->cpus_ptr))
1522 return dest_cpu; 1931 return dest_cpu;
1523 } 1932 }
1524 } 1933 }
1525 1934
1526 for (;;) { 1935 for (;;) {
1527 /* Any allowed, online CPU? */ 1936 /* Any allowed, online CPU? */
1528 for_each_cpu(dest_cpu, &p->cpus_allowed) { 1937 for_each_cpu(dest_cpu, p->cpus_ptr) {
1529 if (!is_cpu_allowed(p, dest_cpu)) 1938 if (!is_cpu_allowed(p, dest_cpu))
1530 continue; 1939 continue;
1531 1940
@@ -1569,7 +1978,7 @@ out:
1569} 1978}
1570 1979
1571/* 1980/*
1572 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. 1981 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable.
1573 */ 1982 */
1574static inline 1983static inline
1575int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) 1984int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
@@ -1579,11 +1988,11 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
1579 if (p->nr_cpus_allowed > 1) 1988 if (p->nr_cpus_allowed > 1)
1580 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); 1989 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
1581 else 1990 else
1582 cpu = cpumask_any(&p->cpus_allowed); 1991 cpu = cpumask_any(p->cpus_ptr);
1583 1992
1584 /* 1993 /*
1585 * In order not to call set_task_cpu() on a blocking task we need 1994 * In order not to call set_task_cpu() on a blocking task we need
1586 * to rely on ttwu() to place the task on a valid ->cpus_allowed 1995 * to rely on ttwu() to place the task on a valid ->cpus_ptr
1587 * CPU. 1996 * CPU.
1588 * 1997 *
1589 * Since this is common to all placement strategies, this lives here. 1998 * Since this is common to all placement strategies, this lives here.
@@ -1990,6 +2399,29 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
1990 unsigned long flags; 2399 unsigned long flags;
1991 int cpu, success = 0; 2400 int cpu, success = 0;
1992 2401
2402 if (p == current) {
2403 /*
2404 * We're waking current, this means 'p->on_rq' and 'task_cpu(p)
2405 * == smp_processor_id()'. Together this means we can special
2406 * case the whole 'p->on_rq && ttwu_remote()' case below
2407 * without taking any locks.
2408 *
2409 * In particular:
2410 * - we rely on Program-Order guarantees for all the ordering,
2411 * - we're serialized against set_special_state() by virtue of
2412 * it disabling IRQs (this allows not taking ->pi_lock).
2413 */
2414 if (!(p->state & state))
2415 return false;
2416
2417 success = 1;
2418 cpu = task_cpu(p);
2419 trace_sched_waking(p);
2420 p->state = TASK_RUNNING;
2421 trace_sched_wakeup(p);
2422 goto out;
2423 }
2424
1993 /* 2425 /*
1994 * If we are going to wake up a thread waiting for CONDITION we 2426 * If we are going to wake up a thread waiting for CONDITION we
1995 * need to ensure that CONDITION=1 done by the caller can not be 2427 * need to ensure that CONDITION=1 done by the caller can not be
@@ -1999,7 +2431,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
1999 raw_spin_lock_irqsave(&p->pi_lock, flags); 2431 raw_spin_lock_irqsave(&p->pi_lock, flags);
2000 smp_mb__after_spinlock(); 2432 smp_mb__after_spinlock();
2001 if (!(p->state & state)) 2433 if (!(p->state & state))
2002 goto out; 2434 goto unlock;
2003 2435
2004 trace_sched_waking(p); 2436 trace_sched_waking(p);
2005 2437
@@ -2029,7 +2461,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
2029 */ 2461 */
2030 smp_rmb(); 2462 smp_rmb();
2031 if (p->on_rq && ttwu_remote(p, wake_flags)) 2463 if (p->on_rq && ttwu_remote(p, wake_flags))
2032 goto stat; 2464 goto unlock;
2033 2465
2034#ifdef CONFIG_SMP 2466#ifdef CONFIG_SMP
2035 /* 2467 /*
@@ -2089,10 +2521,11 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
2089#endif /* CONFIG_SMP */ 2521#endif /* CONFIG_SMP */
2090 2522
2091 ttwu_queue(p, cpu, wake_flags); 2523 ttwu_queue(p, cpu, wake_flags);
2092stat: 2524unlock:
2093 ttwu_stat(p, cpu, wake_flags);
2094out:
2095 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 2525 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2526out:
2527 if (success)
2528 ttwu_stat(p, cpu, wake_flags);
2096 2529
2097 return success; 2530 return success;
2098} 2531}
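Besides the p == current fast path, the hunk above moves ttwu_stat() out from under p->pi_lock: the lock now covers only the state transition, and the statistics update runs afterwards, gated on success. The same restructuring in plain pthreads terms, purely as an illustration of the control flow (all names here are invented for the sketch):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int state;                   /* 0 = sleeping, 1 = running (toy stand-in) */
static unsigned long wakeup_count;  /* accounting that does not need the lock */

/* Flip state under the lock, but do the bookkeeping only after the lock is
 * dropped, mirroring ttwu_stat() after the new 'unlock:'/'out:' labels.
 * Single-threaded demo: real code would use per-CPU schedstat counters. */
static bool try_wake(void)
{
	bool success = false;

	pthread_mutex_lock(&lock);
	if (state == 0) {
		state = 1;
		success = true;
	}
	pthread_mutex_unlock(&lock);

	if (success)
		wakeup_count++;          /* accounting outside the critical section */
	return success;
}

int main(void)
{
	printf("first wake:  %d\n", try_wake());   /* 1: transition happened */
	printf("second wake: %d\n", try_wake());   /* 0: already running */
	printf("wakeups accounted: %lu\n", wakeup_count);
	return 0;
}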
@@ -2299,6 +2732,8 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
2299 */ 2732 */
2300 p->prio = current->normal_prio; 2733 p->prio = current->normal_prio;
2301 2734
2735 uclamp_fork(p);
2736
2302 /* 2737 /*
2303 * Revert to default priority/policy on fork if requested. 2738 * Revert to default priority/policy on fork if requested.
2304 */ 2739 */
@@ -2394,7 +2829,7 @@ void wake_up_new_task(struct task_struct *p)
2394#ifdef CONFIG_SMP 2829#ifdef CONFIG_SMP
2395 /* 2830 /*
2396 * Fork balancing, do it here and not earlier because: 2831 * Fork balancing, do it here and not earlier because:
2397 * - cpus_allowed can change in the fork path 2832 * - cpus_ptr can change in the fork path
2398 * - any previously selected CPU might disappear through hotplug 2833 * - any previously selected CPU might disappear through hotplug
2399 * 2834 *
2400 * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, 2835 * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
@@ -3032,7 +3467,6 @@ void scheduler_tick(void)
3032 3467
3033 update_rq_clock(rq); 3468 update_rq_clock(rq);
3034 curr->sched_class->task_tick(rq, curr, 0); 3469 curr->sched_class->task_tick(rq, curr, 0);
3035 cpu_load_update_active(rq);
3036 calc_global_load_tick(rq); 3470 calc_global_load_tick(rq);
3037 psi_task_tick(rq); 3471 psi_task_tick(rq);
3038 3472
@@ -4070,6 +4504,13 @@ static void __setscheduler_params(struct task_struct *p,
4070static void __setscheduler(struct rq *rq, struct task_struct *p, 4504static void __setscheduler(struct rq *rq, struct task_struct *p,
4071 const struct sched_attr *attr, bool keep_boost) 4505 const struct sched_attr *attr, bool keep_boost)
4072{ 4506{
4507 /*
4508 * If params can't change, scheduling class changes aren't allowed
4509 * either.
4510 */
4511 if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)
4512 return;
4513
4073 __setscheduler_params(p, attr); 4514 __setscheduler_params(p, attr);
4074 4515
4075 /* 4516 /*
@@ -4207,6 +4648,13 @@ recheck:
4207 return retval; 4648 return retval;
4208 } 4649 }
4209 4650
4651 /* Update task specific "requested" clamps */
4652 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) {
4653 retval = uclamp_validate(p, attr);
4654 if (retval)
4655 return retval;
4656 }
4657
4210 /* 4658 /*
4211 * Make sure no PI-waiters arrive (or leave) while we are 4659 * Make sure no PI-waiters arrive (or leave) while we are
4212 * changing the priority of the task: 4660 * changing the priority of the task:
@@ -4236,6 +4684,8 @@ recheck:
4236 goto change; 4684 goto change;
4237 if (dl_policy(policy) && dl_param_changed(p, attr)) 4685 if (dl_policy(policy) && dl_param_changed(p, attr))
4238 goto change; 4686 goto change;
4687 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
4688 goto change;
4239 4689
4240 p->sched_reset_on_fork = reset_on_fork; 4690 p->sched_reset_on_fork = reset_on_fork;
4241 task_rq_unlock(rq, p, &rf); 4691 task_rq_unlock(rq, p, &rf);
@@ -4266,7 +4716,7 @@ change:
4266 * the entire root_domain to become SCHED_DEADLINE. We 4716 * the entire root_domain to become SCHED_DEADLINE. We
4267 * will also fail if there's no bandwidth available. 4717 * will also fail if there's no bandwidth available.
4268 */ 4718 */
4269 if (!cpumask_subset(span, &p->cpus_allowed) || 4719 if (!cpumask_subset(span, p->cpus_ptr) ||
4270 rq->rd->dl_bw.bw == 0) { 4720 rq->rd->dl_bw.bw == 0) {
4271 task_rq_unlock(rq, p, &rf); 4721 task_rq_unlock(rq, p, &rf);
4272 return -EPERM; 4722 return -EPERM;
@@ -4316,7 +4766,9 @@ change:
4316 put_prev_task(rq, p); 4766 put_prev_task(rq, p);
4317 4767
4318 prev_class = p->sched_class; 4768 prev_class = p->sched_class;
4769
4319 __setscheduler(rq, p, attr, pi); 4770 __setscheduler(rq, p, attr, pi);
4771 __setscheduler_uclamp(p, attr);
4320 4772
4321 if (queued) { 4773 if (queued) {
4322 /* 4774 /*
@@ -4492,6 +4944,10 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a
4492 if (ret) 4944 if (ret)
4493 return -EFAULT; 4945 return -EFAULT;
4494 4946
4947 if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) &&
4948 size < SCHED_ATTR_SIZE_VER1)
4949 return -EINVAL;
4950
4495 /* 4951 /*
4496 * XXX: Do we want to be lenient like existing syscalls; or do we want 4952 * XXX: Do we want to be lenient like existing syscalls; or do we want
4497 * to be strict and return an error on out-of-bounds values? 4953 * to be strict and return an error on out-of-bounds values?
@@ -4555,14 +5011,21 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
4555 5011
4556 if ((int)attr.sched_policy < 0) 5012 if ((int)attr.sched_policy < 0)
4557 return -EINVAL; 5013 return -EINVAL;
5014 if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY)
5015 attr.sched_policy = SETPARAM_POLICY;
4558 5016
4559 rcu_read_lock(); 5017 rcu_read_lock();
4560 retval = -ESRCH; 5018 retval = -ESRCH;
4561 p = find_process_by_pid(pid); 5019 p = find_process_by_pid(pid);
4562 if (p != NULL) 5020 if (likely(p))
4563 retval = sched_setattr(p, &attr); 5021 get_task_struct(p);
4564 rcu_read_unlock(); 5022 rcu_read_unlock();
4565 5023
5024 if (likely(p)) {
5025 retval = sched_setattr(p, &attr);
5026 put_task_struct(p);
5027 }
5028
4566 return retval; 5029 return retval;
4567} 5030}
4568 5031
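The syscall-facing changes above — the SCHED_ATTR_SIZE_VER1 size check, the SCHED_FLAG_KEEP_POLICY translation to SETPARAM_POLICY, and the get/put_task_struct pairing around sched_setattr() — are what a caller hits when requesting clamps from userspace. A hedged sketch of such a caller follows; the struct layout and the 0x20/0x40 flag values mirror the uapi additions of this series and are assumptions if your headers predate them, and the raw syscall is used because glibc has no wrapper.

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef SCHED_FLAG_UTIL_CLAMP_MIN
#define SCHED_FLAG_UTIL_CLAMP_MIN 0x20   /* assumed uapi values */
#define SCHED_FLAG_UTIL_CLAMP_MAX 0x40
#endif

struct sched_attr_v1 {
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;
	uint64_t sched_deadline;
	uint64_t sched_period;
	uint32_t sched_util_min;   /* new in SCHED_ATTR_SIZE_VER1 */
	uint32_t sched_util_max;   /* new in SCHED_ATTR_SIZE_VER1 */
};

int main(void)
{
	struct sched_attr_v1 attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);            /* must be >= SCHED_ATTR_SIZE_VER1 (56) */
	attr.sched_policy = 0;               /* SCHED_NORMAL */
	attr.sched_flags = SCHED_FLAG_UTIL_CLAMP_MIN | SCHED_FLAG_UTIL_CLAMP_MAX;
	attr.sched_util_min = 128;           /* ask for a modest boost */
	attr.sched_util_max = 512;           /* and cap utilization at half scale */

	if (syscall(SYS_sched_setattr, 0, &attr, 0) != 0)
		perror("sched_setattr");         /* e.g. EINVAL if min > max */
	else
		puts("util clamps applied to the calling task");
	return 0;
}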
@@ -4713,6 +5176,11 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
4713 else 5176 else
4714 attr.sched_nice = task_nice(p); 5177 attr.sched_nice = task_nice(p);
4715 5178
5179#ifdef CONFIG_UCLAMP_TASK
5180 attr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
5181 attr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
5182#endif
5183
4716 rcu_read_unlock(); 5184 rcu_read_unlock();
4717 5185
4718 retval = sched_read_attr(uattr, &attr, size); 5186 retval = sched_read_attr(uattr, &attr, size);
@@ -4865,7 +5333,7 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
4865 goto out_unlock; 5333 goto out_unlock;
4866 5334
4867 raw_spin_lock_irqsave(&p->pi_lock, flags); 5335 raw_spin_lock_irqsave(&p->pi_lock, flags);
4868 cpumask_and(mask, &p->cpus_allowed, cpu_active_mask); 5336 cpumask_and(mask, &p->cpus_mask, cpu_active_mask);
4869 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 5337 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4870 5338
4871out_unlock: 5339out_unlock:
@@ -5122,7 +5590,7 @@ long __sched io_schedule_timeout(long timeout)
5122} 5590}
5123EXPORT_SYMBOL(io_schedule_timeout); 5591EXPORT_SYMBOL(io_schedule_timeout);
5124 5592
5125void io_schedule(void) 5593void __sched io_schedule(void)
5126{ 5594{
5127 int token; 5595 int token;
5128 5596
@@ -5442,7 +5910,7 @@ int task_can_attach(struct task_struct *p,
5442 * allowed nodes is unnecessary. Thus, cpusets are not 5910 * allowed nodes is unnecessary. Thus, cpusets are not
5443 * applicable for such threads. This prevents checking for 5911 * applicable for such threads. This prevents checking for
5444 * success of set_cpus_allowed_ptr() on all attached tasks 5912 * success of set_cpus_allowed_ptr() on all attached tasks
5445 * before cpus_allowed may be changed. 5913 * before cpus_mask may be changed.
5446 */ 5914 */
5447 if (p->flags & PF_NO_SETAFFINITY) { 5915 if (p->flags & PF_NO_SETAFFINITY) {
5448 ret = -EINVAL; 5916 ret = -EINVAL;
@@ -5469,7 +5937,7 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
5469 if (curr_cpu == target_cpu) 5937 if (curr_cpu == target_cpu)
5470 return 0; 5938 return 0;
5471 5939
5472 if (!cpumask_test_cpu(target_cpu, &p->cpus_allowed)) 5940 if (!cpumask_test_cpu(target_cpu, p->cpus_ptr))
5473 return -EINVAL; 5941 return -EINVAL;
5474 5942
5475 /* TODO: This is not properly updating schedstats */ 5943 /* TODO: This is not properly updating schedstats */
@@ -5607,7 +6075,7 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
5607 put_prev_task(rq, next); 6075 put_prev_task(rq, next);
5608 6076
5609 /* 6077 /*
5610 * Rules for changing task_struct::cpus_allowed are holding 6078 * Rules for changing task_struct::cpus_mask are holding
5611 * both pi_lock and rq->lock, such that holding either 6079 * both pi_lock and rq->lock, such that holding either
5612 * stabilizes the mask. 6080 * stabilizes the mask.
5613 * 6081 *
@@ -5901,8 +6369,8 @@ DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
5901 6369
5902void __init sched_init(void) 6370void __init sched_init(void)
5903{ 6371{
5904 int i, j;
5905 unsigned long alloc_size = 0, ptr; 6372 unsigned long alloc_size = 0, ptr;
6373 int i;
5906 6374
5907 wait_bit_init(); 6375 wait_bit_init();
5908 6376
@@ -6004,10 +6472,6 @@ void __init sched_init(void)
6004#ifdef CONFIG_RT_GROUP_SCHED 6472#ifdef CONFIG_RT_GROUP_SCHED
6005 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); 6473 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
6006#endif 6474#endif
6007
6008 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6009 rq->cpu_load[j] = 0;
6010
6011#ifdef CONFIG_SMP 6475#ifdef CONFIG_SMP
6012 rq->sd = NULL; 6476 rq->sd = NULL;
6013 rq->rd = NULL; 6477 rq->rd = NULL;
@@ -6062,6 +6526,8 @@ void __init sched_init(void)
6062 6526
6063 psi_init(); 6527 psi_init();
6064 6528
6529 init_uclamp();
6530
6065 scheduler_running = 1; 6531 scheduler_running = 1;
6066} 6532}
6067 6533
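init_uclamp(), now called at the end of sched_init(), seeds every clamp index with its "no clamping" value and sets the system-wide defaults to the full capacity scale. A small sketch of those defaults, under the assumption that uclamp_none() returns 0 for UCLAMP_MIN (no boost) and SCHED_CAPACITY_SCALE for UCLAMP_MAX (no cap):

#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024   /* assumed capacity/utilization scale */

enum uclamp_id { UCLAMP_MIN, UCLAMP_MAX, UCLAMP_CNT };

/* Assumed behaviour of uclamp_none(): the value that means "no clamping"
 * for each index. */
static unsigned int uclamp_none(enum uclamp_id id)
{
	return id == UCLAMP_MIN ? 0 : SCHED_CAPACITY_SCALE;
}

static unsigned int task_request[UCLAMP_CNT];    /* per-task requested clamps */
static unsigned int system_default[UCLAMP_CNT];  /* system-wide defaults */

int main(void)
{
	/* Mirror of the init_uclamp() loops: tasks start unclamped, and the
	 * system defaults allow the whole range for both indexes. */
	for (int id = 0; id < UCLAMP_CNT; id++) {
		task_request[id] = uclamp_none((enum uclamp_id)id);
		system_default[id] = uclamp_none(UCLAMP_MAX);
	}

	printf("task:   min=%u max=%u\n", task_request[UCLAMP_MIN], task_request[UCLAMP_MAX]);
	printf("system: min=%u max=%u\n", system_default[UCLAMP_MIN], system_default[UCLAMP_MAX]);
	return 0;
}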
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 50316455ea66..5cc4012572ec 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -1,14 +1,10 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * kernel/sched/cpudl.c 3 * kernel/sched/cpudl.c
3 * 4 *
4 * Global CPU deadline management 5 * Global CPU deadline management
5 * 6 *
6 * Author: Juri Lelli <j.lelli@sssup.it> 7 * Author: Juri Lelli <j.lelli@sssup.it>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; version 2
11 * of the License.
12 */ 8 */
13#include "sched.h" 9#include "sched.h"
14 10
@@ -124,14 +120,14 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
124 const struct sched_dl_entity *dl_se = &p->dl; 120 const struct sched_dl_entity *dl_se = &p->dl;
125 121
126 if (later_mask && 122 if (later_mask &&
127 cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) { 123 cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) {
128 return 1; 124 return 1;
129 } else { 125 } else {
130 int best_cpu = cpudl_maximum(cp); 126 int best_cpu = cpudl_maximum(cp);
131 127
132 WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); 128 WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
133 129
134 if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) && 130 if (cpumask_test_cpu(best_cpu, p->cpus_ptr) &&
135 dl_time_before(dl_se->deadline, cp->elements[0].dl)) { 131 dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
136 if (later_mask) 132 if (later_mask)
137 cpumask_set_cpu(best_cpu, later_mask); 133 cpumask_set_cpu(best_cpu, later_mask);
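Throughout this series, readers of the affinity mask switch from &p->cpus_allowed to the pointer p->cpus_ptr, while writers such as set_cpus_allowed_common() keep updating the underlying p->cpus_mask. The point of the indirection — an assumption drawn from the broader series rather than these hunks alone — is that the pointer can be redirected temporarily (for example to a single-CPU mask) without losing the task's real affinity. A standalone model of that split, with the cpumask reduced to a plain bitmask:

#include <stdio.h>

/* Toy task: 'mask' is what userspace asked for (cpus_mask), 'ptr' is what
 * the scheduler consults (cpus_ptr). Normally ptr points at mask. */
struct toy_task {
	unsigned long mask;           /* one bit per CPU */
	const unsigned long *ptr;
};

static int cpu_allowed(const struct toy_task *p, int cpu)
{
	return (*p->ptr >> cpu) & 1;  /* readers only ever look through the pointer */
}

int main(void)
{
	static const unsigned long pin_to_cpu2 = 1UL << 2;
	struct toy_task p = { .mask = 0xf };   /* allowed on CPUs 0-3 */

	p.ptr = &p.mask;                       /* default: pointer follows the mask */
	printf("cpu1 allowed: %d\n", cpu_allowed(&p, 1));   /* 1 */

	p.ptr = &pin_to_cpu2;                  /* temporarily pin without touching mask */
	printf("cpu1 allowed: %d\n", cpu_allowed(&p, 1));   /* 0 */
	printf("cpu2 allowed: %d\n", cpu_allowed(&p, 2));   /* 1 */

	p.ptr = &p.mask;                       /* restore: original affinity intact */
	printf("cpu1 allowed: %d\n", cpu_allowed(&p, 1));   /* 1 */
	return 0;
}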
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 962cf343f798..636ca6f88c8e 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -196,14 +196,17 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
196 * based on the task model parameters and gives the minimal utilization 196 * based on the task model parameters and gives the minimal utilization
197 * required to meet deadlines. 197 * required to meet deadlines.
198 */ 198 */
199unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs, 199unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
200 unsigned long max, enum schedutil_type type) 200 unsigned long max, enum schedutil_type type,
201 struct task_struct *p)
201{ 202{
202 unsigned long dl_util, util, irq; 203 unsigned long dl_util, util, irq;
203 struct rq *rq = cpu_rq(cpu); 204 struct rq *rq = cpu_rq(cpu);
204 205
205 if (type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) 206 if (!IS_BUILTIN(CONFIG_UCLAMP_TASK) &&
207 type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) {
206 return max; 208 return max;
209 }
207 210
208 /* 211 /*
209 * Early check to see if IRQ/steal time saturates the CPU, can be 212 * Early check to see if IRQ/steal time saturates the CPU, can be
@@ -219,9 +222,16 @@ unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs,
219 * CFS tasks and we use the same metric to track the effective 222 * CFS tasks and we use the same metric to track the effective
220 * utilization (PELT windows are synchronized) we can directly add them 223 * utilization (PELT windows are synchronized) we can directly add them
221 * to obtain the CPU's actual utilization. 224 * to obtain the CPU's actual utilization.
225 *
226 * CFS and RT utilization can be boosted or capped, depending on
227 * utilization clamp constraints requested by currently RUNNABLE
228 * tasks.
229 * When there are no CFS RUNNABLE tasks, clamps are released and
230 * frequency will be gracefully reduced with the utilization decay.
222 */ 231 */
223 util = util_cfs; 232 util = util_cfs + cpu_util_rt(rq);
224 util += cpu_util_rt(rq); 233 if (type == FREQUENCY_UTIL)
234 util = uclamp_util_with(rq, util, p);
225 235
226 dl_util = cpu_util_dl(rq); 236 dl_util = cpu_util_dl(rq);
227 237
@@ -276,12 +286,12 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu)
276{ 286{
277 struct rq *rq = cpu_rq(sg_cpu->cpu); 287 struct rq *rq = cpu_rq(sg_cpu->cpu);
278 unsigned long util = cpu_util_cfs(rq); 288 unsigned long util = cpu_util_cfs(rq);
279 unsigned long max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu); 289 unsigned long max = arch_scale_cpu_capacity(sg_cpu->cpu);
280 290
281 sg_cpu->max = max; 291 sg_cpu->max = max;
282 sg_cpu->bw_dl = cpu_bw_dl(rq); 292 sg_cpu->bw_dl = cpu_bw_dl(rq);
283 293
284 return schedutil_freq_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL); 294 return schedutil_cpu_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL, NULL);
285} 295}
286 296
287/** 297/**
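schedutil_cpu_util() now sums CFS and RT utilization and, for frequency selection, filters that sum through the runqueue clamps before the deadline and IRQ terms are folded in. A simplified, self-contained version of just that step, where uclamp_util_with() is approximated by an ordinary clamp between the runqueue's aggregated min and max values (an assumption made for the sketch):

#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024

/* Clamp the aggregated CFS+RT utilization into [util_min, util_max], which
 * is roughly what uclamp_util_with() does with the runqueue clamps. */
static unsigned long freq_util(unsigned long util_cfs, unsigned long util_rt,
			       unsigned long util_min, unsigned long util_max)
{
	unsigned long util = util_cfs + util_rt;

	if (util < util_min)
		util = util_min;      /* boosted: run faster than raw demand */
	if (util > util_max)
		util = util_max;      /* capped: ignore demand above the cap */
	if (util > SCHED_CAPACITY_SCALE)
		util = SCHED_CAPACITY_SCALE;
	return util;
}

int main(void)
{
	/* A lightly loaded CPU boosted to at least 40% of capacity: */
	printf("util = %lu\n", freq_util(100, 50, 410, 1024));   /* 410 */
	/* A busy CPU capped at 50% of capacity: */
	printf("util = %lu\n", freq_util(700, 100, 0, 512));     /* 512 */
	return 0;
}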
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index daaadf939ccb..b7abca987d94 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * kernel/sched/cpupri.c 3 * kernel/sched/cpupri.c
3 * 4 *
@@ -20,11 +21,6 @@
20 * searches). For tasks with affinity restrictions, the algorithm has a 21 * searches). For tasks with affinity restrictions, the algorithm has a
21 * worst case complexity of O(min(102, nr_domcpus)), though the scenario that 22 * worst case complexity of O(min(102, nr_domcpus)), though the scenario that
22 * yields the worst case search is fairly contrived. 23 * yields the worst case search is fairly contrived.
23 *
24 * This program is free software; you can redistribute it and/or
25 * modify it under the terms of the GNU General Public License
26 * as published by the Free Software Foundation; version 2
27 * of the License.
28 */ 24 */
29#include "sched.h" 25#include "sched.h"
30 26
@@ -98,11 +94,11 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
98 if (skip) 94 if (skip)
99 continue; 95 continue;
100 96
101 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) 97 if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids)
102 continue; 98 continue;
103 99
104 if (lowest_mask) { 100 if (lowest_mask) {
105 cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask); 101 cpumask_and(lowest_mask, p->cpus_ptr, vec->mask);
106 102
107 /* 103 /*
108 * We have to ensure that we have at least one bit 104 * We have to ensure that we have at least one bit
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index ba4a143bdcf3..2305ce89a26c 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * Simple CPU accounting cgroup controller 3 * Simple CPU accounting cgroup controller
3 */ 4 */
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 43901fa3f269..ef5b9f6b1d42 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -538,7 +538,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
538 * If we cannot preempt any rq, fall back to pick any 538 * If we cannot preempt any rq, fall back to pick any
539 * online CPU: 539 * online CPU:
540 */ 540 */
541 cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed); 541 cpu = cpumask_any_and(cpu_active_mask, p->cpus_ptr);
542 if (cpu >= nr_cpu_ids) { 542 if (cpu >= nr_cpu_ids) {
543 /* 543 /*
544 * Failed to find any suitable CPU. 544 * Failed to find any suitable CPU.
@@ -726,7 +726,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
726 * refill the runtime and set the deadline a period in the future, 726 * refill the runtime and set the deadline a period in the future,
727 * because keeping the current (absolute) deadline of the task would 727 * because keeping the current (absolute) deadline of the task would
728 * result in breaking guarantees promised to other tasks (refer to 728 * result in breaking guarantees promised to other tasks (refer to
729 * Documentation/scheduler/sched-deadline.txt for more information). 729 * Documentation/scheduler/sched-deadline.rst for more information).
730 * 730 *
731 * This function returns true if: 731 * This function returns true if:
732 * 732 *
@@ -1195,7 +1195,7 @@ static void update_curr_dl(struct rq *rq)
1195 &curr->dl); 1195 &curr->dl);
1196 } else { 1196 } else {
1197 unsigned long scale_freq = arch_scale_freq_capacity(cpu); 1197 unsigned long scale_freq = arch_scale_freq_capacity(cpu);
1198 unsigned long scale_cpu = arch_scale_cpu_capacity(NULL, cpu); 1198 unsigned long scale_cpu = arch_scale_cpu_capacity(cpu);
1199 1199
1200 scaled_delta_exec = cap_scale(delta_exec, scale_freq); 1200 scaled_delta_exec = cap_scale(delta_exec, scale_freq);
1201 scaled_delta_exec = cap_scale(scaled_delta_exec, scale_cpu); 1201 scaled_delta_exec = cap_scale(scaled_delta_exec, scale_cpu);
@@ -1824,7 +1824,7 @@ static void set_curr_task_dl(struct rq *rq)
1824static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) 1824static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
1825{ 1825{
1826 if (!task_running(rq, p) && 1826 if (!task_running(rq, p) &&
1827 cpumask_test_cpu(cpu, &p->cpus_allowed)) 1827 cpumask_test_cpu(cpu, p->cpus_ptr))
1828 return 1; 1828 return 1;
1829 return 0; 1829 return 0;
1830} 1830}
@@ -1974,7 +1974,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
1974 /* Retry if something changed. */ 1974 /* Retry if something changed. */
1975 if (double_lock_balance(rq, later_rq)) { 1975 if (double_lock_balance(rq, later_rq)) {
1976 if (unlikely(task_rq(task) != rq || 1976 if (unlikely(task_rq(task) != rq ||
1977 !cpumask_test_cpu(later_rq->cpu, &task->cpus_allowed) || 1977 !cpumask_test_cpu(later_rq->cpu, task->cpus_ptr) ||
1978 task_running(rq, task) || 1978 task_running(rq, task) ||
1979 !dl_task(task) || 1979 !dl_task(task) ||
1980 !task_on_rq_queued(task))) { 1980 !task_on_rq_queued(task))) {
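The deadline.c hunks also pick up the arch_scale_cpu_capacity() signature change (the sched_domain argument is gone); the scaled runtime accounting it feeds is unchanged: consumed runtime is shrunk by both the current frequency scale and the CPU's capacity scale. Assuming cap_scale(v, s) is (v * s) >> 10 on the usual 1024-point scale, the arithmetic looks like this:

#include <stdio.h>

#define SCHED_CAPACITY_SHIFT 10
#define SCHED_CAPACITY_SCALE (1UL << SCHED_CAPACITY_SHIFT)

/* Assumed definition of cap_scale(): multiply by a 0..1024 scale factor. */
static unsigned long long cap_scale(unsigned long long v, unsigned long scale)
{
	return (v * scale) >> SCHED_CAPACITY_SHIFT;
}

int main(void)
{
	unsigned long long delta_exec = 1000000;   /* 1 ms of wall-clock runtime */
	unsigned long scale_freq = 512;            /* CPU running at half its max frequency */
	unsigned long scale_cpu  = 768;            /* little core: 75% of the biggest CPU */

	/* Mirror of update_curr_dl(): scale by frequency, then by capacity. */
	unsigned long long scaled = cap_scale(delta_exec, scale_freq);
	scaled = cap_scale(scaled, scale_cpu);

	printf("charged runtime: %llu ns (of %llu ns elapsed)\n", scaled, delta_exec);
	return 0;
}

With these example factors only 375000 ns of the elapsed millisecond is charged against the reservation, which is the intent of capacity- and frequency-invariant accounting.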
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 678bfb9bd87f..f7e4579e746c 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -1,13 +1,10 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * kernel/sched/debug.c 3 * kernel/sched/debug.c
3 * 4 *
4 * Print the CFS rbtree and other debugging details 5 * Print the CFS rbtree and other debugging details
5 * 6 *
6 * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar 7 * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 */ 8 */
12#include "sched.h" 9#include "sched.h"
13 10
@@ -236,49 +233,35 @@ static void sd_free_ctl_entry(struct ctl_table **tablep)
236 *tablep = NULL; 233 *tablep = NULL;
237} 234}
238 235
239static int min_load_idx = 0;
240static int max_load_idx = CPU_LOAD_IDX_MAX-1;
241
242static void 236static void
243set_table_entry(struct ctl_table *entry, 237set_table_entry(struct ctl_table *entry,
244 const char *procname, void *data, int maxlen, 238 const char *procname, void *data, int maxlen,
245 umode_t mode, proc_handler *proc_handler, 239 umode_t mode, proc_handler *proc_handler)
246 bool load_idx)
247{ 240{
248 entry->procname = procname; 241 entry->procname = procname;
249 entry->data = data; 242 entry->data = data;
250 entry->maxlen = maxlen; 243 entry->maxlen = maxlen;
251 entry->mode = mode; 244 entry->mode = mode;
252 entry->proc_handler = proc_handler; 245 entry->proc_handler = proc_handler;
253
254 if (load_idx) {
255 entry->extra1 = &min_load_idx;
256 entry->extra2 = &max_load_idx;
257 }
258} 246}
259 247
260static struct ctl_table * 248static struct ctl_table *
261sd_alloc_ctl_domain_table(struct sched_domain *sd) 249sd_alloc_ctl_domain_table(struct sched_domain *sd)
262{ 250{
263 struct ctl_table *table = sd_alloc_ctl_entry(14); 251 struct ctl_table *table = sd_alloc_ctl_entry(9);
264 252
265 if (table == NULL) 253 if (table == NULL)
266 return NULL; 254 return NULL;
267 255
268 set_table_entry(&table[0] , "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax, false); 256 set_table_entry(&table[0], "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax);
269 set_table_entry(&table[1] , "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax, false); 257 set_table_entry(&table[1], "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax);
270 set_table_entry(&table[2] , "busy_idx", &sd->busy_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); 258 set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax);
271 set_table_entry(&table[3] , "idle_idx", &sd->idle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); 259 set_table_entry(&table[3], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax);
272 set_table_entry(&table[4] , "newidle_idx", &sd->newidle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); 260 set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax);
273 set_table_entry(&table[5] , "wake_idx", &sd->wake_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); 261 set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax);
274 set_table_entry(&table[6] , "forkexec_idx", &sd->forkexec_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); 262 set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax);
275 set_table_entry(&table[7] , "busy_factor", &sd->busy_factor, sizeof(int) , 0644, proc_dointvec_minmax, false); 263 set_table_entry(&table[7], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring);
276 set_table_entry(&table[8] , "imbalance_pct", &sd->imbalance_pct, sizeof(int) , 0644, proc_dointvec_minmax, false); 264 /* &table[8] is terminator */
277 set_table_entry(&table[9] , "cache_nice_tries", &sd->cache_nice_tries, sizeof(int) , 0644, proc_dointvec_minmax, false);
278 set_table_entry(&table[10], "flags", &sd->flags, sizeof(int) , 0644, proc_dointvec_minmax, false);
279 set_table_entry(&table[11], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax, false);
280 set_table_entry(&table[12], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring, false);
281 /* &table[13] is terminator */
282 265
283 return table; 266 return table;
284} 267}
@@ -656,8 +639,6 @@ do { \
656 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x)) 639 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x))
657 640
658 P(nr_running); 641 P(nr_running);
659 SEQ_printf(m, " .%-30s: %lu\n", "load",
660 rq->load.weight);
661 P(nr_switches); 642 P(nr_switches);
662 P(nr_load_updates); 643 P(nr_load_updates);
663 P(nr_uninterruptible); 644 P(nr_uninterruptible);
@@ -665,11 +646,6 @@ do { \
665 SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr))); 646 SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr)));
666 PN(clock); 647 PN(clock);
667 PN(clock_task); 648 PN(clock_task);
668 P(cpu_load[0]);
669 P(cpu_load[1]);
670 P(cpu_load[2]);
671 P(cpu_load[3]);
672 P(cpu_load[4]);
673#undef P 649#undef P
674#undef PN 650#undef PN
675 651
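With the load-index knobs gone, sd_alloc_ctl_domain_table() shrinks from 14 slots to 9, the last slot staying zeroed as the terminator (the "&table[8] is terminator" comment). The allocation convention — number of real entries plus one zeroed sentinel — is easy to get wrong, so here is a small model of it with a cut-down stand-in for struct ctl_table:

#include <stdio.h>
#include <stdlib.h>

/* Cut-down stand-in for struct ctl_table: a NULL procname marks the end. */
struct toy_ctl {
	const char *procname;
	int         value;
};

/* Allocate n zero-initialized slots; callers fill at most n-1 of them so the
 * final zeroed slot acts as the terminator, as in sd_alloc_ctl_entry(). */
static struct toy_ctl *alloc_table(int n)
{
	return calloc(n, sizeof(struct toy_ctl));
}

int main(void)
{
	/* 8 real entries + 1 terminator = 9, matching the new allocation size. */
	struct toy_ctl *table = alloc_table(9);
	const char *names[] = { "min_interval", "max_interval", "busy_factor",
				"imbalance_pct", "cache_nice_tries", "flags",
				"max_newidle_lb_cost", "name" };

	for (int i = 0; i < 8; i++)
		table[i] = (struct toy_ctl){ .procname = names[i], .value = i };

	for (struct toy_ctl *e = table; e->procname; e++)   /* walk until terminator */
		printf("%s\n", e->procname);

	free(table);
	return 0;
}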
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f35930f5e528..036be95a87e9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -275,6 +275,19 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
275 return grp->my_q; 275 return grp->my_q;
276} 276}
277 277
278static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
279{
280 if (!path)
281 return;
282
283 if (cfs_rq && task_group_is_autogroup(cfs_rq->tg))
284 autogroup_path(cfs_rq->tg, path, len);
285 else if (cfs_rq && cfs_rq->tg->css.cgroup)
286 cgroup_path(cfs_rq->tg->css.cgroup, path, len);
287 else
288 strlcpy(path, "(null)", len);
289}
290
278static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) 291static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
279{ 292{
280 struct rq *rq = rq_of(cfs_rq); 293 struct rq *rq = rq_of(cfs_rq);
@@ -449,6 +462,12 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
449 return NULL; 462 return NULL;
450} 463}
451 464
465static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
466{
467 if (path)
468 strlcpy(path, "(null)", len);
469}
470
452static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) 471static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
453{ 472{
454 return true; 473 return true;
@@ -764,7 +783,7 @@ void post_init_entity_util_avg(struct task_struct *p)
764 struct sched_entity *se = &p->se; 783 struct sched_entity *se = &p->se;
765 struct cfs_rq *cfs_rq = cfs_rq_of(se); 784 struct cfs_rq *cfs_rq = cfs_rq_of(se);
766 struct sched_avg *sa = &se->avg; 785 struct sched_avg *sa = &se->avg;
767 long cpu_scale = arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq))); 786 long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)));
768 long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2; 787 long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;
769 788
770 if (cap > 0) { 789 if (cap > 0) {
@@ -1466,9 +1485,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
1466 group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4; 1485 group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
1467} 1486}
1468 1487
1469static unsigned long weighted_cpuload(struct rq *rq); 1488static unsigned long cpu_runnable_load(struct rq *rq);
1470static unsigned long source_load(int cpu, int type);
1471static unsigned long target_load(int cpu, int type);
1472 1489
1473/* Cached statistics for all CPUs within a node */ 1490/* Cached statistics for all CPUs within a node */
1474struct numa_stats { 1491struct numa_stats {
@@ -1489,7 +1506,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
1489 for_each_cpu(cpu, cpumask_of_node(nid)) { 1506 for_each_cpu(cpu, cpumask_of_node(nid)) {
1490 struct rq *rq = cpu_rq(cpu); 1507 struct rq *rq = cpu_rq(cpu);
1491 1508
1492 ns->load += weighted_cpuload(rq); 1509 ns->load += cpu_runnable_load(rq);
1493 ns->compute_capacity += capacity_of(cpu); 1510 ns->compute_capacity += capacity_of(cpu);
1494 } 1511 }
1495 1512
@@ -1621,7 +1638,7 @@ static void task_numa_compare(struct task_numa_env *env,
1621 * be incurred if the tasks were swapped. 1638 * be incurred if the tasks were swapped.
1622 */ 1639 */
1623 /* Skip this swap candidate if cannot move to the source cpu */ 1640 /* Skip this swap candidate if cannot move to the source cpu */
1624 if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed)) 1641 if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr))
1625 goto unlock; 1642 goto unlock;
1626 1643
1627 /* 1644 /*
@@ -1718,7 +1735,7 @@ static void task_numa_find_cpu(struct task_numa_env *env,
1718 1735
1719 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) { 1736 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
1720 /* Skip this CPU if the source task cannot migrate */ 1737 /* Skip this CPU if the source task cannot migrate */
1721 if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed)) 1738 if (!cpumask_test_cpu(cpu, env->p->cpus_ptr))
1722 continue; 1739 continue;
1723 1740
1724 env->dst_cpu = cpu; 1741 env->dst_cpu = cpu;
@@ -2686,8 +2703,6 @@ static void
2686account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) 2703account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2687{ 2704{
2688 update_load_add(&cfs_rq->load, se->load.weight); 2705 update_load_add(&cfs_rq->load, se->load.weight);
2689 if (!parent_entity(se))
2690 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
2691#ifdef CONFIG_SMP 2706#ifdef CONFIG_SMP
2692 if (entity_is_task(se)) { 2707 if (entity_is_task(se)) {
2693 struct rq *rq = rq_of(cfs_rq); 2708 struct rq *rq = rq_of(cfs_rq);
@@ -2703,8 +2718,6 @@ static void
2703account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) 2718account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2704{ 2719{
2705 update_load_sub(&cfs_rq->load, se->load.weight); 2720 update_load_sub(&cfs_rq->load, se->load.weight);
2706 if (!parent_entity(se))
2707 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
2708#ifdef CONFIG_SMP 2721#ifdef CONFIG_SMP
2709 if (entity_is_task(se)) { 2722 if (entity_is_task(se)) {
2710 account_numa_dequeue(rq_of(cfs_rq), task_of(se)); 2723 account_numa_dequeue(rq_of(cfs_rq), task_of(se));
@@ -3334,6 +3347,9 @@ static inline int propagate_entity_load_avg(struct sched_entity *se)
3334 update_tg_cfs_util(cfs_rq, se, gcfs_rq); 3347 update_tg_cfs_util(cfs_rq, se, gcfs_rq);
3335 update_tg_cfs_runnable(cfs_rq, se, gcfs_rq); 3348 update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
3336 3349
3350 trace_pelt_cfs_tp(cfs_rq);
3351 trace_pelt_se_tp(se);
3352
3337 return 1; 3353 return 1;
3338} 3354}
3339 3355
@@ -3486,6 +3502,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
3486 add_tg_cfs_propagate(cfs_rq, se->avg.load_sum); 3502 add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
3487 3503
3488 cfs_rq_util_change(cfs_rq, flags); 3504 cfs_rq_util_change(cfs_rq, flags);
3505
3506 trace_pelt_cfs_tp(cfs_rq);
3489} 3507}
3490 3508
3491/** 3509/**
@@ -3505,6 +3523,8 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
3505 add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum); 3523 add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
3506 3524
3507 cfs_rq_util_change(cfs_rq, 0); 3525 cfs_rq_util_change(cfs_rq, 0);
3526
3527 trace_pelt_cfs_tp(cfs_rq);
3508} 3528}
3509 3529
3510/* 3530/*
@@ -4100,7 +4120,8 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
4100 * least twice that of our own weight (i.e. dont track it 4120 * least twice that of our own weight (i.e. dont track it
4101 * when there are only lesser-weight tasks around): 4121 * when there are only lesser-weight tasks around):
4102 */ 4122 */
4103 if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { 4123 if (schedstat_enabled() &&
4124 rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) {
4104 schedstat_set(se->statistics.slice_max, 4125 schedstat_set(se->statistics.slice_max,
4105 max((u64)schedstat_val(se->statistics.slice_max), 4126 max((u64)schedstat_val(se->statistics.slice_max),
4106 se->sum_exec_runtime - se->prev_sum_exec_runtime)); 4127 se->sum_exec_runtime - se->prev_sum_exec_runtime));
@@ -4734,6 +4755,11 @@ static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
4734 if (runtime_refresh_within(cfs_b, min_left)) 4755 if (runtime_refresh_within(cfs_b, min_left))
4735 return; 4756 return;
4736 4757
4758 /* don't push forwards an existing deferred unthrottle */
4759 if (cfs_b->slack_started)
4760 return;
4761 cfs_b->slack_started = true;
4762
4737 hrtimer_start(&cfs_b->slack_timer, 4763 hrtimer_start(&cfs_b->slack_timer,
4738 ns_to_ktime(cfs_bandwidth_slack_period), 4764 ns_to_ktime(cfs_bandwidth_slack_period),
4739 HRTIMER_MODE_REL); 4765 HRTIMER_MODE_REL);
@@ -4787,6 +4813,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
4787 4813
4788 /* confirm we're still not at a refresh boundary */ 4814 /* confirm we're still not at a refresh boundary */
4789 raw_spin_lock_irqsave(&cfs_b->lock, flags); 4815 raw_spin_lock_irqsave(&cfs_b->lock, flags);
4816 cfs_b->slack_started = false;
4790 if (cfs_b->distribute_running) { 4817 if (cfs_b->distribute_running) {
4791 raw_spin_unlock_irqrestore(&cfs_b->lock, flags); 4818 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
4792 return; 4819 return;
@@ -4950,6 +4977,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
4950 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 4977 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4951 cfs_b->slack_timer.function = sched_cfs_slack_timer; 4978 cfs_b->slack_timer.function = sched_cfs_slack_timer;
4952 cfs_b->distribute_running = 0; 4979 cfs_b->distribute_running = 0;
4980 cfs_b->slack_started = false;
4953} 4981}
4954 4982
4955static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) 4983static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
@@ -5153,8 +5181,10 @@ static inline bool cpu_overutilized(int cpu)
5153 5181
5154static inline void update_overutilized_status(struct rq *rq) 5182static inline void update_overutilized_status(struct rq *rq)
5155{ 5183{
5156 if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) 5184 if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) {
5157 WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED); 5185 WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
5186 trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED);
5187 }
5158} 5188}
5159#else 5189#else
5160static inline void update_overutilized_status(struct rq *rq) { } 5190static inline void update_overutilized_status(struct rq *rq) { }
@@ -5325,71 +5355,6 @@ DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
5325DEFINE_PER_CPU(cpumask_var_t, select_idle_mask); 5355DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
5326 5356
5327#ifdef CONFIG_NO_HZ_COMMON 5357#ifdef CONFIG_NO_HZ_COMMON
5328/*
5329 * per rq 'load' array crap; XXX kill this.
5330 */
5331
5332/*
5333 * The exact cpuload calculated at every tick would be:
5334 *
5335 * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load
5336 *
5337 * If a CPU misses updates for n ticks (as it was idle) and update gets
5338 * called on the n+1-th tick when CPU may be busy, then we have:
5339 *
5340 * load_n = (1 - 1/2^i)^n * load_0
5341 * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load
5342 *
5343 * decay_load_missed() below does efficient calculation of
5344 *
5345 * load' = (1 - 1/2^i)^n * load
5346 *
5347 * Because x^(n+m) := x^n * x^m we can decompose any x^n in power-of-2 factors.
5348 * This allows us to precompute the above in said factors, thereby allowing the
5349 * reduction of an arbitrary n in O(log_2 n) steps. (See also
5350 * fixed_power_int())
5351 *
5352 * The calculation is approximated on a 128 point scale.
5353 */
5354#define DEGRADE_SHIFT 7
5355
5356static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
5357static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
5358 { 0, 0, 0, 0, 0, 0, 0, 0 },
5359 { 64, 32, 8, 0, 0, 0, 0, 0 },
5360 { 96, 72, 40, 12, 1, 0, 0, 0 },
5361 { 112, 98, 75, 43, 15, 1, 0, 0 },
5362 { 120, 112, 98, 76, 45, 16, 2, 0 }
5363};
5364
5365/*
5366 * Update cpu_load for any missed ticks, due to tickless idle. The backlog
5367 * would be when CPU is idle and so we just decay the old load without
5368 * adding any new load.
5369 */
5370static unsigned long
5371decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
5372{
5373 int j = 0;
5374
5375 if (!missed_updates)
5376 return load;
5377
5378 if (missed_updates >= degrade_zero_ticks[idx])
5379 return 0;
5380
5381 if (idx == 1)
5382 return load >> missed_updates;
5383
5384 while (missed_updates) {
5385 if (missed_updates % 2)
5386 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
5387
5388 missed_updates >>= 1;
5389 j++;
5390 }
5391 return load;
5392}
5393 5358
5394static struct { 5359static struct {
5395 cpumask_var_t idle_cpus_mask; 5360 cpumask_var_t idle_cpus_mask;
@@ -5401,234 +5366,11 @@ static struct {
5401 5366
5402#endif /* CONFIG_NO_HZ_COMMON */ 5367#endif /* CONFIG_NO_HZ_COMMON */
5403 5368
5404/** 5369static unsigned long cpu_runnable_load(struct rq *rq)
5405 * __cpu_load_update - update the rq->cpu_load[] statistics
5406 * @this_rq: The rq to update statistics for
5407 * @this_load: The current load
5408 * @pending_updates: The number of missed updates
5409 *
5410 * Update rq->cpu_load[] statistics. This function is usually called every
5411 * scheduler tick (TICK_NSEC).
5412 *
5413 * This function computes a decaying average:
5414 *
5415 * load[i]' = (1 - 1/2^i) * load[i] + (1/2^i) * load
5416 *
5417 * Because of NOHZ it might not get called on every tick which gives need for
5418 * the @pending_updates argument.
5419 *
5420 * load[i]_n = (1 - 1/2^i) * load[i]_n-1 + (1/2^i) * load_n-1
5421 * = A * load[i]_n-1 + B ; A := (1 - 1/2^i), B := (1/2^i) * load
5422 * = A * (A * load[i]_n-2 + B) + B
5423 * = A * (A * (A * load[i]_n-3 + B) + B) + B
5424 * = A^3 * load[i]_n-3 + (A^2 + A + 1) * B
5425 * = A^n * load[i]_0 + (A^(n-1) + A^(n-2) + ... + 1) * B
5426 * = A^n * load[i]_0 + ((1 - A^n) / (1 - A)) * B
5427 * = (1 - 1/2^i)^n * (load[i]_0 - load) + load
5428 *
5429 * In the above we've assumed load_n := load, which is true for NOHZ_FULL as
5430 * any change in load would have resulted in the tick being turned back on.
5431 *
5432 * For regular NOHZ, this reduces to:
5433 *
5434 * load[i]_n = (1 - 1/2^i)^n * load[i]_0
5435 *
5436 * see decay_load_missed(). For NOHZ_FULL we get to subtract and add the extra
5437 * term.
5438 */
5439static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
5440 unsigned long pending_updates)
5441{
5442 unsigned long __maybe_unused tickless_load = this_rq->cpu_load[0];
5443 int i, scale;
5444
5445 this_rq->nr_load_updates++;
5446
5447 /* Update our load: */
5448 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
5449 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
5450 unsigned long old_load, new_load;
5451
5452 /* scale is effectively 1 << i now, and >> i divides by scale */
5453
5454 old_load = this_rq->cpu_load[i];
5455#ifdef CONFIG_NO_HZ_COMMON
5456 old_load = decay_load_missed(old_load, pending_updates - 1, i);
5457 if (tickless_load) {
5458 old_load -= decay_load_missed(tickless_load, pending_updates - 1, i);
5459 /*
5460 * old_load can never be a negative value because a
5461 * decayed tickless_load cannot be greater than the
5462 * original tickless_load.
5463 */
5464 old_load += tickless_load;
5465 }
5466#endif
5467 new_load = this_load;
5468 /*
5469 * Round up the averaging division if load is increasing. This
5470 * prevents us from getting stuck on 9 if the load is 10, for
5471 * example.
5472 */
5473 if (new_load > old_load)
5474 new_load += scale - 1;
5475
5476 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
5477 }
5478}
5479
5480/* Used instead of source_load when we know the type == 0 */
5481static unsigned long weighted_cpuload(struct rq *rq)
5482{ 5370{
5483 return cfs_rq_runnable_load_avg(&rq->cfs); 5371 return cfs_rq_runnable_load_avg(&rq->cfs);
5484} 5372}
5485 5373
5486#ifdef CONFIG_NO_HZ_COMMON
5487/*
5488 * There is no sane way to deal with nohz on smp when using jiffies because the
5489 * CPU doing the jiffies update might drift wrt the CPU doing the jiffy reading
5490 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
5491 *
5492 * Therefore we need to avoid the delta approach from the regular tick when
5493 * possible since that would seriously skew the load calculation. This is why we
5494 * use cpu_load_update_periodic() for CPUs out of nohz. However we'll rely on
5495 * jiffies deltas for updates happening while in nohz mode (idle ticks, idle
5496 * loop exit, nohz_idle_balance, nohz full exit...)
5497 *
5498 * This means we might still be one tick off for nohz periods.
5499 */
5500
5501static void cpu_load_update_nohz(struct rq *this_rq,
5502 unsigned long curr_jiffies,
5503 unsigned long load)
5504{
5505 unsigned long pending_updates;
5506
5507 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
5508 if (pending_updates) {
5509 this_rq->last_load_update_tick = curr_jiffies;
5510 /*
5511 * In the regular NOHZ case, we were idle, this means load 0.
5512 * In the NOHZ_FULL case, we were non-idle, we should consider
5513 * its weighted load.
5514 */
5515 cpu_load_update(this_rq, load, pending_updates);
5516 }
5517}
5518
5519/*
5520 * Called from nohz_idle_balance() to update the load ratings before doing the
5521 * idle balance.
5522 */
5523static void cpu_load_update_idle(struct rq *this_rq)
5524{
5525 /*
5526 * bail if there's load or we're actually up-to-date.
5527 */
5528 if (weighted_cpuload(this_rq))
5529 return;
5530
5531 cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0);
5532}
5533
5534/*
5535 * Record CPU load on nohz entry so we know the tickless load to account
5536 * on nohz exit. cpu_load[0] happens then to be updated more frequently
5537 * than other cpu_load[idx] but it should be fine as cpu_load readers
5538 * shouldn't rely into synchronized cpu_load[*] updates.
5539 */
5540void cpu_load_update_nohz_start(void)
5541{
5542 struct rq *this_rq = this_rq();
5543
5544 /*
5545 * This is all lockless but should be fine. If weighted_cpuload changes
5546 * concurrently we'll exit nohz. And cpu_load write can race with
5547 * cpu_load_update_idle() but both updater would be writing the same.
5548 */
5549 this_rq->cpu_load[0] = weighted_cpuload(this_rq);
5550}
5551
5552/*
5553 * Account the tickless load in the end of a nohz frame.
5554 */
5555void cpu_load_update_nohz_stop(void)
5556{
5557 unsigned long curr_jiffies = READ_ONCE(jiffies);
5558 struct rq *this_rq = this_rq();
5559 unsigned long load;
5560 struct rq_flags rf;
5561
5562 if (curr_jiffies == this_rq->last_load_update_tick)
5563 return;
5564
5565 load = weighted_cpuload(this_rq);
5566 rq_lock(this_rq, &rf);
5567 update_rq_clock(this_rq);
5568 cpu_load_update_nohz(this_rq, curr_jiffies, load);
5569 rq_unlock(this_rq, &rf);
5570}
5571#else /* !CONFIG_NO_HZ_COMMON */
5572static inline void cpu_load_update_nohz(struct rq *this_rq,
5573 unsigned long curr_jiffies,
5574 unsigned long load) { }
5575#endif /* CONFIG_NO_HZ_COMMON */
5576
5577static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load)
5578{
5579#ifdef CONFIG_NO_HZ_COMMON
5580 /* See the mess around cpu_load_update_nohz(). */
5581 this_rq->last_load_update_tick = READ_ONCE(jiffies);
5582#endif
5583 cpu_load_update(this_rq, load, 1);
5584}
5585
5586/*
5587 * Called from scheduler_tick()
5588 */
5589void cpu_load_update_active(struct rq *this_rq)
5590{
5591 unsigned long load = weighted_cpuload(this_rq);
5592
5593 if (tick_nohz_tick_stopped())
5594 cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
5595 else
5596 cpu_load_update_periodic(this_rq, load);
5597}
5598
5599/*
5600 * Return a low guess at the load of a migration-source CPU weighted
5601 * according to the scheduling class and "nice" value.
5602 *
5603 * We want to under-estimate the load of migration sources, to
5604 * balance conservatively.
5605 */
5606static unsigned long source_load(int cpu, int type)
5607{
5608 struct rq *rq = cpu_rq(cpu);
5609 unsigned long total = weighted_cpuload(rq);
5610
5611 if (type == 0 || !sched_feat(LB_BIAS))
5612 return total;
5613
5614 return min(rq->cpu_load[type-1], total);
5615}
5616
5617/*
5618 * Return a high guess at the load of a migration-target CPU weighted
5619 * according to the scheduling class and "nice" value.
5620 */
5621static unsigned long target_load(int cpu, int type)
5622{
5623 struct rq *rq = cpu_rq(cpu);
5624 unsigned long total = weighted_cpuload(rq);
5625
5626 if (type == 0 || !sched_feat(LB_BIAS))
5627 return total;
5628
5629 return max(rq->cpu_load[type-1], total);
5630}
5631
5632static unsigned long capacity_of(int cpu) 5374static unsigned long capacity_of(int cpu)
5633{ 5375{
5634 return cpu_rq(cpu)->cpu_capacity; 5376 return cpu_rq(cpu)->cpu_capacity;
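The large removal above takes out the per-rq cpu_load[] machinery together with source_load()/target_load(); the surviving consumer is the PELT-based cpu_runnable_load(). For reference, the decaying average the removed code maintained was load[i]' = (1 - 1/2^i) * load[i] + (1/2^i) * load, with decay_load_missed() fast-forwarding over ticks missed while idle. A direct floating-point rendition of that recurrence (the kernel used fixed-point degrade factors instead of a loop):

#include <stdio.h>

/* One tick of the removed cpu_load[] recurrence for index i:
 *   load' = (1 - 1/2^i) * load + (1/2^i) * cur_load                     */
static double cpu_load_step(double load, double cur_load, int i)
{
	double w = 1.0 / (double)(1UL << i);
	return (1.0 - w) * load + w * cur_load;
}

/* decay_load_missed() equivalent: n idle ticks contribute cur_load = 0,
 * so the old value simply decays by (1 - 1/2^i)^n.                      */
static double decay_missed(double load, int missed, int i)
{
	while (missed--)
		load = cpu_load_step(load, 0.0, i);
	return load;
}

int main(void)
{
	double load = 1000.0;

	/* Higher indexes track a slower-moving average. */
	printf("1 busy tick, idx 1: %.1f\n", cpu_load_step(load, 200.0, 1)); /* 600.0 */
	printf("1 busy tick, idx 2: %.1f\n", cpu_load_step(load, 200.0, 2)); /* 800.0 */

	/* 8 idle ticks at idx 1 decay 1000 down to 1000 * (1/2)^8. */
	printf("8 idle ticks, idx 1: %.2f\n", decay_missed(load, 8, 1));     /* ~3.91 */
	return 0;
}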
@@ -5638,7 +5380,7 @@ static unsigned long cpu_avg_load_per_task(int cpu)
5638{ 5380{
5639 struct rq *rq = cpu_rq(cpu); 5381 struct rq *rq = cpu_rq(cpu);
5640 unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running); 5382 unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
5641 unsigned long load_avg = weighted_cpuload(rq); 5383 unsigned long load_avg = cpu_runnable_load(rq);
5642 5384
5643 if (nr_running) 5385 if (nr_running)
5644 return load_avg / nr_running; 5386 return load_avg / nr_running;
@@ -5736,7 +5478,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
5736 s64 this_eff_load, prev_eff_load; 5478 s64 this_eff_load, prev_eff_load;
5737 unsigned long task_load; 5479 unsigned long task_load;
5738 5480
5739 this_eff_load = target_load(this_cpu, sd->wake_idx); 5481 this_eff_load = cpu_runnable_load(cpu_rq(this_cpu));
5740 5482
5741 if (sync) { 5483 if (sync) {
5742 unsigned long current_load = task_h_load(current); 5484 unsigned long current_load = task_h_load(current);
@@ -5754,7 +5496,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
5754 this_eff_load *= 100; 5496 this_eff_load *= 100;
5755 this_eff_load *= capacity_of(prev_cpu); 5497 this_eff_load *= capacity_of(prev_cpu);
5756 5498
5757 prev_eff_load = source_load(prev_cpu, sd->wake_idx); 5499 prev_eff_load = cpu_runnable_load(cpu_rq(prev_cpu));
5758 prev_eff_load -= task_load; 5500 prev_eff_load -= task_load;
5759 if (sched_feat(WA_BIAS)) 5501 if (sched_feat(WA_BIAS))
5760 prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2; 5502 prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
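wake_affine_weight() now feeds plain cpu_runnable_load() into both sides instead of the index-biased source_load()/target_load(). The comparison itself remains a cross-multiplication: each side's load is weighted by the other CPU's capacity and by an imbalance percentage, so no division is needed. A simplified sketch of that comparison with made-up numbers; the helper name and the specific inputs are invented for the example:

#include <stdio.h>
#include <stdbool.h>

/* Decide whether pulling the waking task to this_cpu looks cheaper than
 * leaving it on prev_cpu. Loads and capacities are plain numbers here;
 * imbalance_pct > 100 biases the decision toward staying put. */
static bool prefer_this_cpu(long this_load, long this_capacity,
			    long prev_load, long prev_capacity,
			    int imbalance_pct)
{
	/* this side pays full price, prev side gets the imbalance discount */
	long this_eff = this_load * 100 * prev_capacity;
	long prev_eff = prev_load * (100 + (imbalance_pct - 100) / 2) * this_capacity;

	return this_eff < prev_eff;
}

int main(void)
{
	/* Equal capacities: this CPU is clearly less loaded, so pull. */
	printf("%d\n", prefer_this_cpu(200, 1024, 600, 1024, 117));   /* 1 */

	/* This CPU is slightly busier; the margin keeps the task where it was. */
	printf("%d\n", prefer_this_cpu(560, 1024, 510, 1024, 117));   /* 0 */
	return 0;
}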
@@ -5815,14 +5557,10 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
5815 unsigned long this_runnable_load = ULONG_MAX; 5557 unsigned long this_runnable_load = ULONG_MAX;
5816 unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX; 5558 unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
5817 unsigned long most_spare = 0, this_spare = 0; 5559 unsigned long most_spare = 0, this_spare = 0;
5818 int load_idx = sd->forkexec_idx;
5819 int imbalance_scale = 100 + (sd->imbalance_pct-100)/2; 5560 int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
5820 unsigned long imbalance = scale_load_down(NICE_0_LOAD) * 5561 unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
5821 (sd->imbalance_pct-100) / 100; 5562 (sd->imbalance_pct-100) / 100;
5822 5563
5823 if (sd_flag & SD_BALANCE_WAKE)
5824 load_idx = sd->wake_idx;
5825
5826 do { 5564 do {
5827 unsigned long load, avg_load, runnable_load; 5565 unsigned long load, avg_load, runnable_load;
5828 unsigned long spare_cap, max_spare_cap; 5566 unsigned long spare_cap, max_spare_cap;
@@ -5831,7 +5569,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
5831 5569
5832 /* Skip over this group if it has no CPUs allowed */ 5570 /* Skip over this group if it has no CPUs allowed */
5833 if (!cpumask_intersects(sched_group_span(group), 5571 if (!cpumask_intersects(sched_group_span(group),
5834 &p->cpus_allowed)) 5572 p->cpus_ptr))
5835 continue; 5573 continue;
5836 5574
5837 local_group = cpumask_test_cpu(this_cpu, 5575 local_group = cpumask_test_cpu(this_cpu,
@@ -5846,12 +5584,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
5846 max_spare_cap = 0; 5584 max_spare_cap = 0;
5847 5585
5848 for_each_cpu(i, sched_group_span(group)) { 5586 for_each_cpu(i, sched_group_span(group)) {
5849 /* Bias balancing toward CPUs of our domain */ 5587 load = cpu_runnable_load(cpu_rq(i));
5850 if (local_group)
5851 load = source_load(i, load_idx);
5852 else
5853 load = target_load(i, load_idx);
5854
5855 runnable_load += load; 5588 runnable_load += load;
5856 5589
5857 avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs); 5590 avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
@@ -5963,7 +5696,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
5963 return cpumask_first(sched_group_span(group)); 5696 return cpumask_first(sched_group_span(group));
5964 5697
5965 /* Traverse only the allowed CPUs */ 5698 /* Traverse only the allowed CPUs */
5966 for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) { 5699 for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
5967 if (available_idle_cpu(i)) { 5700 if (available_idle_cpu(i)) {
5968 struct rq *rq = cpu_rq(i); 5701 struct rq *rq = cpu_rq(i);
5969 struct cpuidle_state *idle = idle_get_state(rq); 5702 struct cpuidle_state *idle = idle_get_state(rq);
@@ -5987,7 +5720,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
5987 shallowest_idle_cpu = i; 5720 shallowest_idle_cpu = i;
5988 } 5721 }
5989 } else if (shallowest_idle_cpu == -1) { 5722 } else if (shallowest_idle_cpu == -1) {
5990 load = weighted_cpuload(cpu_rq(i)); 5723 load = cpu_runnable_load(cpu_rq(i));
5991 if (load < min_load) { 5724 if (load < min_load) {
5992 min_load = load; 5725 min_load = load;
5993 least_loaded_cpu = i; 5726 least_loaded_cpu = i;
@@ -6003,7 +5736,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
6003{ 5736{
6004 int new_cpu = cpu; 5737 int new_cpu = cpu;
6005 5738
6006 if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed)) 5739 if (!cpumask_intersects(sched_domain_span(sd), p->cpus_ptr))
6007 return prev_cpu; 5740 return prev_cpu;
6008 5741
6009 /* 5742 /*
@@ -6120,7 +5853,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
6120 if (!test_idle_cores(target, false)) 5853 if (!test_idle_cores(target, false))
6121 return -1; 5854 return -1;
6122 5855
6123 cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed); 5856 cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
6124 5857
6125 for_each_cpu_wrap(core, cpus, target) { 5858 for_each_cpu_wrap(core, cpus, target) {
6126 bool idle = true; 5859 bool idle = true;
@@ -6154,7 +5887,7 @@ static int select_idle_smt(struct task_struct *p, int target)
6154 return -1; 5887 return -1;
6155 5888
6156 for_each_cpu(cpu, cpu_smt_mask(target)) { 5889 for_each_cpu(cpu, cpu_smt_mask(target)) {
6157 if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) 5890 if (!cpumask_test_cpu(cpu, p->cpus_ptr))
6158 continue; 5891 continue;
6159 if (available_idle_cpu(cpu)) 5892 if (available_idle_cpu(cpu))
6160 return cpu; 5893 return cpu;
@@ -6189,6 +5922,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
6189 u64 time, cost; 5922 u64 time, cost;
6190 s64 delta; 5923 s64 delta;
6191 int cpu, nr = INT_MAX; 5924 int cpu, nr = INT_MAX;
5925 int this = smp_processor_id();
6192 5926
6193 this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); 5927 this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
6194 if (!this_sd) 5928 if (!this_sd)
@@ -6212,18 +5946,18 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
6212 nr = 4; 5946 nr = 4;
6213 } 5947 }
6214 5948
6215 time = local_clock(); 5949 time = cpu_clock(this);
6216 5950
6217 for_each_cpu_wrap(cpu, sched_domain_span(sd), target) { 5951 for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
6218 if (!--nr) 5952 if (!--nr)
6219 return -1; 5953 return -1;
6220 if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) 5954 if (!cpumask_test_cpu(cpu, p->cpus_ptr))
6221 continue; 5955 continue;
6222 if (available_idle_cpu(cpu)) 5956 if (available_idle_cpu(cpu))
6223 break; 5957 break;
6224 } 5958 }
6225 5959
6226 time = local_clock() - time; 5960 time = cpu_clock(this) - time;
6227 cost = this_sd->avg_scan_cost; 5961 cost = this_sd->avg_scan_cost;
6228 delta = (s64)(time - cost) / 8; 5962 delta = (s64)(time - cost) / 8;
6229 this_sd->avg_scan_cost += delta; 5963 this_sd->avg_scan_cost += delta;
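The avg_scan_cost bookkeeping above is a plain exponentially weighted moving average with a 1/8 gain. A standalone sketch of the same update rule on made-up scan times:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	int64_t avg = 0;
	/* hypothetical per-wakeup scan times, in ns */
	int64_t samples[] = { 4000, 4800, 3600, 10000, 4200 };

	for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		int64_t delta = (samples[i] - avg) / 8;	/* same 1/8 gain as avg_scan_cost */
		avg += delta;
		printf("sample=%lld avg=%lld\n", (long long)samples[i], (long long)avg);
	}
	return 0;
}

The occasional expensive scan (10000 here) nudges the average up by only an eighth of the difference, which is what keeps the proportional scan cutoff stable.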
@@ -6254,7 +5988,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
6254 recent_used_cpu != target && 5988 recent_used_cpu != target &&
6255 cpus_share_cache(recent_used_cpu, target) && 5989 cpus_share_cache(recent_used_cpu, target) &&
6256 available_idle_cpu(recent_used_cpu) && 5990 available_idle_cpu(recent_used_cpu) &&
6257 cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) { 5991 cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) {
6258 /* 5992 /*
6259 * Replace recent_used_cpu with prev as it is a potential 5993 * Replace recent_used_cpu with prev as it is a potential
6260 * candidate for the next wake: 5994 * candidate for the next wake:
@@ -6498,11 +6232,21 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
6498static long 6232static long
6499compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) 6233compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
6500{ 6234{
6501 long util, max_util, sum_util, energy = 0; 6235 unsigned int max_util, util_cfs, cpu_util, cpu_cap;
6236 unsigned long sum_util, energy = 0;
6237 struct task_struct *tsk;
6502 int cpu; 6238 int cpu;
6503 6239
6504 for (; pd; pd = pd->next) { 6240 for (; pd; pd = pd->next) {
6241 struct cpumask *pd_mask = perf_domain_span(pd);
6242
6243 /*
6244 * The energy model mandates all the CPUs of a performance
6245 * domain have the same capacity.
6246 */
6247 cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
6505 max_util = sum_util = 0; 6248 max_util = sum_util = 0;
6249
6506 /* 6250 /*
6507 * The capacity state of CPUs of the current rd can be driven by 6251 * The capacity state of CPUs of the current rd can be driven by
6508 * CPUs of another rd if they belong to the same performance 6252 * CPUs of another rd if they belong to the same performance
@@ -6513,11 +6257,29 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
6513 * it will not appear in its pd list and will not be accounted 6257 * it will not appear in its pd list and will not be accounted
6514 * by compute_energy(). 6258 * by compute_energy().
6515 */ 6259 */
6516 for_each_cpu_and(cpu, perf_domain_span(pd), cpu_online_mask) { 6260 for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
6517 util = cpu_util_next(cpu, p, dst_cpu); 6261 util_cfs = cpu_util_next(cpu, p, dst_cpu);
6518 util = schedutil_energy_util(cpu, util); 6262
6519 max_util = max(util, max_util); 6263 /*
6520 sum_util += util; 6264 * Busy time computation: utilization clamping is not
6265 * required since the ratio (sum_util / cpu_capacity)
6266 * is already enough to scale the EM reported power
6267 * consumption at the (eventually clamped) cpu_capacity.
6268 */
6269 sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap,
6270 ENERGY_UTIL, NULL);
6271
6272 /*
6273 * Performance domain frequency: utilization clamping
6274 * must be considered since it affects the selection
6275 * of the performance domain frequency.
6276 * NOTE: in case RT tasks are running, by default the
6277 * FREQUENCY_UTIL's utilization can be max OPP.
6278 */
6279 tsk = cpu == dst_cpu ? p : NULL;
6280 cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap,
6281 FREQUENCY_UTIL, tsk);
6282 max_util = max(max_util, cpu_util);
6521 } 6283 }
6522 6284
6523 energy += em_pd_energy(pd->em_pd, max_util, sum_util); 6285 energy += em_pd_energy(pd->em_pd, max_util, sum_util);
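Each performance domain now contributes two aggregates: sum_util (total busy time, which scales energy linearly) and max_util (the largest single request, which selects the OPP for the whole domain). A toy standalone sketch of that split (userspace C; the three-entry cost table and the simplified em_pd_energy()-style arithmetic are assumptions for illustration only):

#include <stdio.h>

struct perf_state { unsigned long cap; unsigned long cost; };

/* hypothetical 3-OPP domain; numbers are illustrative only */
static const struct perf_state table[] = {
	{ 256, 50 }, { 512, 130 }, { 1024, 400 },
};

static unsigned long toy_pd_energy(unsigned long max_util, unsigned long sum_util)
{
	const struct perf_state *ps = &table[2];

	/* pick the lowest OPP able to serve the biggest single request */
	for (int i = 0; i < 3; i++) {
		if (table[i].cap >= max_util) {
			ps = &table[i];
			break;
		}
	}
	/* energy grows with total busy time at the chosen OPP's cost */
	return ps->cost * sum_util / ps->cap;
}

int main(void)
{
	printf("energy=%lu\n", toy_pd_energy(300, 700));
	return 0;
}

This is why FREQUENCY_UTIL (clamped, drives max_util) and ENERGY_UTIL (unclamped, drives sum_util) are computed separately in the loop above.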
@@ -6600,7 +6362,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
6600 int max_spare_cap_cpu = -1; 6362 int max_spare_cap_cpu = -1;
6601 6363
6602 for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) { 6364 for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
6603 if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) 6365 if (!cpumask_test_cpu(cpu, p->cpus_ptr))
6604 continue; 6366 continue;
6605 6367
6606 /* Skip CPUs that will be overutilized. */ 6368 /* Skip CPUs that will be overutilized. */
@@ -6689,7 +6451,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
6689 } 6451 }
6690 6452
6691 want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) && 6453 want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) &&
6692 cpumask_test_cpu(cpu, &p->cpus_allowed); 6454 cpumask_test_cpu(cpu, p->cpus_ptr);
6693 } 6455 }
6694 6456
6695 rcu_read_lock(); 6457 rcu_read_lock();
@@ -7445,14 +7207,14 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
7445 /* 7207 /*
7446 * We do not migrate tasks that are: 7208 * We do not migrate tasks that are:
7447 * 1) throttled_lb_pair, or 7209 * 1) throttled_lb_pair, or
7448 * 2) cannot be migrated to this CPU due to cpus_allowed, or 7210 * 2) cannot be migrated to this CPU due to cpus_ptr, or
7449 * 3) running (obviously), or 7211 * 3) running (obviously), or
7450 * 4) are cache-hot on their current CPU. 7212 * 4) are cache-hot on their current CPU.
7451 */ 7213 */
7452 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) 7214 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
7453 return 0; 7215 return 0;
7454 7216
7455 if (!cpumask_test_cpu(env->dst_cpu, &p->cpus_allowed)) { 7217 if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
7456 int cpu; 7218 int cpu;
7457 7219
7458 schedstat_inc(p->se.statistics.nr_failed_migrations_affine); 7220 schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
@@ -7472,7 +7234,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
7472 7234
7473 /* Prevent to re-select dst_cpu via env's CPUs: */ 7235 /* Prevent to re-select dst_cpu via env's CPUs: */
7474 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { 7236 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
7475 if (cpumask_test_cpu(cpu, &p->cpus_allowed)) { 7237 if (cpumask_test_cpu(cpu, p->cpus_ptr)) {
7476 env->flags |= LBF_DST_PINNED; 7238 env->flags |= LBF_DST_PINNED;
7477 env->new_dst_cpu = cpu; 7239 env->new_dst_cpu = cpu;
7478 break; 7240 break;
@@ -7558,7 +7320,7 @@ static struct task_struct *detach_one_task(struct lb_env *env)
7558static const unsigned int sched_nr_migrate_break = 32; 7320static const unsigned int sched_nr_migrate_break = 32;
7559 7321
7560/* 7322/*
7561 * detach_tasks() -- tries to detach up to imbalance weighted load from 7323 * detach_tasks() -- tries to detach up to imbalance runnable load from
7562 * busiest_rq, as part of a balancing operation within domain "sd". 7324 * busiest_rq, as part of a balancing operation within domain "sd".
7563 * 7325 *
7564 * Returns number of detached tasks if successful and 0 otherwise. 7326 * Returns number of detached tasks if successful and 0 otherwise.
@@ -7626,7 +7388,7 @@ static int detach_tasks(struct lb_env *env)
7626 7388
7627 /* 7389 /*
7628 * We only want to steal up to the prescribed amount of 7390 * We only want to steal up to the prescribed amount of
7629 * weighted load. 7391 * runnable load.
7630 */ 7392 */
7631 if (env->imbalance <= 0) 7393 if (env->imbalance <= 0)
7632 break; 7394 break;
@@ -7695,6 +7457,7 @@ static void attach_tasks(struct lb_env *env)
7695 rq_unlock(env->dst_rq, &rf); 7457 rq_unlock(env->dst_rq, &rf);
7696} 7458}
7697 7459
7460#ifdef CONFIG_NO_HZ_COMMON
7698static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) 7461static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
7699{ 7462{
7700 if (cfs_rq->avg.load_avg) 7463 if (cfs_rq->avg.load_avg)
@@ -7722,6 +7485,19 @@ static inline bool others_have_blocked(struct rq *rq)
7722 return false; 7485 return false;
7723} 7486}
7724 7487
7488static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
7489{
7490 rq->last_blocked_load_update_tick = jiffies;
7491
7492 if (!has_blocked)
7493 rq->has_blocked_load = 0;
7494}
7495#else
7496static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; }
7497static inline bool others_have_blocked(struct rq *rq) { return false; }
7498static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {}
7499#endif
7500
7725#ifdef CONFIG_FAIR_GROUP_SCHED 7501#ifdef CONFIG_FAIR_GROUP_SCHED
7726 7502
7727static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) 7503static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
@@ -7787,11 +7563,7 @@ static void update_blocked_averages(int cpu)
7787 if (others_have_blocked(rq)) 7563 if (others_have_blocked(rq))
7788 done = false; 7564 done = false;
7789 7565
7790#ifdef CONFIG_NO_HZ_COMMON 7566 update_blocked_load_status(rq, !done);
7791 rq->last_blocked_load_update_tick = jiffies;
7792 if (done)
7793 rq->has_blocked_load = 0;
7794#endif
7795 rq_unlock_irqrestore(rq, &rf); 7567 rq_unlock_irqrestore(rq, &rf);
7796} 7568}
7797 7569
@@ -7857,11 +7629,7 @@ static inline void update_blocked_averages(int cpu)
7857 update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class); 7629 update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
7858 update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class); 7630 update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
7859 update_irq_load_avg(rq, 0); 7631 update_irq_load_avg(rq, 0);
7860#ifdef CONFIG_NO_HZ_COMMON 7632 update_blocked_load_status(rq, cfs_rq_has_blocked(cfs_rq) || others_have_blocked(rq));
7861 rq->last_blocked_load_update_tick = jiffies;
7862 if (!cfs_rq_has_blocked(cfs_rq) && !others_have_blocked(rq))
7863 rq->has_blocked_load = 0;
7864#endif
7865 rq_unlock_irqrestore(rq, &rf); 7633 rq_unlock_irqrestore(rq, &rf);
7866} 7634}
7867 7635
@@ -7879,7 +7647,6 @@ static unsigned long task_h_load(struct task_struct *p)
7879struct sg_lb_stats { 7647struct sg_lb_stats {
7880 unsigned long avg_load; /*Avg load across the CPUs of the group */ 7648 unsigned long avg_load; /*Avg load across the CPUs of the group */
7881 unsigned long group_load; /* Total load over the CPUs of the group */ 7649 unsigned long group_load; /* Total load over the CPUs of the group */
7882 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
7883 unsigned long load_per_task; 7650 unsigned long load_per_task;
7884 unsigned long group_capacity; 7651 unsigned long group_capacity;
7885 unsigned long group_util; /* Total utilization of the group */ 7652 unsigned long group_util; /* Total utilization of the group */
@@ -7933,38 +7700,10 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
7933 }; 7700 };
7934} 7701}
7935 7702
7936/**
7937 * get_sd_load_idx - Obtain the load index for a given sched domain.
7938 * @sd: The sched_domain whose load_idx is to be obtained.
7939 * @idle: The idle status of the CPU for whose sd load_idx is obtained.
7940 *
7941 * Return: The load index.
7942 */
7943static inline int get_sd_load_idx(struct sched_domain *sd,
7944 enum cpu_idle_type idle)
7945{
7946 int load_idx;
7947
7948 switch (idle) {
7949 case CPU_NOT_IDLE:
7950 load_idx = sd->busy_idx;
7951 break;
7952
7953 case CPU_NEWLY_IDLE:
7954 load_idx = sd->newidle_idx;
7955 break;
7956 default:
7957 load_idx = sd->idle_idx;
7958 break;
7959 }
7960
7961 return load_idx;
7962}
7963
7964static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu) 7703static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu)
7965{ 7704{
7966 struct rq *rq = cpu_rq(cpu); 7705 struct rq *rq = cpu_rq(cpu);
7967 unsigned long max = arch_scale_cpu_capacity(sd, cpu); 7706 unsigned long max = arch_scale_cpu_capacity(cpu);
7968 unsigned long used, free; 7707 unsigned long used, free;
7969 unsigned long irq; 7708 unsigned long irq;
7970 7709
@@ -7989,7 +7728,7 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
7989 unsigned long capacity = scale_rt_capacity(sd, cpu); 7728 unsigned long capacity = scale_rt_capacity(sd, cpu);
7990 struct sched_group *sdg = sd->groups; 7729 struct sched_group *sdg = sd->groups;
7991 7730
7992 cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(sd, cpu); 7731 cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu);
7993 7732
7994 if (!capacity) 7733 if (!capacity)
7995 capacity = 1; 7734 capacity = 1;
@@ -8099,7 +7838,7 @@ static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
8099 7838
8100/* 7839/*
8101 * Group imbalance indicates (and tries to solve) the problem where balancing 7840 * Group imbalance indicates (and tries to solve) the problem where balancing
8102 * groups is inadequate due to ->cpus_allowed constraints. 7841 * groups is inadequate due to ->cpus_ptr constraints.
8103 * 7842 *
8104 * Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a 7843 * Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a
8105 * cpumask covering 1 CPU of the first group and 3 CPUs of the second group. 7844 * cpumask covering 1 CPU of the first group and 3 CPUs of the second group.
@@ -8249,9 +7988,6 @@ static inline void update_sg_lb_stats(struct lb_env *env,
8249 struct sg_lb_stats *sgs, 7988 struct sg_lb_stats *sgs,
8250 int *sg_status) 7989 int *sg_status)
8251{ 7990{
8252 int local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group));
8253 int load_idx = get_sd_load_idx(env->sd, env->idle);
8254 unsigned long load;
8255 int i, nr_running; 7991 int i, nr_running;
8256 7992
8257 memset(sgs, 0, sizeof(*sgs)); 7993 memset(sgs, 0, sizeof(*sgs));
@@ -8262,13 +7998,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
8262 if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false)) 7998 if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false))
8263 env->flags |= LBF_NOHZ_AGAIN; 7999 env->flags |= LBF_NOHZ_AGAIN;
8264 8000
8265 /* Bias balancing toward CPUs of our domain: */ 8001 sgs->group_load += cpu_runnable_load(rq);
8266 if (local_group)
8267 load = target_load(i, load_idx);
8268 else
8269 load = source_load(i, load_idx);
8270
8271 sgs->group_load += load;
8272 sgs->group_util += cpu_util(i); 8002 sgs->group_util += cpu_util(i);
8273 sgs->sum_nr_running += rq->cfs.h_nr_running; 8003 sgs->sum_nr_running += rq->cfs.h_nr_running;
8274 8004
@@ -8283,7 +8013,6 @@ static inline void update_sg_lb_stats(struct lb_env *env,
8283 sgs->nr_numa_running += rq->nr_numa_running; 8013 sgs->nr_numa_running += rq->nr_numa_running;
8284 sgs->nr_preferred_running += rq->nr_preferred_running; 8014 sgs->nr_preferred_running += rq->nr_preferred_running;
8285#endif 8015#endif
8286 sgs->sum_weighted_load += weighted_cpuload(rq);
8287 /* 8016 /*
8288 * No need to call idle_cpu() if nr_running is not 0 8017 * No need to call idle_cpu() if nr_running is not 0
8289 */ 8018 */
@@ -8302,7 +8031,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
8302 sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity; 8031 sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
8303 8032
8304 if (sgs->sum_nr_running) 8033 if (sgs->sum_nr_running)
8305 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; 8034 sgs->load_per_task = sgs->group_load / sgs->sum_nr_running;
8306 8035
8307 sgs->group_weight = group->group_weight; 8036 sgs->group_weight = group->group_weight;
8308 8037
@@ -8516,8 +8245,12 @@ next_group:
8516 8245
8517 /* Update over-utilization (tipping point, U >= 0) indicator */ 8246 /* Update over-utilization (tipping point, U >= 0) indicator */
8518 WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED); 8247 WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED);
8248 trace_sched_overutilized_tp(rd, sg_status & SG_OVERUTILIZED);
8519 } else if (sg_status & SG_OVERUTILIZED) { 8249 } else if (sg_status & SG_OVERUTILIZED) {
8520 WRITE_ONCE(env->dst_rq->rd->overutilized, SG_OVERUTILIZED); 8250 struct root_domain *rd = env->dst_rq->rd;
8251
8252 WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED);
8253 trace_sched_overutilized_tp(rd, SG_OVERUTILIZED);
8521 } 8254 }
8522} 8255}
8523 8256
@@ -8723,7 +8456,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
8723 * find_busiest_group - Returns the busiest group within the sched_domain 8456 * find_busiest_group - Returns the busiest group within the sched_domain
8724 * if there is an imbalance. 8457 * if there is an imbalance.
8725 * 8458 *
8726 * Also calculates the amount of weighted load which should be moved 8459 * Also calculates the amount of runnable load which should be moved
8727 * to restore balance. 8460 * to restore balance.
8728 * 8461 *
8729 * @env: The load balancing environment. 8462 * @env: The load balancing environment.
@@ -8768,7 +8501,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
8768 /* 8501 /*
8769 * If the busiest group is imbalanced the below checks don't 8502 * If the busiest group is imbalanced the below checks don't
8770 * work because they assume all things are equal, which typically 8503 * work because they assume all things are equal, which typically
8771 * isn't true due to cpus_allowed constraints and the like. 8504 * isn't true due to cpus_ptr constraints and the like.
8772 */ 8505 */
8773 if (busiest->group_type == group_imbalanced) 8506 if (busiest->group_type == group_imbalanced)
8774 goto force_balance; 8507 goto force_balance;
@@ -8842,7 +8575,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
8842 int i; 8575 int i;
8843 8576
8844 for_each_cpu_and(i, sched_group_span(group), env->cpus) { 8577 for_each_cpu_and(i, sched_group_span(group), env->cpus) {
8845 unsigned long capacity, wl; 8578 unsigned long capacity, load;
8846 enum fbq_type rt; 8579 enum fbq_type rt;
8847 8580
8848 rq = cpu_rq(i); 8581 rq = cpu_rq(i);
@@ -8896,30 +8629,30 @@ static struct rq *find_busiest_queue(struct lb_env *env,
8896 rq->nr_running == 1) 8629 rq->nr_running == 1)
8897 continue; 8630 continue;
8898 8631
8899 wl = weighted_cpuload(rq); 8632 load = cpu_runnable_load(rq);
8900 8633
8901 /* 8634 /*
8902 * When comparing with imbalance, use weighted_cpuload() 8635 * When comparing with imbalance, use cpu_runnable_load()
8903 * which is not scaled with the CPU capacity. 8636 * which is not scaled with the CPU capacity.
8904 */ 8637 */
8905 8638
8906 if (rq->nr_running == 1 && wl > env->imbalance && 8639 if (rq->nr_running == 1 && load > env->imbalance &&
8907 !check_cpu_capacity(rq, env->sd)) 8640 !check_cpu_capacity(rq, env->sd))
8908 continue; 8641 continue;
8909 8642
8910 /* 8643 /*
8911 * For the load comparisons with the other CPU's, consider 8644 * For the load comparisons with the other CPU's, consider
8912 * the weighted_cpuload() scaled with the CPU capacity, so 8645 * the cpu_runnable_load() scaled with the CPU capacity, so
8913 * that the load can be moved away from the CPU that is 8646 * that the load can be moved away from the CPU that is
8914 * potentially running at a lower capacity. 8647 * potentially running at a lower capacity.
8915 * 8648 *
8916 * Thus we're looking for max(wl_i / capacity_i), crosswise 8649 * Thus we're looking for max(load_i / capacity_i), crosswise
8917 * multiplication to rid ourselves of the division works out 8650 * multiplication to rid ourselves of the division works out
8918 * to: wl_i * capacity_j > wl_j * capacity_i; where j is 8651 * to: load_i * capacity_j > load_j * capacity_i; where j is
8919 * our previous maximum. 8652 * our previous maximum.
8920 */ 8653 */
8921 if (wl * busiest_capacity > busiest_load * capacity) { 8654 if (load * busiest_capacity > busiest_load * capacity) {
8922 busiest_load = wl; 8655 busiest_load = load;
8923 busiest_capacity = capacity; 8656 busiest_capacity = capacity;
8924 busiest = rq; 8657 busiest = rq;
8925 } 8658 }
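The crosswise multiplication described in the comment above is the usual way to compare two ratios in integer arithmetic without dividing. A standalone check with made-up loads and capacities:

#include <stdio.h>

int main(void)
{
	/* hypothetical candidate vs. current busiest */
	unsigned long load_i = 900,  capacity_i = 512;	/* small CPU */
	unsigned long load_j = 1200, capacity_j = 1024;	/* big CPU */

	/* load_i/capacity_i > load_j/capacity_j  <=>  load_i*capacity_j > load_j*capacity_i */
	if (load_i * capacity_j > load_j * capacity_i)
		printf("CPU i is relatively busier (%lu > %lu)\n",
		       load_i * capacity_j, load_j * capacity_i);
	return 0;
}

Here 900/512 (~1.76) beats 1200/1024 (~1.17), so the lower absolute load still wins once scaled by capacity, without any division or truncation loss.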
@@ -9210,7 +8943,7 @@ more_balance:
9210 * if the curr task on busiest CPU can't be 8943 * if the curr task on busiest CPU can't be
9211 * moved to this_cpu: 8944 * moved to this_cpu:
9212 */ 8945 */
9213 if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) { 8946 if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) {
9214 raw_spin_unlock_irqrestore(&busiest->lock, 8947 raw_spin_unlock_irqrestore(&busiest->lock,
9215 flags); 8948 flags);
9216 env.flags |= LBF_ALL_PINNED; 8949 env.flags |= LBF_ALL_PINNED;
@@ -9879,7 +9612,6 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
9879 9612
9880 rq_lock_irqsave(rq, &rf); 9613 rq_lock_irqsave(rq, &rf);
9881 update_rq_clock(rq); 9614 update_rq_clock(rq);
9882 cpu_load_update_idle(rq);
9883 rq_unlock_irqrestore(rq, &rf); 9615 rq_unlock_irqrestore(rq, &rf);
9884 9616
9885 if (flags & NOHZ_BALANCE_KICK) 9617 if (flags & NOHZ_BALANCE_KICK)
@@ -10690,6 +10422,10 @@ const struct sched_class fair_sched_class = {
10690#ifdef CONFIG_FAIR_GROUP_SCHED 10422#ifdef CONFIG_FAIR_GROUP_SCHED
10691 .task_change_group = task_change_group_fair, 10423 .task_change_group = task_change_group_fair,
10692#endif 10424#endif
10425
10426#ifdef CONFIG_UCLAMP_TASK
10427 .uclamp_enabled = 1,
10428#endif
10693}; 10429};
10694 10430
10695#ifdef CONFIG_SCHED_DEBUG 10431#ifdef CONFIG_SCHED_DEBUG
@@ -10737,3 +10473,83 @@ __init void init_sched_fair_class(void)
10737#endif /* SMP */ 10473#endif /* SMP */
10738 10474
10739} 10475}
10476
10477/*
10478 * Helper functions to facilitate extracting info from tracepoints.
10479 */
10480
10481const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq)
10482{
10483#ifdef CONFIG_SMP
10484 return cfs_rq ? &cfs_rq->avg : NULL;
10485#else
10486 return NULL;
10487#endif
10488}
10489EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_avg);
10490
10491char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len)
10492{
10493 if (!cfs_rq) {
10494 if (str)
10495 strlcpy(str, "(null)", len);
10496 else
10497 return NULL;
10498 }
10499
10500 cfs_rq_tg_path(cfs_rq, str, len);
10501 return str;
10502}
10503EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_path);
10504
10505int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq)
10506{
10507 return cfs_rq ? cpu_of(rq_of(cfs_rq)) : -1;
10508}
10509EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_cpu);
10510
10511const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq)
10512{
10513#ifdef CONFIG_SMP
10514 return rq ? &rq->avg_rt : NULL;
10515#else
10516 return NULL;
10517#endif
10518}
10519EXPORT_SYMBOL_GPL(sched_trace_rq_avg_rt);
10520
10521const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq)
10522{
10523#ifdef CONFIG_SMP
10524 return rq ? &rq->avg_dl : NULL;
10525#else
10526 return NULL;
10527#endif
10528}
10529EXPORT_SYMBOL_GPL(sched_trace_rq_avg_dl);
10530
10531const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq)
10532{
10533#if defined(CONFIG_SMP) && defined(CONFIG_HAVE_SCHED_AVG_IRQ)
10534 return rq ? &rq->avg_irq : NULL;
10535#else
10536 return NULL;
10537#endif
10538}
10539EXPORT_SYMBOL_GPL(sched_trace_rq_avg_irq);
10540
10541int sched_trace_rq_cpu(struct rq *rq)
10542{
10543 return rq ? cpu_of(rq) : -1;
10544}
10545EXPORT_SYMBOL_GPL(sched_trace_rq_cpu);
10546
10547const struct cpumask *sched_trace_rd_span(struct root_domain *rd)
10548{
10549#ifdef CONFIG_SMP
10550 return rd ? rd->span : NULL;
10551#else
10552 return NULL;
10553#endif
10554}
10555EXPORT_SYMBOL_GPL(sched_trace_rd_span);
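These exported helpers let out-of-tree probes attached to the new bare PELT tracepoints (added in pelt.c below) read scheduler state without depending on private struct layouts. A hypothetical module sketch, assuming the pelt_cfs_tp tracepoint is exported to modules and that the register_trace_pelt_cfs_tp() helper generated by DECLARE_TRACE is available; the probe name and output format are made up:

#include <linux/module.h>
#include <linux/sched.h>
#include <trace/events/sched.h>

/* Hypothetical probe: dump the CFS util_avg of the runqueue the tracepoint fired on. */
static void probe_pelt_cfs(void *data, struct cfs_rq *cfs_rq)
{
	const struct sched_avg *avg = sched_trace_cfs_rq_avg(cfs_rq);
	char path[64];

	if (!avg)
		return;
	pr_info("cpu%d %s util_avg=%lu\n",
		sched_trace_cfs_rq_cpu(cfs_rq),
		sched_trace_cfs_rq_path(cfs_rq, path, sizeof(path)),
		avg->util_avg);
}

static int __init pelt_probe_init(void)
{
	return register_trace_pelt_cfs_tp(probe_pelt_cfs, NULL);
}

static void __exit pelt_probe_exit(void)
{
	unregister_trace_pelt_cfs_tp(probe_pelt_cfs, NULL);
	tracepoint_synchronize_unregister();
}

module_init(pelt_probe_init);
module_exit(pelt_probe_exit);
MODULE_LICENSE("GPL");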
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 858589b83377..2410db5e9a35 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -39,7 +39,6 @@ SCHED_FEAT(WAKEUP_PREEMPTION, true)
39 39
40SCHED_FEAT(HRTICK, false) 40SCHED_FEAT(HRTICK, false)
41SCHED_FEAT(DOUBLE_TICK, false) 41SCHED_FEAT(DOUBLE_TICK, false)
42SCHED_FEAT(LB_BIAS, false)
43 42
44/* 43/*
45 * Decrement CPU capacity based on time not spent running tasks 44 * Decrement CPU capacity based on time not spent running tasks
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index f5516bae0c1b..80940939b733 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * Generic entry points for the idle threads and 3 * Generic entry points for the idle threads and
3 * implementation of the idle task scheduling class. 4 * implementation of the idle task scheduling class.
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 687302051a27..123ea07a3f3b 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * Housekeeping management. Manage the targets for routine code that can run on 3 * Housekeeping management. Manage the targets for routine code that can run on
3 * any CPU: unbound workqueues, timers, kthreads and any offloadable work. 4 * any CPU: unbound workqueues, timers, kthreads and any offloadable work.
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index 3cd8a3a795d2..aa8d75804108 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -1,17 +1,8 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
1/* 2/*
2 * Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com> 3 * Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
3 * 4 *
4 * membarrier system call 5 * membarrier system call
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 */ 6 */
16#include "sched.h" 7#include "sched.h"
17 8
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index befce29bd882..a96db50d40e0 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -28,6 +28,8 @@
28#include "sched.h" 28#include "sched.h"
29#include "pelt.h" 29#include "pelt.h"
30 30
31#include <trace/events/sched.h>
32
31/* 33/*
32 * Approximate: 34 * Approximate:
33 * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) 35 * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
@@ -265,6 +267,7 @@ int __update_load_avg_blocked_se(u64 now, struct sched_entity *se)
265{ 267{
266 if (___update_load_sum(now, &se->avg, 0, 0, 0)) { 268 if (___update_load_sum(now, &se->avg, 0, 0, 0)) {
267 ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); 269 ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
270 trace_pelt_se_tp(se);
268 return 1; 271 return 1;
269 } 272 }
270 273
@@ -278,6 +281,7 @@ int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se
278 281
279 ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); 282 ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
280 cfs_se_util_change(&se->avg); 283 cfs_se_util_change(&se->avg);
284 trace_pelt_se_tp(se);
281 return 1; 285 return 1;
282 } 286 }
283 287
@@ -292,6 +296,7 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq)
292 cfs_rq->curr != NULL)) { 296 cfs_rq->curr != NULL)) {
293 297
294 ___update_load_avg(&cfs_rq->avg, 1, 1); 298 ___update_load_avg(&cfs_rq->avg, 1, 1);
299 trace_pelt_cfs_tp(cfs_rq);
295 return 1; 300 return 1;
296 } 301 }
297 302
@@ -317,6 +322,7 @@ int update_rt_rq_load_avg(u64 now, struct rq *rq, int running)
317 running)) { 322 running)) {
318 323
319 ___update_load_avg(&rq->avg_rt, 1, 1); 324 ___update_load_avg(&rq->avg_rt, 1, 1);
325 trace_pelt_rt_tp(rq);
320 return 1; 326 return 1;
321 } 327 }
322 328
@@ -340,6 +346,7 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
340 running)) { 346 running)) {
341 347
342 ___update_load_avg(&rq->avg_dl, 1, 1); 348 ___update_load_avg(&rq->avg_dl, 1, 1);
349 trace_pelt_dl_tp(rq);
343 return 1; 350 return 1;
344 } 351 }
345 352
@@ -366,7 +373,7 @@ int update_irq_load_avg(struct rq *rq, u64 running)
366 * reflect the real amount of computation 373 * reflect the real amount of computation
367 */ 374 */
368 running = cap_scale(running, arch_scale_freq_capacity(cpu_of(rq))); 375 running = cap_scale(running, arch_scale_freq_capacity(cpu_of(rq)));
369 running = cap_scale(running, arch_scale_cpu_capacity(NULL, cpu_of(rq))); 376 running = cap_scale(running, arch_scale_cpu_capacity(cpu_of(rq)));
370 377
371 /* 378 /*
372 * We know the time that has been used by interrupt since last update 379 * We know the time that has been used by interrupt since last update
@@ -388,8 +395,10 @@ int update_irq_load_avg(struct rq *rq, u64 running)
388 1, 395 1,
389 1); 396 1);
390 397
391 if (ret) 398 if (ret) {
392 ___update_load_avg(&rq->avg_irq, 1, 1); 399 ___update_load_avg(&rq->avg_irq, 1, 1);
400 trace_pelt_irq_tp(rq);
401 }
393 402
394 return ret; 403 return ret;
395} 404}
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index 7489d5f56960..afff644da065 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -79,7 +79,7 @@ static inline void update_rq_clock_pelt(struct rq *rq, s64 delta)
79 * Scale the elapsed time to reflect the real amount of 79 * Scale the elapsed time to reflect the real amount of
80 * computation 80 * computation
81 */ 81 */
82 delta = cap_scale(delta, arch_scale_cpu_capacity(NULL, cpu_of(rq))); 82 delta = cap_scale(delta, arch_scale_cpu_capacity(cpu_of(rq)));
83 delta = cap_scale(delta, arch_scale_freq_capacity(cpu_of(rq))); 83 delta = cap_scale(delta, arch_scale_freq_capacity(cpu_of(rq)));
84 84
85 rq->clock_pelt += delta; 85 rq->clock_pelt += delta;
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 0e97ca9306ef..7acc632c3b82 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -4,6 +4,9 @@
4 * Copyright (c) 2018 Facebook, Inc. 4 * Copyright (c) 2018 Facebook, Inc.
5 * Author: Johannes Weiner <hannes@cmpxchg.org> 5 * Author: Johannes Weiner <hannes@cmpxchg.org>
6 * 6 *
7 * Polling support by Suren Baghdasaryan <surenb@google.com>
8 * Copyright (c) 2018 Google, Inc.
9 *
7 * When CPU, memory and IO are contended, tasks experience delays that 10 * When CPU, memory and IO are contended, tasks experience delays that
8 * reduce throughput and introduce latencies into the workload. Memory 11 * reduce throughput and introduce latencies into the workload. Memory
9 * and IO contention, in addition, can cause a full loss of forward 12 * and IO contention, in addition, can cause a full loss of forward
@@ -129,9 +132,13 @@
129#include <linux/seq_file.h> 132#include <linux/seq_file.h>
130#include <linux/proc_fs.h> 133#include <linux/proc_fs.h>
131#include <linux/seqlock.h> 134#include <linux/seqlock.h>
135#include <linux/uaccess.h>
132#include <linux/cgroup.h> 136#include <linux/cgroup.h>
133#include <linux/module.h> 137#include <linux/module.h>
134#include <linux/sched.h> 138#include <linux/sched.h>
139#include <linux/ctype.h>
140#include <linux/file.h>
141#include <linux/poll.h>
135#include <linux/psi.h> 142#include <linux/psi.h>
136#include "sched.h" 143#include "sched.h"
137 144
@@ -140,9 +147,9 @@ static int psi_bug __read_mostly;
140DEFINE_STATIC_KEY_FALSE(psi_disabled); 147DEFINE_STATIC_KEY_FALSE(psi_disabled);
141 148
142#ifdef CONFIG_PSI_DEFAULT_DISABLED 149#ifdef CONFIG_PSI_DEFAULT_DISABLED
143bool psi_enable; 150static bool psi_enable;
144#else 151#else
145bool psi_enable = true; 152static bool psi_enable = true;
146#endif 153#endif
147static int __init setup_psi(char *str) 154static int __init setup_psi(char *str)
148{ 155{
@@ -156,16 +163,21 @@ __setup("psi=", setup_psi);
156#define EXP_60s 1981 /* 1/exp(2s/60s) */ 163#define EXP_60s 1981 /* 1/exp(2s/60s) */
157#define EXP_300s 2034 /* 1/exp(2s/300s) */ 164#define EXP_300s 2034 /* 1/exp(2s/300s) */
158 165
166/* PSI trigger definitions */
167#define WINDOW_MIN_US 500000 /* Min window size is 500ms */
168#define WINDOW_MAX_US 10000000 /* Max window size is 10s */
169#define UPDATES_PER_WINDOW 10 /* 10 updates per window */
170
159/* Sampling frequency in nanoseconds */ 171/* Sampling frequency in nanoseconds */
160static u64 psi_period __read_mostly; 172static u64 psi_period __read_mostly;
161 173
162/* System-level pressure and stall tracking */ 174/* System-level pressure and stall tracking */
163static DEFINE_PER_CPU(struct psi_group_cpu, system_group_pcpu); 175static DEFINE_PER_CPU(struct psi_group_cpu, system_group_pcpu);
164static struct psi_group psi_system = { 176struct psi_group psi_system = {
165 .pcpu = &system_group_pcpu, 177 .pcpu = &system_group_pcpu,
166}; 178};
167 179
168static void psi_update_work(struct work_struct *work); 180static void psi_avgs_work(struct work_struct *work);
169 181
170static void group_init(struct psi_group *group) 182static void group_init(struct psi_group *group)
171{ 183{
@@ -173,9 +185,20 @@ static void group_init(struct psi_group *group)
173 185
174 for_each_possible_cpu(cpu) 186 for_each_possible_cpu(cpu)
175 seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq); 187 seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
176 group->next_update = sched_clock() + psi_period; 188 group->avg_next_update = sched_clock() + psi_period;
177 INIT_DELAYED_WORK(&group->clock_work, psi_update_work); 189 INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work);
178 mutex_init(&group->stat_lock); 190 mutex_init(&group->avgs_lock);
191 /* Init trigger-related members */
192 atomic_set(&group->poll_scheduled, 0);
193 mutex_init(&group->trigger_lock);
194 INIT_LIST_HEAD(&group->triggers);
195 memset(group->nr_triggers, 0, sizeof(group->nr_triggers));
196 group->poll_states = 0;
197 group->poll_min_period = U32_MAX;
198 memset(group->polling_total, 0, sizeof(group->polling_total));
199 group->polling_next_update = ULLONG_MAX;
200 group->polling_until = 0;
201 rcu_assign_pointer(group->poll_kworker, NULL);
179} 202}
180 203
181void __init psi_init(void) 204void __init psi_init(void)
@@ -210,20 +233,24 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
210 } 233 }
211} 234}
212 235
213static void get_recent_times(struct psi_group *group, int cpu, u32 *times) 236static void get_recent_times(struct psi_group *group, int cpu,
237 enum psi_aggregators aggregator, u32 *times,
238 u32 *pchanged_states)
214{ 239{
215 struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); 240 struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
216 unsigned int tasks[NR_PSI_TASK_COUNTS];
217 u64 now, state_start; 241 u64 now, state_start;
242 enum psi_states s;
218 unsigned int seq; 243 unsigned int seq;
219 int s; 244 u32 state_mask;
245
246 *pchanged_states = 0;
220 247
221 /* Snapshot a coherent view of the CPU state */ 248 /* Snapshot a coherent view of the CPU state */
222 do { 249 do {
223 seq = read_seqcount_begin(&groupc->seq); 250 seq = read_seqcount_begin(&groupc->seq);
224 now = cpu_clock(cpu); 251 now = cpu_clock(cpu);
225 memcpy(times, groupc->times, sizeof(groupc->times)); 252 memcpy(times, groupc->times, sizeof(groupc->times));
226 memcpy(tasks, groupc->tasks, sizeof(groupc->tasks)); 253 state_mask = groupc->state_mask;
227 state_start = groupc->state_start; 254 state_start = groupc->state_start;
228 } while (read_seqcount_retry(&groupc->seq, seq)); 255 } while (read_seqcount_retry(&groupc->seq, seq));
229 256
@@ -239,13 +266,15 @@ static void get_recent_times(struct psi_group *group, int cpu, u32 *times)
239 * (u32) and our reported pressure close to what's 266 * (u32) and our reported pressure close to what's
240 * actually happening. 267 * actually happening.
241 */ 268 */
242 if (test_state(tasks, s)) 269 if (state_mask & (1 << s))
243 times[s] += now - state_start; 270 times[s] += now - state_start;
244 271
245 delta = times[s] - groupc->times_prev[s]; 272 delta = times[s] - groupc->times_prev[aggregator][s];
246 groupc->times_prev[s] = times[s]; 273 groupc->times_prev[aggregator][s] = times[s];
247 274
248 times[s] = delta; 275 times[s] = delta;
276 if (delta)
277 *pchanged_states |= (1 << s);
249 } 278 }
250} 279}
251 280
@@ -269,17 +298,16 @@ static void calc_avgs(unsigned long avg[3], int missed_periods,
269 avg[2] = calc_load(avg[2], EXP_300s, pct); 298 avg[2] = calc_load(avg[2], EXP_300s, pct);
270} 299}
271 300
272static bool update_stats(struct psi_group *group) 301static void collect_percpu_times(struct psi_group *group,
302 enum psi_aggregators aggregator,
303 u32 *pchanged_states)
273{ 304{
274 u64 deltas[NR_PSI_STATES - 1] = { 0, }; 305 u64 deltas[NR_PSI_STATES - 1] = { 0, };
275 unsigned long missed_periods = 0;
276 unsigned long nonidle_total = 0; 306 unsigned long nonidle_total = 0;
277 u64 now, expires, period; 307 u32 changed_states = 0;
278 int cpu; 308 int cpu;
279 int s; 309 int s;
280 310
281 mutex_lock(&group->stat_lock);
282
283 /* 311 /*
284 * Collect the per-cpu time buckets and average them into a 312 * Collect the per-cpu time buckets and average them into a
285 * single time sample that is normalized to wallclock time. 313 * single time sample that is normalized to wallclock time.
@@ -291,8 +319,11 @@ static bool update_stats(struct psi_group *group)
291 for_each_possible_cpu(cpu) { 319 for_each_possible_cpu(cpu) {
292 u32 times[NR_PSI_STATES]; 320 u32 times[NR_PSI_STATES];
293 u32 nonidle; 321 u32 nonidle;
322 u32 cpu_changed_states;
294 323
295 get_recent_times(group, cpu, times); 324 get_recent_times(group, cpu, aggregator, times,
325 &cpu_changed_states);
326 changed_states |= cpu_changed_states;
296 327
297 nonidle = nsecs_to_jiffies(times[PSI_NONIDLE]); 328 nonidle = nsecs_to_jiffies(times[PSI_NONIDLE]);
298 nonidle_total += nonidle; 329 nonidle_total += nonidle;
@@ -315,13 +346,22 @@ static bool update_stats(struct psi_group *group)
315 346
316 /* total= */ 347 /* total= */
317 for (s = 0; s < NR_PSI_STATES - 1; s++) 348 for (s = 0; s < NR_PSI_STATES - 1; s++)
318 group->total[s] += div_u64(deltas[s], max(nonidle_total, 1UL)); 349 group->total[aggregator][s] +=
350 div_u64(deltas[s], max(nonidle_total, 1UL));
351
352 if (pchanged_states)
353 *pchanged_states = changed_states;
354}
355
356static u64 update_averages(struct psi_group *group, u64 now)
357{
358 unsigned long missed_periods = 0;
359 u64 expires, period;
360 u64 avg_next_update;
361 int s;
319 362
320 /* avgX= */ 363 /* avgX= */
321 now = sched_clock(); 364 expires = group->avg_next_update;
322 expires = group->next_update;
323 if (now < expires)
324 goto out;
325 if (now - expires >= psi_period) 365 if (now - expires >= psi_period)
326 missed_periods = div_u64(now - expires, psi_period); 366 missed_periods = div_u64(now - expires, psi_period);
327 367
@@ -332,14 +372,14 @@ static bool update_stats(struct psi_group *group)
332 * But the deltas we sample out of the per-cpu buckets above 372 * But the deltas we sample out of the per-cpu buckets above
333 * are based on the actual time elapsing between clock ticks. 373 * are based on the actual time elapsing between clock ticks.
334 */ 374 */
335 group->next_update = expires + ((1 + missed_periods) * psi_period); 375 avg_next_update = expires + ((1 + missed_periods) * psi_period);
336 period = now - (group->last_update + (missed_periods * psi_period)); 376 period = now - (group->avg_last_update + (missed_periods * psi_period));
337 group->last_update = now; 377 group->avg_last_update = now;
338 378
339 for (s = 0; s < NR_PSI_STATES - 1; s++) { 379 for (s = 0; s < NR_PSI_STATES - 1; s++) {
340 u32 sample; 380 u32 sample;
341 381
342 sample = group->total[s] - group->total_prev[s]; 382 sample = group->total[PSI_AVGS][s] - group->avg_total[s];
343 /* 383 /*
344 * Due to the lockless sampling of the time buckets, 384 * Due to the lockless sampling of the time buckets,
345 * recorded time deltas can slip into the next period, 385 * recorded time deltas can slip into the next period,
@@ -359,23 +399,30 @@ static bool update_stats(struct psi_group *group)
359 */ 399 */
360 if (sample > period) 400 if (sample > period)
361 sample = period; 401 sample = period;
362 group->total_prev[s] += sample; 402 group->avg_total[s] += sample;
363 calc_avgs(group->avg[s], missed_periods, sample, period); 403 calc_avgs(group->avg[s], missed_periods, sample, period);
364 } 404 }
365out: 405
366 mutex_unlock(&group->stat_lock); 406 return avg_next_update;
367 return nonidle_total;
368} 407}
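The missed_periods handling above lets the averager catch up in a single pass after the 2s work item has been idle. A standalone sketch of the same bookkeeping with hypothetical timestamps (2s period, last expiry at 10s, previous update at 9s, work running again at 17s):

#include <stdio.h>

int main(void)
{
	/* hypothetical timestamps, all in ns */
	unsigned long long psi_period  = 2000000000ULL;		/* 2s averaging period */
	unsigned long long expires     = 10000000000ULL;	/* expiry that was missed */
	unsigned long long last_update = 9000000000ULL;		/* previous update time */
	unsigned long long now         = 17000000000ULL;	/* work item finally runs */

	unsigned long long missed = (now - expires) / psi_period;
	unsigned long long next   = expires + (1 + missed) * psi_period;
	unsigned long long period = now - (last_update + missed * psi_period);

	/* samples are later clamped to 'period', so the catch-up can never over-report */
	printf("missed_periods=%llu next_update=%llus sampling_period=%llums\n",
	       missed, next / 1000000000ULL, period / 1000000ULL);
	return 0;
}

This prints missed_periods=3, a next update aligned to the 2s grid at 18s, and a 2s sampling period for the final catch-up sample.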
369 408
370static void psi_update_work(struct work_struct *work) 409static void psi_avgs_work(struct work_struct *work)
371{ 410{
372 struct delayed_work *dwork; 411 struct delayed_work *dwork;
373 struct psi_group *group; 412 struct psi_group *group;
413 u32 changed_states;
374 bool nonidle; 414 bool nonidle;
415 u64 now;
375 416
376 dwork = to_delayed_work(work); 417 dwork = to_delayed_work(work);
377 group = container_of(dwork, struct psi_group, clock_work); 418 group = container_of(dwork, struct psi_group, avgs_work);
419
420 mutex_lock(&group->avgs_lock);
378 421
422 now = sched_clock();
423
424 collect_percpu_times(group, PSI_AVGS, &changed_states);
425 nonidle = changed_states & (1 << PSI_NONIDLE);
379 /* 426 /*
380 * If there is task activity, periodically fold the per-cpu 427 * If there is task activity, periodically fold the per-cpu
381 * times and feed samples into the running averages. If things 428 * times and feed samples into the running averages. If things
@@ -383,18 +430,196 @@ static void psi_update_work(struct work_struct *work)
383 * Once restarted, we'll catch up the running averages in one 430 * Once restarted, we'll catch up the running averages in one
384 * go - see calc_avgs() and missed_periods. 431 * go - see calc_avgs() and missed_periods.
385 */ 432 */
386 433 if (now >= group->avg_next_update)
387 nonidle = update_stats(group); 434 group->avg_next_update = update_averages(group, now);
388 435
389 if (nonidle) { 436 if (nonidle) {
390 unsigned long delay = 0; 437 schedule_delayed_work(dwork, nsecs_to_jiffies(
391 u64 now; 438 group->avg_next_update - now) + 1);
439 }
440
441 mutex_unlock(&group->avgs_lock);
442}
443
444/* Trigger tracking window manipulations */
445static void window_reset(struct psi_window *win, u64 now, u64 value,
446 u64 prev_growth)
447{
448 win->start_time = now;
449 win->start_value = value;
450 win->prev_growth = prev_growth;
451}
452
453/*
454 * PSI growth tracking window update and growth calculation routine.
455 *
456 * This approximates a sliding tracking window by interpolating
457 * partially elapsed windows using historical growth data from the
458 * previous intervals. This minimizes memory requirements (by not storing
459 * all the intermediate values in the previous window) and simplifies
460 * the calculations. It works well because PSI signal changes only in
461 * positive direction and over relatively small window sizes the growth
462 * is close to linear.
463 */
464static u64 window_update(struct psi_window *win, u64 now, u64 value)
465{
466 u64 elapsed;
467 u64 growth;
468
469 elapsed = now - win->start_time;
470 growth = value - win->start_value;
471 /*
472 * After each tracking window passes win->start_value and
473 * win->start_time get reset and win->prev_growth stores
474 * the average per-window growth of the previous window.
475 * win->prev_growth is then used to interpolate additional
476 * growth from the previous window assuming it was linear.
477 */
478 if (elapsed > win->size)
479 window_reset(win, now, value, growth);
480 else {
481 u32 remaining;
482
483 remaining = win->size - elapsed;
484 growth += div_u64(win->prev_growth * remaining, win->size);
485 }
486
487 return growth;
488}
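window_update() above estimates growth over a sliding window by pro-rating the previous window's growth instead of storing intermediate samples. A standalone sketch of that interpolation with made-up stall times (1s window, half elapsed):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t win_size    = 1000000000ULL;	/* 1s tracking window */
	uint64_t prev_growth = 80000000ULL;	/* 80ms of stall in the previous window */
	uint64_t elapsed     = 500000000ULL;	/* current window is half done */
	uint64_t growth      = 30000000ULL;	/* 30ms of stall so far in this window */

	/* Same interpolation as window_update(): add the pro-rated share of the
	 * previous window that the sliding window still overlaps. */
	uint64_t remaining = win_size - elapsed;
	growth += prev_growth * remaining / win_size;

	printf("estimated growth over the sliding window: %llums\n",
	       (unsigned long long)(growth / 1000000ULL));
	return 0;
}

The estimate is 30ms + half of 80ms = 70ms, which is what gets compared against the trigger threshold.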
489
490static void init_triggers(struct psi_group *group, u64 now)
491{
492 struct psi_trigger *t;
493
494 list_for_each_entry(t, &group->triggers, node)
495 window_reset(&t->win, now,
496 group->total[PSI_POLL][t->state], 0);
497 memcpy(group->polling_total, group->total[PSI_POLL],
498 sizeof(group->polling_total));
499 group->polling_next_update = now + group->poll_min_period;
500}
501
502static u64 update_triggers(struct psi_group *group, u64 now)
503{
504 struct psi_trigger *t;
505 bool new_stall = false;
506 u64 *total = group->total[PSI_POLL];
507
508 /*
509 * On subsequent updates, calculate growth deltas and let
510 * watchers know when their specified thresholds are exceeded.
511 */
512 list_for_each_entry(t, &group->triggers, node) {
513 u64 growth;
514
515 /* Check for stall activity */
516 if (group->polling_total[t->state] == total[t->state])
517 continue;
518
519 /*
520 * Multiple triggers might be looking at the same state,
521 * remember to update group->polling_total[] once we've
522 * been through all of them. Also remember to extend the
523 * polling time if we see new stall activity.
524 */
525 new_stall = true;
526
527 /* Calculate growth since last update */
528 growth = window_update(&t->win, now, total[t->state]);
529 if (growth < t->threshold)
530 continue;
531
532 /* Limit event signaling to once per window */
533 if (now < t->last_event_time + t->win.size)
534 continue;
535
536 /* Generate an event */
537 if (cmpxchg(&t->event, 0, 1) == 0)
538 wake_up_interruptible(&t->event_wait);
539 t->last_event_time = now;
540 }
541
542 if (new_stall)
543 memcpy(group->polling_total, total,
544 sizeof(group->polling_total));
545
546 return now + group->poll_min_period;
547}
548
549/*
550 * Schedule polling if it's not already scheduled. It's safe to call even from
551 * hotpath because even though kthread_queue_delayed_work takes worker->lock
552 * spinlock that spinlock is never contended due to poll_scheduled atomic
553 * preventing such competition.
554 */
555static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay)
556{
557 struct kthread_worker *kworker;
558
559 /* Do not reschedule if already scheduled */
560 if (atomic_cmpxchg(&group->poll_scheduled, 0, 1) != 0)
561 return;
562
563 rcu_read_lock();
392 564
393 now = sched_clock(); 565 kworker = rcu_dereference(group->poll_kworker);
394 if (group->next_update > now) 566 /*
395 delay = nsecs_to_jiffies(group->next_update - now) + 1; 567 * kworker might be NULL in case psi_trigger_destroy races with
396 schedule_delayed_work(dwork, delay); 568 * psi_task_change (hotpath) which can't use locks
569 */
570 if (likely(kworker))
571 kthread_queue_delayed_work(kworker, &group->poll_work, delay);
572 else
573 atomic_set(&group->poll_scheduled, 0);
574
575 rcu_read_unlock();
576}
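The poll_scheduled atomic is a schedule-at-most-once guard, so the hot psi_task_change() path never has to touch the kworker lock while polling is already pending. The same pattern in isolation, as a standalone sketch with C11 atomics (the kernel code additionally protects the kworker pointer with RCU):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int poll_scheduled;

/* Returns 1 if the caller won the right to queue the work, 0 otherwise. */
static int try_schedule(void)
{
	int expected = 0;
	return atomic_compare_exchange_strong(&poll_scheduled, &expected, 1);
}

/* The worker clears the flag when it starts, re-arming the guard. */
static void worker_start(void)
{
	atomic_store(&poll_scheduled, 0);
}

int main(void)
{
	printf("first caller queues: %d\n", try_schedule());	/* 1 */
	printf("second caller skips: %d\n", try_schedule());	/* 0 */
	worker_start();
	printf("after worker ran:    %d\n", try_schedule());	/* 1 again */
	return 0;
}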
577
578static void psi_poll_work(struct kthread_work *work)
579{
580 struct kthread_delayed_work *dwork;
581 struct psi_group *group;
582 u32 changed_states;
583 u64 now;
584
585 dwork = container_of(work, struct kthread_delayed_work, work);
586 group = container_of(dwork, struct psi_group, poll_work);
587
588 atomic_set(&group->poll_scheduled, 0);
589
590 mutex_lock(&group->trigger_lock);
591
592 now = sched_clock();
593
594 collect_percpu_times(group, PSI_POLL, &changed_states);
595
596 if (changed_states & group->poll_states) {
597 /* Initialize trigger windows when entering polling mode */
598 if (now > group->polling_until)
599 init_triggers(group, now);
600
601 /*
602 * Keep the monitor active for at least the duration of the
603 * minimum tracking window as long as monitor states are
604 * changing.
605 */
606 group->polling_until = now +
607 group->poll_min_period * UPDATES_PER_WINDOW;
608 }
609
610 if (now > group->polling_until) {
611 group->polling_next_update = ULLONG_MAX;
612 goto out;
397 } 613 }
614
615 if (now >= group->polling_next_update)
616 group->polling_next_update = update_triggers(group, now);
617
618 psi_schedule_poll_work(group,
619 nsecs_to_jiffies(group->polling_next_update - now) + 1);
620
621out:
622 mutex_unlock(&group->trigger_lock);
398} 623}
399 624
400static void record_times(struct psi_group_cpu *groupc, int cpu, 625static void record_times(struct psi_group_cpu *groupc, int cpu,
@@ -407,15 +632,15 @@ static void record_times(struct psi_group_cpu *groupc, int cpu,
407 delta = now - groupc->state_start; 632 delta = now - groupc->state_start;
408 groupc->state_start = now; 633 groupc->state_start = now;
409 634
410 if (test_state(groupc->tasks, PSI_IO_SOME)) { 635 if (groupc->state_mask & (1 << PSI_IO_SOME)) {
411 groupc->times[PSI_IO_SOME] += delta; 636 groupc->times[PSI_IO_SOME] += delta;
412 if (test_state(groupc->tasks, PSI_IO_FULL)) 637 if (groupc->state_mask & (1 << PSI_IO_FULL))
413 groupc->times[PSI_IO_FULL] += delta; 638 groupc->times[PSI_IO_FULL] += delta;
414 } 639 }
415 640
416 if (test_state(groupc->tasks, PSI_MEM_SOME)) { 641 if (groupc->state_mask & (1 << PSI_MEM_SOME)) {
417 groupc->times[PSI_MEM_SOME] += delta; 642 groupc->times[PSI_MEM_SOME] += delta;
418 if (test_state(groupc->tasks, PSI_MEM_FULL)) 643 if (groupc->state_mask & (1 << PSI_MEM_FULL))
419 groupc->times[PSI_MEM_FULL] += delta; 644 groupc->times[PSI_MEM_FULL] += delta;
420 else if (memstall_tick) { 645 else if (memstall_tick) {
421 u32 sample; 646 u32 sample;
@@ -436,18 +661,20 @@ static void record_times(struct psi_group_cpu *groupc, int cpu,
436 } 661 }
437 } 662 }
438 663
439 if (test_state(groupc->tasks, PSI_CPU_SOME)) 664 if (groupc->state_mask & (1 << PSI_CPU_SOME))
440 groupc->times[PSI_CPU_SOME] += delta; 665 groupc->times[PSI_CPU_SOME] += delta;
441 666
442 if (test_state(groupc->tasks, PSI_NONIDLE)) 667 if (groupc->state_mask & (1 << PSI_NONIDLE))
443 groupc->times[PSI_NONIDLE] += delta; 668 groupc->times[PSI_NONIDLE] += delta;
444} 669}
445 670
446static void psi_group_change(struct psi_group *group, int cpu, 671static u32 psi_group_change(struct psi_group *group, int cpu,
447 unsigned int clear, unsigned int set) 672 unsigned int clear, unsigned int set)
448{ 673{
449 struct psi_group_cpu *groupc; 674 struct psi_group_cpu *groupc;
450 unsigned int t, m; 675 unsigned int t, m;
676 enum psi_states s;
677 u32 state_mask = 0;
451 678
452 groupc = per_cpu_ptr(group->pcpu, cpu); 679 groupc = per_cpu_ptr(group->pcpu, cpu);
453 680
@@ -480,7 +707,16 @@ static void psi_group_change(struct psi_group *group, int cpu,
480 if (set & (1 << t)) 707 if (set & (1 << t))
481 groupc->tasks[t]++; 708 groupc->tasks[t]++;
482 709
710 /* Calculate state mask representing active states */
711 for (s = 0; s < NR_PSI_STATES; s++) {
712 if (test_state(groupc->tasks, s))
713 state_mask |= (1 << s);
714 }
715 groupc->state_mask = state_mask;
716
483 write_seqcount_end(&groupc->seq); 717 write_seqcount_end(&groupc->seq);
718
719 return state_mask;
484} 720}
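Caching state_mask at change time means record_times() and the per-CPU sampling above can test active states with a single bit test instead of re-deriving them from the task counts. A standalone sketch of the mask construction (the toy predicate stands in for test_state()):

#include <stdio.h>

enum { PSI_IO_SOME, PSI_IO_FULL, PSI_MEM_SOME, PSI_MEM_FULL,
       PSI_CPU_SOME, PSI_NONIDLE, NR_PSI_STATES };

/* hypothetical per-state results standing in for test_state(groupc->tasks, s) */
static int toy_test_state(const int *active, int s)
{
	return active[s];
}

int main(void)
{
	int active[NR_PSI_STATES] = { 1, 0, 1, 1, 0, 1 };
	unsigned int state_mask = 0;

	for (int s = 0; s < NR_PSI_STATES; s++)
		if (toy_test_state(active, s))
			state_mask |= 1u << s;

	/* later readers just test bits, e.g. "is some memory stall active?" */
	printf("mask=0x%x mem_some=%d\n", state_mask,
	       !!(state_mask & (1u << PSI_MEM_SOME)));
	return 0;
}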
485 721
486static struct psi_group *iterate_groups(struct task_struct *task, void **iter) 722static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
@@ -537,13 +773,17 @@ void psi_task_change(struct task_struct *task, int clear, int set)
537 */ 773 */
538 if (unlikely((clear & TSK_RUNNING) && 774 if (unlikely((clear & TSK_RUNNING) &&
539 (task->flags & PF_WQ_WORKER) && 775 (task->flags & PF_WQ_WORKER) &&
540 wq_worker_last_func(task) == psi_update_work)) 776 wq_worker_last_func(task) == psi_avgs_work))
541 wake_clock = false; 777 wake_clock = false;
542 778
543 while ((group = iterate_groups(task, &iter))) { 779 while ((group = iterate_groups(task, &iter))) {
544 psi_group_change(group, cpu, clear, set); 780 u32 state_mask = psi_group_change(group, cpu, clear, set);
545 if (wake_clock && !delayed_work_pending(&group->clock_work)) 781
546 schedule_delayed_work(&group->clock_work, PSI_FREQ); 782 if (state_mask & group->poll_states)
783 psi_schedule_poll_work(group, 1);
784
785 if (wake_clock && !delayed_work_pending(&group->avgs_work))
786 schedule_delayed_work(&group->avgs_work, PSI_FREQ);
547 } 787 }
548} 788}
549 789
@@ -640,8 +880,10 @@ void psi_cgroup_free(struct cgroup *cgroup)
640 if (static_branch_likely(&psi_disabled)) 880 if (static_branch_likely(&psi_disabled))
641 return; 881 return;
642 882
643 cancel_delayed_work_sync(&cgroup->psi.clock_work); 883 cancel_delayed_work_sync(&cgroup->psi.avgs_work);
644 free_percpu(cgroup->psi.pcpu); 884 free_percpu(cgroup->psi.pcpu);
885 /* All triggers must be removed by now */
886 WARN_ONCE(cgroup->psi.poll_states, "psi: trigger leak\n");
645} 887}
646 888
647/** 889/**
@@ -697,11 +939,18 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to)
697int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) 939int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
698{ 940{
699 int full; 941 int full;
942 u64 now;
700 943
701 if (static_branch_likely(&psi_disabled)) 944 if (static_branch_likely(&psi_disabled))
702 return -EOPNOTSUPP; 945 return -EOPNOTSUPP;
703 946
704 update_stats(group); 947 /* Update averages before reporting them */
948 mutex_lock(&group->avgs_lock);
949 now = sched_clock();
950 collect_percpu_times(group, PSI_AVGS, NULL);
951 if (now >= group->avg_next_update)
952 group->avg_next_update = update_averages(group, now);
953 mutex_unlock(&group->avgs_lock);
705 954
706 for (full = 0; full < 2 - (res == PSI_CPU); full++) { 955 for (full = 0; full < 2 - (res == PSI_CPU); full++) {
707 unsigned long avg[3]; 956 unsigned long avg[3];
@@ -710,7 +959,8 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
710 959
711 for (w = 0; w < 3; w++) 960 for (w = 0; w < 3; w++)
712 avg[w] = group->avg[res * 2 + full][w]; 961 avg[w] = group->avg[res * 2 + full][w];
713 total = div_u64(group->total[res * 2 + full], NSEC_PER_USEC); 962 total = div_u64(group->total[PSI_AVGS][res * 2 + full],
963 NSEC_PER_USEC);
714 964
715 seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n", 965 seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n",
716 full ? "full" : "some", 966 full ? "full" : "some",
@@ -753,25 +1003,270 @@ static int psi_cpu_open(struct inode *inode, struct file *file)
753 return single_open(file, psi_cpu_show, NULL); 1003 return single_open(file, psi_cpu_show, NULL);
754} 1004}
755 1005
1006struct psi_trigger *psi_trigger_create(struct psi_group *group,
1007 char *buf, size_t nbytes, enum psi_res res)
1008{
1009 struct psi_trigger *t;
1010 enum psi_states state;
1011 u32 threshold_us;
1012 u32 window_us;
1013
1014 if (static_branch_likely(&psi_disabled))
1015 return ERR_PTR(-EOPNOTSUPP);
1016
1017 if (sscanf(buf, "some %u %u", &threshold_us, &window_us) == 2)
1018 state = PSI_IO_SOME + res * 2;
1019 else if (sscanf(buf, "full %u %u", &threshold_us, &window_us) == 2)
1020 state = PSI_IO_FULL + res * 2;
1021 else
1022 return ERR_PTR(-EINVAL);
1023
1024 if (state >= PSI_NONIDLE)
1025 return ERR_PTR(-EINVAL);
1026
1027 if (window_us < WINDOW_MIN_US ||
1028 window_us > WINDOW_MAX_US)
1029 return ERR_PTR(-EINVAL);
1030
1031 /* Check threshold */
1032 if (threshold_us == 0 || threshold_us > window_us)
1033 return ERR_PTR(-EINVAL);
1034
1035 t = kmalloc(sizeof(*t), GFP_KERNEL);
1036 if (!t)
1037 return ERR_PTR(-ENOMEM);
1038
1039 t->group = group;
1040 t->state = state;
1041 t->threshold = threshold_us * NSEC_PER_USEC;
1042 t->win.size = window_us * NSEC_PER_USEC;
1043 window_reset(&t->win, 0, 0, 0);
1044
1045 t->event = 0;
1046 t->last_event_time = 0;
1047 init_waitqueue_head(&t->event_wait);
1048 kref_init(&t->refcount);
1049
1050 mutex_lock(&group->trigger_lock);
1051
1052 if (!rcu_access_pointer(group->poll_kworker)) {
1053 struct sched_param param = {
1054 .sched_priority = MAX_RT_PRIO - 1,
1055 };
1056 struct kthread_worker *kworker;
1057
1058 kworker = kthread_create_worker(0, "psimon");
1059 if (IS_ERR(kworker)) {
1060 kfree(t);
1061 mutex_unlock(&group->trigger_lock);
1062 return ERR_CAST(kworker);
1063 }
1064 sched_setscheduler(kworker->task, SCHED_FIFO, &param);
1065 kthread_init_delayed_work(&group->poll_work,
1066 psi_poll_work);
1067 rcu_assign_pointer(group->poll_kworker, kworker);
1068 }
1069
1070 list_add(&t->node, &group->triggers);
1071 group->poll_min_period = min(group->poll_min_period,
1072 div_u64(t->win.size, UPDATES_PER_WINDOW));
1073 group->nr_triggers[t->state]++;
1074 group->poll_states |= (1 << t->state);
1075
1076 mutex_unlock(&group->trigger_lock);
1077
1078 return t;
1079}
1080
1081static void psi_trigger_destroy(struct kref *ref)
1082{
1083 struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount);
1084 struct psi_group *group = t->group;
1085 struct kthread_worker *kworker_to_destroy = NULL;
1086
1087 if (static_branch_likely(&psi_disabled))
1088 return;
1089
1090 /*
 1091 * Wake up waiters so they stop polling. This can happen if the
 1092 * cgroup is deleted from under a polling process.
1093 */
1094 wake_up_interruptible(&t->event_wait);
1095
1096 mutex_lock(&group->trigger_lock);
1097
1098 if (!list_empty(&t->node)) {
1099 struct psi_trigger *tmp;
1100 u64 period = ULLONG_MAX;
1101
1102 list_del(&t->node);
1103 group->nr_triggers[t->state]--;
1104 if (!group->nr_triggers[t->state])
1105 group->poll_states &= ~(1 << t->state);
1106 /* reset min update period for the remaining triggers */
1107 list_for_each_entry(tmp, &group->triggers, node)
1108 period = min(period, div_u64(tmp->win.size,
1109 UPDATES_PER_WINDOW));
1110 group->poll_min_period = period;
1111 /* Destroy poll_kworker when the last trigger is destroyed */
1112 if (group->poll_states == 0) {
1113 group->polling_until = 0;
1114 kworker_to_destroy = rcu_dereference_protected(
1115 group->poll_kworker,
1116 lockdep_is_held(&group->trigger_lock));
1117 rcu_assign_pointer(group->poll_kworker, NULL);
1118 }
1119 }
1120
1121 mutex_unlock(&group->trigger_lock);
1122
1123 /*
 1124 * Wait for the RCU read-side critical sections on both *trigger_ptr
 1125 * (from psi_trigger_replace) and poll_kworker to complete before
 1126 * destroying the trigger and, optionally, the poll_kworker.
1127 */
1128 synchronize_rcu();
1129 /*
1130 * Destroy the kworker after releasing trigger_lock to prevent a
1131 * deadlock while waiting for psi_poll_work to acquire trigger_lock
1132 */
1133 if (kworker_to_destroy) {
1134 kthread_cancel_delayed_work_sync(&group->poll_work);
1135 kthread_destroy_worker(kworker_to_destroy);
1136 }
1137 kfree(t);
1138}
1139
1140void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *new)
1141{
1142 struct psi_trigger *old = *trigger_ptr;
1143
1144 if (static_branch_likely(&psi_disabled))
1145 return;
1146
1147 rcu_assign_pointer(*trigger_ptr, new);
1148 if (old)
1149 kref_put(&old->refcount, psi_trigger_destroy);
1150}
1151
1152__poll_t psi_trigger_poll(void **trigger_ptr,
1153 struct file *file, poll_table *wait)
1154{
1155 __poll_t ret = DEFAULT_POLLMASK;
1156 struct psi_trigger *t;
1157
1158 if (static_branch_likely(&psi_disabled))
1159 return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
1160
1161 rcu_read_lock();
1162
1163 t = rcu_dereference(*(void __rcu __force **)trigger_ptr);
1164 if (!t) {
1165 rcu_read_unlock();
1166 return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
1167 }
1168 kref_get(&t->refcount);
1169
1170 rcu_read_unlock();
1171
1172 poll_wait(file, &t->event_wait, wait);
1173
1174 if (cmpxchg(&t->event, 1, 0) == 1)
1175 ret |= EPOLLPRI;
1176
1177 kref_put(&t->refcount, psi_trigger_destroy);
1178
1179 return ret;
1180}
1181
1182static ssize_t psi_write(struct file *file, const char __user *user_buf,
1183 size_t nbytes, enum psi_res res)
1184{
1185 char buf[32];
1186 size_t buf_size;
1187 struct seq_file *seq;
1188 struct psi_trigger *new;
1189
1190 if (static_branch_likely(&psi_disabled))
1191 return -EOPNOTSUPP;
1192
1193 buf_size = min(nbytes, (sizeof(buf) - 1));
1194 if (copy_from_user(buf, user_buf, buf_size))
1195 return -EFAULT;
1196
1197 buf[buf_size - 1] = '\0';
1198
1199 new = psi_trigger_create(&psi_system, buf, nbytes, res);
1200 if (IS_ERR(new))
1201 return PTR_ERR(new);
1202
1203 seq = file->private_data;
1204 /* Take seq->lock to protect seq->private from concurrent writes */
1205 mutex_lock(&seq->lock);
1206 psi_trigger_replace(&seq->private, new);
1207 mutex_unlock(&seq->lock);
1208
1209 return nbytes;
1210}
1211
1212static ssize_t psi_io_write(struct file *file, const char __user *user_buf,
1213 size_t nbytes, loff_t *ppos)
1214{
1215 return psi_write(file, user_buf, nbytes, PSI_IO);
1216}
1217
1218static ssize_t psi_memory_write(struct file *file, const char __user *user_buf,
1219 size_t nbytes, loff_t *ppos)
1220{
1221 return psi_write(file, user_buf, nbytes, PSI_MEM);
1222}
1223
1224static ssize_t psi_cpu_write(struct file *file, const char __user *user_buf,
1225 size_t nbytes, loff_t *ppos)
1226{
1227 return psi_write(file, user_buf, nbytes, PSI_CPU);
1228}
1229
1230static __poll_t psi_fop_poll(struct file *file, poll_table *wait)
1231{
1232 struct seq_file *seq = file->private_data;
1233
1234 return psi_trigger_poll(&seq->private, file, wait);
1235}
1236
1237static int psi_fop_release(struct inode *inode, struct file *file)
1238{
1239 struct seq_file *seq = file->private_data;
1240
1241 psi_trigger_replace(&seq->private, NULL);
1242 return single_release(inode, file);
1243}
1244
756static const struct file_operations psi_io_fops = { 1245static const struct file_operations psi_io_fops = {
757 .open = psi_io_open, 1246 .open = psi_io_open,
758 .read = seq_read, 1247 .read = seq_read,
759 .llseek = seq_lseek, 1248 .llseek = seq_lseek,
760 .release = single_release, 1249 .write = psi_io_write,
1250 .poll = psi_fop_poll,
1251 .release = psi_fop_release,
761}; 1252};
762 1253
763static const struct file_operations psi_memory_fops = { 1254static const struct file_operations psi_memory_fops = {
764 .open = psi_memory_open, 1255 .open = psi_memory_open,
765 .read = seq_read, 1256 .read = seq_read,
766 .llseek = seq_lseek, 1257 .llseek = seq_lseek,
767 .release = single_release, 1258 .write = psi_memory_write,
1259 .poll = psi_fop_poll,
1260 .release = psi_fop_release,
768}; 1261};
769 1262
770static const struct file_operations psi_cpu_fops = { 1263static const struct file_operations psi_cpu_fops = {
771 .open = psi_cpu_open, 1264 .open = psi_cpu_open,
772 .read = seq_read, 1265 .read = seq_read,
773 .llseek = seq_lseek, 1266 .llseek = seq_lseek,
774 .release = single_release, 1267 .write = psi_cpu_write,
1268 .poll = psi_fop_poll,
1269 .release = psi_fop_release,
775}; 1270};
776 1271
777static int __init psi_proc_init(void) 1272static int __init psi_proc_init(void)
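
The new write and poll handlers above accept trigger strings of the form
"some <threshold_us> <window_us>" or "full <threshold_us> <window_us>" and
assert EPOLLPRI once the accumulated stall in the window crosses the
threshold. A minimal userspace sketch of consuming that interface follows;
it assumes the system-wide files registered by psi_proc_init() show up as
/proc/pressure/memory, and the numbers are purely illustrative.

/* Watch for >=150ms of "some" memory stall in any 1s window. */
#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char trig[] = "some 150000 1000000";
	struct pollfd pfd;
	int fd;

	fd = open("/proc/pressure/memory", O_RDWR | O_NONBLOCK);
	if (fd < 0)
		return 1;
	if (write(fd, trig, strlen(trig) + 1) < 0)
		return 1;

	pfd.fd = fd;
	pfd.events = POLLPRI;
	for (;;) {
		if (poll(&pfd, 1, -1) < 0)
			break;
		if (pfd.revents & POLLPRI)
			printf("memory pressure event\n");
	}
	close(fd);
	return 0;
}

Closing the file drops the trigger: psi_fop_release() replaces it with NULL,
and the final kref_put() tears down the poll kworker once no triggers remain.
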
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 1e6b909dca36..a532558a5176 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1614,7 +1614,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1614static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) 1614static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1615{ 1615{
1616 if (!task_running(rq, p) && 1616 if (!task_running(rq, p) &&
1617 cpumask_test_cpu(cpu, &p->cpus_allowed)) 1617 cpumask_test_cpu(cpu, p->cpus_ptr))
1618 return 1; 1618 return 1;
1619 1619
1620 return 0; 1620 return 0;
@@ -1751,7 +1751,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1751 * Also make sure that it wasn't scheduled on its rq. 1751 * Also make sure that it wasn't scheduled on its rq.
1752 */ 1752 */
1753 if (unlikely(task_rq(task) != rq || 1753 if (unlikely(task_rq(task) != rq ||
1754 !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_allowed) || 1754 !cpumask_test_cpu(lowest_rq->cpu, task->cpus_ptr) ||
1755 task_running(rq, task) || 1755 task_running(rq, task) ||
1756 !rt_task(task) || 1756 !rt_task(task) ||
1757 !task_on_rq_queued(task))) { 1757 !task_on_rq_queued(task))) {
@@ -2400,6 +2400,10 @@ const struct sched_class rt_sched_class = {
2400 .switched_to = switched_to_rt, 2400 .switched_to = switched_to_rt,
2401 2401
2402 .update_curr = update_curr_rt, 2402 .update_curr = update_curr_rt,
2403
2404#ifdef CONFIG_UCLAMP_TASK
2405 .uclamp_enabled = 1,
2406#endif
2403}; 2407};
2404 2408
2405#ifdef CONFIG_RT_GROUP_SCHED 2409#ifdef CONFIG_RT_GROUP_SCHED
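
The rt.c hunks above are part of switching readers from the embedded
cpus_allowed mask to the cpus_ptr pointer. A caller-side sketch of that
migration (can_run_on() is a made-up wrapper, not part of this patch):

/* Sketch of the accessor migration; can_run_on() is illustrative only. */
static bool can_run_on(struct task_struct *p, int cpu)
{
	/* Before this series: cpumask_test_cpu(cpu, &p->cpus_allowed) */
	return cpumask_test_cpu(cpu, p->cpus_ptr);
}
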
diff --git a/kernel/sched/sched-pelt.h b/kernel/sched/sched-pelt.h
index a26473674fb7..c529706bed11 100644
--- a/kernel/sched/sched-pelt.h
+++ b/kernel/sched/sched-pelt.h
@@ -1,7 +1,7 @@
1/* SPDX-License-Identifier: GPL-2.0 */ 1/* SPDX-License-Identifier: GPL-2.0 */
2/* Generated by Documentation/scheduler/sched-pelt; do not modify. */ 2/* Generated by Documentation/scheduler/sched-pelt; do not modify. */
3 3
4static const u32 runnable_avg_yN_inv[] = { 4static const u32 runnable_avg_yN_inv[] __maybe_unused = {
5 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6, 5 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
6 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85, 6 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
7 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581, 7 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b52ed1ada0be..802b1f3405f2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -96,12 +96,6 @@ extern atomic_long_t calc_load_tasks;
96extern void calc_global_load_tick(struct rq *this_rq); 96extern void calc_global_load_tick(struct rq *this_rq);
97extern long calc_load_fold_active(struct rq *this_rq, long adjust); 97extern long calc_load_fold_active(struct rq *this_rq, long adjust);
98 98
99#ifdef CONFIG_SMP
100extern void cpu_load_update_active(struct rq *this_rq);
101#else
102static inline void cpu_load_update_active(struct rq *this_rq) { }
103#endif
104
105/* 99/*
106 * Helpers for converting nanosecond timing to jiffy resolution 100 * Helpers for converting nanosecond timing to jiffy resolution
107 */ 101 */
@@ -344,8 +338,10 @@ struct cfs_bandwidth {
344 u64 runtime_expires; 338 u64 runtime_expires;
345 int expires_seq; 339 int expires_seq;
346 340
347 short idle; 341 u8 idle;
348 short period_active; 342 u8 period_active;
343 u8 distribute_running;
344 u8 slack_started;
349 struct hrtimer period_timer; 345 struct hrtimer period_timer;
350 struct hrtimer slack_timer; 346 struct hrtimer slack_timer;
351 struct list_head throttled_cfs_rq; 347 struct list_head throttled_cfs_rq;
@@ -354,8 +350,6 @@ struct cfs_bandwidth {
354 int nr_periods; 350 int nr_periods;
355 int nr_throttled; 351 int nr_throttled;
356 u64 throttled_time; 352 u64 throttled_time;
357
358 bool distribute_running;
359#endif 353#endif
360}; 354};
361 355
@@ -797,6 +791,48 @@ extern void rto_push_irq_work_func(struct irq_work *work);
797#endif 791#endif
798#endif /* CONFIG_SMP */ 792#endif /* CONFIG_SMP */
799 793
794#ifdef CONFIG_UCLAMP_TASK
795/*
796 * struct uclamp_bucket - Utilization clamp bucket
797 * @value: utilization clamp value for tasks on this clamp bucket
798 * @tasks: number of RUNNABLE tasks on this clamp bucket
799 *
800 * Keep track of how many tasks are RUNNABLE for a given utilization
801 * clamp value.
802 */
803struct uclamp_bucket {
804 unsigned long value : bits_per(SCHED_CAPACITY_SCALE);
805 unsigned long tasks : BITS_PER_LONG - bits_per(SCHED_CAPACITY_SCALE);
806};
807
808/*
809 * struct uclamp_rq - rq's utilization clamp
810 * @value: currently active clamp values for a rq
811 * @bucket: utilization clamp buckets affecting a rq
812 *
813 * Keep track of RUNNABLE tasks on a rq to aggregate their clamp values.
 814 * A clamp value affects a rq when there is at least one task RUNNABLE
815 * (or actually running) with that value.
816 *
817 * There are up to UCLAMP_CNT possible different clamp values, currently there
818 * are only two: minimum utilization and maximum utilization.
819 *
820 * All utilization clamping values are MAX aggregated, since:
821 * - for util_min: we want to run the CPU at least at the max of the minimum
822 * utilization required by its currently RUNNABLE tasks.
823 * - for util_max: we want to allow the CPU to run up to the max of the
824 * maximum utilization allowed by its currently RUNNABLE tasks.
825 *
826 * Since on each system we expect only a limited number of different
827 * utilization clamp values (UCLAMP_BUCKETS), use a simple array to track
828 * the metrics required to compute all the per-rq utilization clamp values.
829 */
830struct uclamp_rq {
831 unsigned int value;
832 struct uclamp_bucket bucket[UCLAMP_BUCKETS];
833};
834#endif /* CONFIG_UCLAMP_TASK */
835
800/* 836/*
801 * This is the main, per-CPU runqueue data structure. 837 * This is the main, per-CPU runqueue data structure.
802 * 838 *
@@ -818,8 +854,6 @@ struct rq {
818 unsigned int nr_preferred_running; 854 unsigned int nr_preferred_running;
819 unsigned int numa_migrate_on; 855 unsigned int numa_migrate_on;
820#endif 856#endif
821 #define CPU_LOAD_IDX_MAX 5
822 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
823#ifdef CONFIG_NO_HZ_COMMON 857#ifdef CONFIG_NO_HZ_COMMON
824#ifdef CONFIG_SMP 858#ifdef CONFIG_SMP
825 unsigned long last_load_update_tick; 859 unsigned long last_load_update_tick;
@@ -830,11 +864,16 @@ struct rq {
830 atomic_t nohz_flags; 864 atomic_t nohz_flags;
831#endif /* CONFIG_NO_HZ_COMMON */ 865#endif /* CONFIG_NO_HZ_COMMON */
832 866
833 /* capture load from *all* tasks on this CPU: */
834 struct load_weight load;
835 unsigned long nr_load_updates; 867 unsigned long nr_load_updates;
836 u64 nr_switches; 868 u64 nr_switches;
837 869
870#ifdef CONFIG_UCLAMP_TASK
871 /* Utilization clamp values based on CPU's RUNNABLE tasks */
872 struct uclamp_rq uclamp[UCLAMP_CNT] ____cacheline_aligned;
873 unsigned int uclamp_flags;
874#define UCLAMP_FLAG_IDLE 0x01
875#endif
876
838 struct cfs_rq cfs; 877 struct cfs_rq cfs;
839 struct rt_rq rt; 878 struct rt_rq rt;
840 struct dl_rq dl; 879 struct dl_rq dl;
@@ -1649,6 +1688,10 @@ extern const u32 sched_prio_to_wmult[40];
1649struct sched_class { 1688struct sched_class {
1650 const struct sched_class *next; 1689 const struct sched_class *next;
1651 1690
1691#ifdef CONFIG_UCLAMP_TASK
1692 int uclamp_enabled;
1693#endif
1694
1652 void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); 1695 void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
1653 void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); 1696 void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
1654 void (*yield_task) (struct rq *rq); 1697 void (*yield_task) (struct rq *rq);
@@ -2222,6 +2265,48 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
2222static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} 2265static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
2223#endif /* CONFIG_CPU_FREQ */ 2266#endif /* CONFIG_CPU_FREQ */
2224 2267
2268#ifdef CONFIG_UCLAMP_TASK
2269unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id);
2270
2271static __always_inline
2272unsigned int uclamp_util_with(struct rq *rq, unsigned int util,
2273 struct task_struct *p)
2274{
2275 unsigned int min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value);
2276 unsigned int max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value);
2277
2278 if (p) {
2279 min_util = max(min_util, uclamp_eff_value(p, UCLAMP_MIN));
2280 max_util = max(max_util, uclamp_eff_value(p, UCLAMP_MAX));
2281 }
2282
2283 /*
2284 * Since CPU's {min,max}_util clamps are MAX aggregated considering
2285 * RUNNABLE tasks with _different_ clamps, we can end up with an
2286 * inversion. Fix it now when the clamps are applied.
2287 */
2288 if (unlikely(min_util >= max_util))
2289 return min_util;
2290
2291 return clamp(util, min_util, max_util);
2292}
2293
2294static inline unsigned int uclamp_util(struct rq *rq, unsigned int util)
2295{
2296 return uclamp_util_with(rq, util, NULL);
2297}
2298#else /* CONFIG_UCLAMP_TASK */
2299static inline unsigned int uclamp_util_with(struct rq *rq, unsigned int util,
2300 struct task_struct *p)
2301{
2302 return util;
2303}
2304static inline unsigned int uclamp_util(struct rq *rq, unsigned int util)
2305{
2306 return util;
2307}
2308#endif /* CONFIG_UCLAMP_TASK */
2309
2225#ifdef arch_scale_freq_capacity 2310#ifdef arch_scale_freq_capacity
2226# ifndef arch_scale_freq_invariant 2311# ifndef arch_scale_freq_invariant
2227# define arch_scale_freq_invariant() true 2312# define arch_scale_freq_invariant() true
@@ -2237,7 +2322,6 @@ static inline unsigned long capacity_orig_of(int cpu)
2237} 2322}
2238#endif 2323#endif
2239 2324
2240#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
2241/** 2325/**
2242 * enum schedutil_type - CPU utilization type 2326 * enum schedutil_type - CPU utilization type
2243 * @FREQUENCY_UTIL: Utilization used to select frequency 2327 * @FREQUENCY_UTIL: Utilization used to select frequency
@@ -2253,15 +2337,11 @@ enum schedutil_type {
2253 ENERGY_UTIL, 2337 ENERGY_UTIL,
2254}; 2338};
2255 2339
2256unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs, 2340#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
2257 unsigned long max, enum schedutil_type type);
2258
2259static inline unsigned long schedutil_energy_util(int cpu, unsigned long cfs)
2260{
2261 unsigned long max = arch_scale_cpu_capacity(NULL, cpu);
2262 2341
2263 return schedutil_freq_util(cpu, cfs, max, ENERGY_UTIL); 2342unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
2264} 2343 unsigned long max, enum schedutil_type type,
2344 struct task_struct *p);
2265 2345
2266static inline unsigned long cpu_bw_dl(struct rq *rq) 2346static inline unsigned long cpu_bw_dl(struct rq *rq)
2267{ 2347{
@@ -2290,11 +2370,13 @@ static inline unsigned long cpu_util_rt(struct rq *rq)
2290 return READ_ONCE(rq->avg_rt.util_avg); 2370 return READ_ONCE(rq->avg_rt.util_avg);
2291} 2371}
2292#else /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ 2372#else /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
2293static inline unsigned long schedutil_energy_util(int cpu, unsigned long cfs) 2373static inline unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
2374 unsigned long max, enum schedutil_type type,
2375 struct task_struct *p)
2294{ 2376{
2295 return cfs; 2377 return 0;
2296} 2378}
2297#endif 2379#endif /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
2298 2380
2299#ifdef CONFIG_HAVE_SCHED_AVG_IRQ 2381#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
2300static inline unsigned long cpu_util_irq(struct rq *rq) 2382static inline unsigned long cpu_util_irq(struct rq *rq)
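
uclamp_util_with() clamps a CPU utilization value into the MAX-aggregated
[min, max] range of the rq, optionally raised by the task's own effective
clamps, and resolves a min/max inversion in favour of the minimum. A
stand-alone sketch of just that clamping rule, using plain integers rather
than rq/task state:

#include <stdio.h>

static unsigned int clamp_util(unsigned int util,
			       unsigned int min_util, unsigned int max_util)
{
	/* min and max are MAX-aggregated over different tasks, so they
	 * can invert; the minimum wins in that case. */
	if (min_util >= max_util)
		return min_util;
	if (util < min_util)
		return min_util;
	if (util > max_util)
		return max_util;
	return util;
}

int main(void)
{
	printf("%u\n", clamp_util(100, 200, 800));	/* boosted to 200 */
	printf("%u\n", clamp_util(900, 200, 800));	/* capped at 800 */
	printf("%u\n", clamp_util(500, 700, 600));	/* inversion: 700 */
	return 0;
}
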
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index f53f89df837d..f751ce0b783e 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1344,11 +1344,6 @@ sd_init(struct sched_domain_topology_level *tl,
1344 .imbalance_pct = 125, 1344 .imbalance_pct = 125,
1345 1345
1346 .cache_nice_tries = 0, 1346 .cache_nice_tries = 0,
1347 .busy_idx = 0,
1348 .idle_idx = 0,
1349 .newidle_idx = 0,
1350 .wake_idx = 0,
1351 .forkexec_idx = 0,
1352 1347
1353 .flags = 1*SD_LOAD_BALANCE 1348 .flags = 1*SD_LOAD_BALANCE
1354 | 1*SD_BALANCE_NEWIDLE 1349 | 1*SD_BALANCE_NEWIDLE
@@ -1400,13 +1395,10 @@ sd_init(struct sched_domain_topology_level *tl,
1400 } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { 1395 } else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
1401 sd->imbalance_pct = 117; 1396 sd->imbalance_pct = 117;
1402 sd->cache_nice_tries = 1; 1397 sd->cache_nice_tries = 1;
1403 sd->busy_idx = 2;
1404 1398
1405#ifdef CONFIG_NUMA 1399#ifdef CONFIG_NUMA
1406 } else if (sd->flags & SD_NUMA) { 1400 } else if (sd->flags & SD_NUMA) {
1407 sd->cache_nice_tries = 2; 1401 sd->cache_nice_tries = 2;
1408 sd->busy_idx = 3;
1409 sd->idle_idx = 2;
1410 1402
1411 sd->flags &= ~SD_PREFER_SIBLING; 1403 sd->flags &= ~SD_PREFER_SIBLING;
1412 sd->flags |= SD_SERIALIZE; 1404 sd->flags |= SD_SERIALIZE;
@@ -1419,8 +1411,6 @@ sd_init(struct sched_domain_topology_level *tl,
1419#endif 1411#endif
1420 } else { 1412 } else {
1421 sd->cache_nice_tries = 1; 1413 sd->cache_nice_tries = 1;
1422 sd->busy_idx = 2;
1423 sd->idle_idx = 1;
1424 } 1414 }
1425 1415
1426 /* 1416 /*
@@ -1884,10 +1874,10 @@ static struct sched_domain_topology_level
1884 unsigned long cap; 1874 unsigned long cap;
1885 1875
1886 /* Is there any asymmetry? */ 1876 /* Is there any asymmetry? */
1887 cap = arch_scale_cpu_capacity(NULL, cpumask_first(cpu_map)); 1877 cap = arch_scale_cpu_capacity(cpumask_first(cpu_map));
1888 1878
1889 for_each_cpu(i, cpu_map) { 1879 for_each_cpu(i, cpu_map) {
1890 if (arch_scale_cpu_capacity(NULL, i) != cap) { 1880 if (arch_scale_cpu_capacity(i) != cap) {
1891 asym = true; 1881 asym = true;
1892 break; 1882 break;
1893 } 1883 }
@@ -1902,7 +1892,7 @@ static struct sched_domain_topology_level
1902 * to everyone. 1892 * to everyone.
1903 */ 1893 */
1904 for_each_cpu(i, cpu_map) { 1894 for_each_cpu(i, cpu_map) {
1905 unsigned long max_capacity = arch_scale_cpu_capacity(NULL, i); 1895 unsigned long max_capacity = arch_scale_cpu_capacity(i);
1906 int tl_id = 0; 1896 int tl_id = 0;
1907 1897
1908 for_each_sd_topology(tl) { 1898 for_each_sd_topology(tl) {
@@ -1912,7 +1902,7 @@ static struct sched_domain_topology_level
1912 for_each_cpu_and(j, tl->mask(i), cpu_map) { 1902 for_each_cpu_and(j, tl->mask(i), cpu_map) {
1913 unsigned long capacity; 1903 unsigned long capacity;
1914 1904
1915 capacity = arch_scale_cpu_capacity(NULL, j); 1905 capacity = arch_scale_cpu_capacity(j);
1916 1906
1917 if (capacity <= max_capacity) 1907 if (capacity <= max_capacity)
1918 continue; 1908 continue;
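
The topology.c hunks track the arch_scale_cpu_capacity() signature change:
the sched_domain argument (always NULL at these call sites) is gone and the
helper now takes only a CPU number. A call-site sketch (cpu_is_big() is a
made-up helper for illustration):

static bool cpu_is_big(int cpu, unsigned long threshold)
{
	/* Before this series: arch_scale_cpu_capacity(NULL, cpu) */
	return arch_scale_cpu_capacity(cpu) > threshold;
}
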
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 6eb1f8efd221..c1e566a114ca 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * Generic waiting primitives. 3 * Generic waiting primitives.
3 * 4 *
@@ -117,16 +118,12 @@ static void __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int
117 bookmark.func = NULL; 118 bookmark.func = NULL;
118 INIT_LIST_HEAD(&bookmark.entry); 119 INIT_LIST_HEAD(&bookmark.entry);
119 120
120 spin_lock_irqsave(&wq_head->lock, flags); 121 do {
121 nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive, wake_flags, key, &bookmark);
122 spin_unlock_irqrestore(&wq_head->lock, flags);
123
124 while (bookmark.flags & WQ_FLAG_BOOKMARK) {
125 spin_lock_irqsave(&wq_head->lock, flags); 122 spin_lock_irqsave(&wq_head->lock, flags);
126 nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive, 123 nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive,
127 wake_flags, key, &bookmark); 124 wake_flags, key, &bookmark);
128 spin_unlock_irqrestore(&wq_head->lock, flags); 125 spin_unlock_irqrestore(&wq_head->lock, flags);
129 } 126 } while (bookmark.flags & WQ_FLAG_BOOKMARK);
130} 127}
131 128
132/** 129/**
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
index c67c6d24adc2..45eba18a2898 100644
--- a/kernel/sched/wait_bit.c
+++ b/kernel/sched/wait_bit.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * The implementation of the wait_bit*() and related waiting APIs: 3 * The implementation of the wait_bit*() and related waiting APIs:
3 */ 4 */
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 811b4a86cdf6..dba52a7db5e8 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -609,7 +609,7 @@ static void seccomp_send_sigsys(int syscall, int reason)
609{ 609{
610 struct kernel_siginfo info; 610 struct kernel_siginfo info;
611 seccomp_init_siginfo(&info, syscall, reason); 611 seccomp_init_siginfo(&info, syscall, reason);
612 force_sig_info(SIGSYS, &info, current); 612 force_sig_info(&info);
613} 613}
614#endif /* CONFIG_SECCOMP_FILTER */ 614#endif /* CONFIG_SECCOMP_FILTER */
615 615
diff --git a/kernel/signal.c b/kernel/signal.c
index 62f9aea4a15a..dabe100d2091 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * linux/kernel/signal.c 3 * linux/kernel/signal.c
3 * 4 *
@@ -44,6 +45,7 @@
44#include <linux/posix-timers.h> 45#include <linux/posix-timers.h>
45#include <linux/livepatch.h> 46#include <linux/livepatch.h>
46#include <linux/cgroup.h> 47#include <linux/cgroup.h>
48#include <linux/audit.h>
47 49
48#define CREATE_TRACE_POINTS 50#define CREATE_TRACE_POINTS
49#include <trace/events/signal.h> 51#include <trace/events/signal.h>
@@ -53,7 +55,6 @@
53#include <asm/unistd.h> 55#include <asm/unistd.h>
54#include <asm/siginfo.h> 56#include <asm/siginfo.h>
55#include <asm/cacheflush.h> 57#include <asm/cacheflush.h>
56#include "audit.h" /* audit_signal_info() */
57 58
58/* 59/*
59 * SLAB caches for signal bits. 60 * SLAB caches for signal bits.
@@ -840,6 +841,7 @@ static int check_kill_permission(int sig, struct kernel_siginfo *info,
840 */ 841 */
841 if (!sid || sid == task_session(current)) 842 if (!sid || sid == task_session(current))
842 break; 843 break;
844 /* fall through */
843 default: 845 default:
844 return -EPERM; 846 return -EPERM;
845 } 847 }
@@ -1055,29 +1057,8 @@ static inline bool legacy_queue(struct sigpending *signals, int sig)
1055 return (sig < SIGRTMIN) && sigismember(&signals->signal, sig); 1057 return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);
1056} 1058}
1057 1059
1058#ifdef CONFIG_USER_NS
1059static inline void userns_fixup_signal_uid(struct kernel_siginfo *info, struct task_struct *t)
1060{
1061 if (current_user_ns() == task_cred_xxx(t, user_ns))
1062 return;
1063
1064 if (SI_FROMKERNEL(info))
1065 return;
1066
1067 rcu_read_lock();
1068 info->si_uid = from_kuid_munged(task_cred_xxx(t, user_ns),
1069 make_kuid(current_user_ns(), info->si_uid));
1070 rcu_read_unlock();
1071}
1072#else
1073static inline void userns_fixup_signal_uid(struct kernel_siginfo *info, struct task_struct *t)
1074{
1075 return;
1076}
1077#endif
1078
1079static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struct *t, 1060static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struct *t,
1080 enum pid_type type, int from_ancestor_ns) 1061 enum pid_type type, bool force)
1081{ 1062{
1082 struct sigpending *pending; 1063 struct sigpending *pending;
1083 struct sigqueue *q; 1064 struct sigqueue *q;
@@ -1087,8 +1068,7 @@ static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struc
1087 assert_spin_locked(&t->sighand->siglock); 1068 assert_spin_locked(&t->sighand->siglock);
1088 1069
1089 result = TRACE_SIGNAL_IGNORED; 1070 result = TRACE_SIGNAL_IGNORED;
1090 if (!prepare_signal(sig, t, 1071 if (!prepare_signal(sig, t, force))
1091 from_ancestor_ns || (info == SEND_SIG_PRIV)))
1092 goto ret; 1072 goto ret;
1093 1073
1094 pending = (type != PIDTYPE_PID) ? &t->signal->shared_pending : &t->pending; 1074 pending = (type != PIDTYPE_PID) ? &t->signal->shared_pending : &t->pending;
@@ -1133,7 +1113,11 @@ static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struc
1133 q->info.si_code = SI_USER; 1113 q->info.si_code = SI_USER;
1134 q->info.si_pid = task_tgid_nr_ns(current, 1114 q->info.si_pid = task_tgid_nr_ns(current,
1135 task_active_pid_ns(t)); 1115 task_active_pid_ns(t));
1136 q->info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); 1116 rcu_read_lock();
1117 q->info.si_uid =
1118 from_kuid_munged(task_cred_xxx(t, user_ns),
1119 current_uid());
1120 rcu_read_unlock();
1137 break; 1121 break;
1138 case (unsigned long) SEND_SIG_PRIV: 1122 case (unsigned long) SEND_SIG_PRIV:
1139 clear_siginfo(&q->info); 1123 clear_siginfo(&q->info);
@@ -1145,30 +1129,24 @@ static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struc
1145 break; 1129 break;
1146 default: 1130 default:
1147 copy_siginfo(&q->info, info); 1131 copy_siginfo(&q->info, info);
1148 if (from_ancestor_ns)
1149 q->info.si_pid = 0;
1150 break; 1132 break;
1151 } 1133 }
1152 1134 } else if (!is_si_special(info) &&
1153 userns_fixup_signal_uid(&q->info, t); 1135 sig >= SIGRTMIN && info->si_code != SI_USER) {
1154 1136 /*
1155 } else if (!is_si_special(info)) { 1137 * Queue overflow, abort. We may abort if the
1156 if (sig >= SIGRTMIN && info->si_code != SI_USER) { 1138 * signal was rt and sent by user using something
1157 /* 1139 * other than kill().
1158 * Queue overflow, abort. We may abort if the 1140 */
1159 * signal was rt and sent by user using something 1141 result = TRACE_SIGNAL_OVERFLOW_FAIL;
1160 * other than kill(). 1142 ret = -EAGAIN;
1161 */ 1143 goto ret;
1162 result = TRACE_SIGNAL_OVERFLOW_FAIL; 1144 } else {
1163 ret = -EAGAIN; 1145 /*
1164 goto ret; 1146 * This is a silent loss of information. We still
1165 } else { 1147 * send the signal, but the *info bits are lost.
1166 /* 1148 */
1167 * This is a silent loss of information. We still 1149 result = TRACE_SIGNAL_LOSE_INFO;
1168 * send the signal, but the *info bits are lost.
1169 */
1170 result = TRACE_SIGNAL_LOSE_INFO;
1171 }
1172 } 1150 }
1173 1151
1174out_set: 1152out_set:
@@ -1195,17 +1173,62 @@ ret:
1195 return ret; 1173 return ret;
1196} 1174}
1197 1175
1176static inline bool has_si_pid_and_uid(struct kernel_siginfo *info)
1177{
1178 bool ret = false;
1179 switch (siginfo_layout(info->si_signo, info->si_code)) {
1180 case SIL_KILL:
1181 case SIL_CHLD:
1182 case SIL_RT:
1183 ret = true;
1184 break;
1185 case SIL_TIMER:
1186 case SIL_POLL:
1187 case SIL_FAULT:
1188 case SIL_FAULT_MCEERR:
1189 case SIL_FAULT_BNDERR:
1190 case SIL_FAULT_PKUERR:
1191 case SIL_SYS:
1192 ret = false;
1193 break;
1194 }
1195 return ret;
1196}
1197
1198static int send_signal(int sig, struct kernel_siginfo *info, struct task_struct *t, 1198static int send_signal(int sig, struct kernel_siginfo *info, struct task_struct *t,
1199 enum pid_type type) 1199 enum pid_type type)
1200{ 1200{
1201 int from_ancestor_ns = 0; 1201 /* Should SIGKILL or SIGSTOP be received by a pid namespace init? */
1202 bool force = false;
1202 1203
1203#ifdef CONFIG_PID_NS 1204 if (info == SEND_SIG_NOINFO) {
1204 from_ancestor_ns = si_fromuser(info) && 1205 /* Force if sent from an ancestor pid namespace */
1205 !task_pid_nr_ns(current, task_active_pid_ns(t)); 1206 force = !task_pid_nr_ns(current, task_active_pid_ns(t));
1206#endif 1207 } else if (info == SEND_SIG_PRIV) {
1208 /* Don't ignore kernel generated signals */
1209 force = true;
1210 } else if (has_si_pid_and_uid(info)) {
 1211 /* SIGKILL and SIGSTOP are special or have ids */
1212 struct user_namespace *t_user_ns;
1213
1214 rcu_read_lock();
1215 t_user_ns = task_cred_xxx(t, user_ns);
1216 if (current_user_ns() != t_user_ns) {
1217 kuid_t uid = make_kuid(current_user_ns(), info->si_uid);
1218 info->si_uid = from_kuid_munged(t_user_ns, uid);
1219 }
1220 rcu_read_unlock();
1207 1221
1208 return __send_signal(sig, info, t, type, from_ancestor_ns); 1222 /* A kernel generated signal? */
1223 force = (info->si_code == SI_KERNEL);
1224
1225 /* From an ancestor pid namespace? */
1226 if (!task_pid_nr_ns(current, task_active_pid_ns(t))) {
1227 info->si_pid = 0;
1228 force = true;
1229 }
1230 }
1231 return __send_signal(sig, info, t, type, force);
1209} 1232}
1210 1233
1211static void print_fatal_signal(int signr) 1234static void print_fatal_signal(int signr)
@@ -1272,12 +1295,13 @@ int do_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p
1272 * We don't want to have recursive SIGSEGV's etc, for example, 1295 * We don't want to have recursive SIGSEGV's etc, for example,
1273 * that is why we also clear SIGNAL_UNKILLABLE. 1296 * that is why we also clear SIGNAL_UNKILLABLE.
1274 */ 1297 */
1275int 1298static int
1276force_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *t) 1299force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t)
1277{ 1300{
1278 unsigned long int flags; 1301 unsigned long int flags;
1279 int ret, blocked, ignored; 1302 int ret, blocked, ignored;
1280 struct k_sigaction *action; 1303 struct k_sigaction *action;
1304 int sig = info->si_signo;
1281 1305
1282 spin_lock_irqsave(&t->sighand->siglock, flags); 1306 spin_lock_irqsave(&t->sighand->siglock, flags);
1283 action = &t->sighand->action[sig-1]; 1307 action = &t->sighand->action[sig-1];
@@ -1302,6 +1326,11 @@ force_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *t)
1302 return ret; 1326 return ret;
1303} 1327}
1304 1328
1329int force_sig_info(struct kernel_siginfo *info)
1330{
1331 return force_sig_info_to_task(info, current);
1332}
1333
1305/* 1334/*
1306 * Nuke all other threads in the group. 1335 * Nuke all other threads in the group.
1307 */ 1336 */
@@ -1438,13 +1467,44 @@ static inline bool kill_as_cred_perm(const struct cred *cred,
1438 uid_eq(cred->uid, pcred->uid); 1467 uid_eq(cred->uid, pcred->uid);
1439} 1468}
1440 1469
1441/* like kill_pid_info(), but doesn't use uid/euid of "current" */ 1470/*
1442int kill_pid_info_as_cred(int sig, struct kernel_siginfo *info, struct pid *pid, 1471 * The usb asyncio usage of siginfo is wrong. The glibc support
1443 const struct cred *cred) 1472 * for asyncio which uses SI_ASYNCIO assumes the layout is SIL_RT.
1473 * AKA after the generic fields:
1474 * kernel_pid_t si_pid;
1475 * kernel_uid32_t si_uid;
1476 * sigval_t si_value;
1477 *
1478 * Unfortunately when usb generates SI_ASYNCIO it assumes the layout
1479 * after the generic fields is:
1480 * void __user *si_addr;
1481 *
1482 * This is a practical problem when there is a 64bit big endian kernel
1483 * and a 32bit userspace. As the 32bit address will encoded in the low
1484 * 32bits of the pointer. Those low 32bits will be stored at higher
1485 * address than appear in a 32 bit pointer. So userspace will not
1486 * see the address it was expecting for it's completions.
1487 *
1488 * There is nothing in the encoding that can allow
1489 * copy_siginfo_to_user32 to detect this confusion of formats, so
1490 * handle this by requiring the caller of kill_pid_usb_asyncio to
1491 * notice when this situration takes place and to store the 32bit
1492 * pointer in sival_int, instead of sival_addr of the sigval_t addr
1493 * parameter.
1494 */
1495int kill_pid_usb_asyncio(int sig, int errno, sigval_t addr,
1496 struct pid *pid, const struct cred *cred)
1444{ 1497{
1445 int ret = -EINVAL; 1498 struct kernel_siginfo info;
1446 struct task_struct *p; 1499 struct task_struct *p;
1447 unsigned long flags; 1500 unsigned long flags;
1501 int ret = -EINVAL;
1502
1503 clear_siginfo(&info);
1504 info.si_signo = sig;
1505 info.si_errno = errno;
1506 info.si_code = SI_ASYNCIO;
1507 *((sigval_t *)&info.si_pid) = addr;
1448 1508
1449 if (!valid_signal(sig)) 1509 if (!valid_signal(sig))
1450 return ret; 1510 return ret;
@@ -1455,17 +1515,17 @@ int kill_pid_info_as_cred(int sig, struct kernel_siginfo *info, struct pid *pid,
1455 ret = -ESRCH; 1515 ret = -ESRCH;
1456 goto out_unlock; 1516 goto out_unlock;
1457 } 1517 }
1458 if (si_fromuser(info) && !kill_as_cred_perm(cred, p)) { 1518 if (!kill_as_cred_perm(cred, p)) {
1459 ret = -EPERM; 1519 ret = -EPERM;
1460 goto out_unlock; 1520 goto out_unlock;
1461 } 1521 }
1462 ret = security_task_kill(p, info, sig, cred); 1522 ret = security_task_kill(p, &info, sig, cred);
1463 if (ret) 1523 if (ret)
1464 goto out_unlock; 1524 goto out_unlock;
1465 1525
1466 if (sig) { 1526 if (sig) {
1467 if (lock_task_sighand(p, &flags)) { 1527 if (lock_task_sighand(p, &flags)) {
1468 ret = __send_signal(sig, info, p, PIDTYPE_TGID, 0); 1528 ret = __send_signal(sig, &info, p, PIDTYPE_TGID, false);
1469 unlock_task_sighand(p, &flags); 1529 unlock_task_sighand(p, &flags);
1470 } else 1530 } else
1471 ret = -ESRCH; 1531 ret = -ESRCH;
@@ -1474,7 +1534,7 @@ out_unlock:
1474 rcu_read_unlock(); 1534 rcu_read_unlock();
1475 return ret; 1535 return ret;
1476} 1536}
1477EXPORT_SYMBOL_GPL(kill_pid_info_as_cred); 1537EXPORT_SYMBOL_GPL(kill_pid_usb_asyncio);
1478 1538
1479/* 1539/*
1480 * kill_something_info() interprets pid in interesting ways just like kill(2). 1540 * kill_something_info() interprets pid in interesting ways just like kill(2).
@@ -1550,9 +1610,17 @@ send_sig(int sig, struct task_struct *p, int priv)
1550} 1610}
1551EXPORT_SYMBOL(send_sig); 1611EXPORT_SYMBOL(send_sig);
1552 1612
1553void force_sig(int sig, struct task_struct *p) 1613void force_sig(int sig)
1554{ 1614{
1555 force_sig_info(sig, SEND_SIG_PRIV, p); 1615 struct kernel_siginfo info;
1616
1617 clear_siginfo(&info);
1618 info.si_signo = sig;
1619 info.si_errno = 0;
1620 info.si_code = SI_KERNEL;
1621 info.si_pid = 0;
1622 info.si_uid = 0;
1623 force_sig_info(&info);
1556} 1624}
1557EXPORT_SYMBOL(force_sig); 1625EXPORT_SYMBOL(force_sig);
1558 1626
@@ -1562,18 +1630,20 @@ EXPORT_SYMBOL(force_sig);
1562 * the problem was already a SIGSEGV, we'll want to 1630 * the problem was already a SIGSEGV, we'll want to
1563 * make sure we don't even try to deliver the signal.. 1631 * make sure we don't even try to deliver the signal..
1564 */ 1632 */
1565void force_sigsegv(int sig, struct task_struct *p) 1633void force_sigsegv(int sig)
1566{ 1634{
1635 struct task_struct *p = current;
1636
1567 if (sig == SIGSEGV) { 1637 if (sig == SIGSEGV) {
1568 unsigned long flags; 1638 unsigned long flags;
1569 spin_lock_irqsave(&p->sighand->siglock, flags); 1639 spin_lock_irqsave(&p->sighand->siglock, flags);
1570 p->sighand->action[sig - 1].sa.sa_handler = SIG_DFL; 1640 p->sighand->action[sig - 1].sa.sa_handler = SIG_DFL;
1571 spin_unlock_irqrestore(&p->sighand->siglock, flags); 1641 spin_unlock_irqrestore(&p->sighand->siglock, flags);
1572 } 1642 }
1573 force_sig(SIGSEGV, p); 1643 force_sig(SIGSEGV);
1574} 1644}
1575 1645
1576int force_sig_fault(int sig, int code, void __user *addr 1646int force_sig_fault_to_task(int sig, int code, void __user *addr
1577 ___ARCH_SI_TRAPNO(int trapno) 1647 ___ARCH_SI_TRAPNO(int trapno)
1578 ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr) 1648 ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)
1579 , struct task_struct *t) 1649 , struct task_struct *t)
@@ -1593,7 +1663,16 @@ int force_sig_fault(int sig, int code, void __user *addr
1593 info.si_flags = flags; 1663 info.si_flags = flags;
1594 info.si_isr = isr; 1664 info.si_isr = isr;
1595#endif 1665#endif
1596 return force_sig_info(info.si_signo, &info, t); 1666 return force_sig_info_to_task(&info, t);
1667}
1668
1669int force_sig_fault(int sig, int code, void __user *addr
1670 ___ARCH_SI_TRAPNO(int trapno)
1671 ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr))
1672{
1673 return force_sig_fault_to_task(sig, code, addr
1674 ___ARCH_SI_TRAPNO(trapno)
1675 ___ARCH_SI_IA64(imm, flags, isr), current);
1597} 1676}
1598 1677
1599int send_sig_fault(int sig, int code, void __user *addr 1678int send_sig_fault(int sig, int code, void __user *addr
@@ -1619,7 +1698,7 @@ int send_sig_fault(int sig, int code, void __user *addr
1619 return send_sig_info(info.si_signo, &info, t); 1698 return send_sig_info(info.si_signo, &info, t);
1620} 1699}
1621 1700
1622int force_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *t) 1701int force_sig_mceerr(int code, void __user *addr, short lsb)
1623{ 1702{
1624 struct kernel_siginfo info; 1703 struct kernel_siginfo info;
1625 1704
@@ -1630,7 +1709,7 @@ int force_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct
1630 info.si_code = code; 1709 info.si_code = code;
1631 info.si_addr = addr; 1710 info.si_addr = addr;
1632 info.si_addr_lsb = lsb; 1711 info.si_addr_lsb = lsb;
1633 return force_sig_info(info.si_signo, &info, t); 1712 return force_sig_info(&info);
1634} 1713}
1635 1714
1636int send_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *t) 1715int send_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *t)
@@ -1659,7 +1738,7 @@ int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper)
1659 info.si_addr = addr; 1738 info.si_addr = addr;
1660 info.si_lower = lower; 1739 info.si_lower = lower;
1661 info.si_upper = upper; 1740 info.si_upper = upper;
1662 return force_sig_info(info.si_signo, &info, current); 1741 return force_sig_info(&info);
1663} 1742}
1664 1743
1665#ifdef SEGV_PKUERR 1744#ifdef SEGV_PKUERR
@@ -1673,7 +1752,7 @@ int force_sig_pkuerr(void __user *addr, u32 pkey)
1673 info.si_code = SEGV_PKUERR; 1752 info.si_code = SEGV_PKUERR;
1674 info.si_addr = addr; 1753 info.si_addr = addr;
1675 info.si_pkey = pkey; 1754 info.si_pkey = pkey;
1676 return force_sig_info(info.si_signo, &info, current); 1755 return force_sig_info(&info);
1677} 1756}
1678#endif 1757#endif
1679 1758
@@ -1689,7 +1768,7 @@ int force_sig_ptrace_errno_trap(int errno, void __user *addr)
1689 info.si_errno = errno; 1768 info.si_errno = errno;
1690 info.si_code = TRAP_HWBKPT; 1769 info.si_code = TRAP_HWBKPT;
1691 info.si_addr = addr; 1770 info.si_addr = addr;
1692 return force_sig_info(info.si_signo, &info, current); 1771 return force_sig_info(&info);
1693} 1772}
1694 1773
1695int kill_pgrp(struct pid *pid, int sig, int priv) 1774int kill_pgrp(struct pid *pid, int sig, int priv)
@@ -1802,6 +1881,14 @@ ret:
1802 return ret; 1881 return ret;
1803} 1882}
1804 1883
1884static void do_notify_pidfd(struct task_struct *task)
1885{
1886 struct pid *pid;
1887
1888 pid = task_pid(task);
1889 wake_up_all(&pid->wait_pidfd);
1890}
1891
1805/* 1892/*
1806 * Let a parent know about the death of a child. 1893 * Let a parent know about the death of a child.
1807 * For a stopped/continued status change, use do_notify_parent_cldstop instead. 1894 * For a stopped/continued status change, use do_notify_parent_cldstop instead.
@@ -1825,6 +1912,9 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
1825 BUG_ON(!tsk->ptrace && 1912 BUG_ON(!tsk->ptrace &&
1826 (tsk->group_leader != tsk || !thread_group_empty(tsk))); 1913 (tsk->group_leader != tsk || !thread_group_empty(tsk)));
1827 1914
1915 /* Wake up all pidfd waiters */
1916 do_notify_pidfd(tsk);
1917
1828 if (sig != SIGCHLD) { 1918 if (sig != SIGCHLD) {
1829 /* 1919 /*
1830 * This is only possible if parent == real_parent. 1920 * This is only possible if parent == real_parent.
@@ -2112,6 +2202,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t
2112 preempt_enable_no_resched(); 2202 preempt_enable_no_resched();
2113 cgroup_enter_frozen(); 2203 cgroup_enter_frozen();
2114 freezable_schedule(); 2204 freezable_schedule();
2205 cgroup_leave_frozen(true);
2115 } else { 2206 } else {
2116 /* 2207 /*
2117 * By the time we got the lock, our tracer went away. 2208 * By the time we got the lock, our tracer went away.
@@ -2482,6 +2573,8 @@ relock:
2482 if (signal_group_exit(signal)) { 2573 if (signal_group_exit(signal)) {
2483 ksig->info.si_signo = signr = SIGKILL; 2574 ksig->info.si_signo = signr = SIGKILL;
2484 sigdelset(&current->pending.signal, SIGKILL); 2575 sigdelset(&current->pending.signal, SIGKILL);
2576 trace_signal_deliver(SIGKILL, SEND_SIG_NOINFO,
2577 &sighand->action[SIGKILL - 1]);
2485 recalc_sigpending(); 2578 recalc_sigpending();
2486 goto fatal; 2579 goto fatal;
2487 } 2580 }
@@ -2671,7 +2764,7 @@ static void signal_delivered(struct ksignal *ksig, int stepping)
2671void signal_setup_done(int failed, struct ksignal *ksig, int stepping) 2764void signal_setup_done(int failed, struct ksignal *ksig, int stepping)
2672{ 2765{
2673 if (failed) 2766 if (failed)
2674 force_sigsegv(ksig->sig, current); 2767 force_sigsegv(ksig->sig);
2675 else 2768 else
2676 signal_delivered(ksig, stepping); 2769 signal_delivered(ksig, stepping);
2677} 2770}
@@ -2907,7 +3000,8 @@ EXPORT_SYMBOL(set_compat_user_sigmask);
2907 * This is useful for syscalls such as ppoll, pselect, io_pgetevents and 3000 * This is useful for syscalls such as ppoll, pselect, io_pgetevents and
2908 * epoll_pwait where a new sigmask is passed in from userland for the syscalls. 3001 * epoll_pwait where a new sigmask is passed in from userland for the syscalls.
2909 */ 3002 */
2910void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved) 3003void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved,
3004 bool interrupted)
2911{ 3005{
2912 3006
2913 if (!usigmask) 3007 if (!usigmask)
@@ -2917,7 +3011,7 @@ void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved)
2917 * Restoring sigmask here can lead to delivering signals that the above 3011 * Restoring sigmask here can lead to delivering signals that the above
2918 * syscalls are intended to block because of the sigmask passed in. 3012 * syscalls are intended to block because of the sigmask passed in.
2919 */ 3013 */
2920 if (signal_pending(current)) { 3014 if (interrupted) {
2921 current->saved_sigmask = *sigsaved; 3015 current->saved_sigmask = *sigsaved;
2922 set_restore_sigmask(); 3016 set_restore_sigmask();
2923 return; 3017 return;
@@ -3616,12 +3710,11 @@ static struct pid *pidfd_to_pid(const struct file *file)
3616} 3710}
3617 3711
3618/** 3712/**
3619 * sys_pidfd_send_signal - send a signal to a process through a task file 3713 * sys_pidfd_send_signal - Signal a process through a pidfd
3620 * descriptor 3714 * @pidfd: file descriptor of the process
3621 * @pidfd: the file descriptor of the process 3715 * @sig: signal to send
3622 * @sig: signal to be sent 3716 * @info: signal info
3623 * @info: the signal info 3717 * @flags: future flags
3624 * @flags: future flags to be passed
3625 * 3718 *
3626 * The syscall currently only signals via PIDTYPE_PID which covers 3719 * The syscall currently only signals via PIDTYPE_PID which covers
 3627 * kill(<positive-pid>, <signal>). It does not signal threads or process 3720
@@ -4472,6 +4565,28 @@ static inline void siginfo_buildtime_checks(void)
4472 CHECK_OFFSET(si_syscall); 4565 CHECK_OFFSET(si_syscall);
4473 CHECK_OFFSET(si_arch); 4566 CHECK_OFFSET(si_arch);
4474#undef CHECK_OFFSET 4567#undef CHECK_OFFSET
4568
4569 /* usb asyncio */
4570 BUILD_BUG_ON(offsetof(struct siginfo, si_pid) !=
4571 offsetof(struct siginfo, si_addr));
4572 if (sizeof(int) == sizeof(void __user *)) {
4573 BUILD_BUG_ON(sizeof_field(struct siginfo, si_pid) !=
4574 sizeof(void __user *));
4575 } else {
4576 BUILD_BUG_ON((sizeof_field(struct siginfo, si_pid) +
4577 sizeof_field(struct siginfo, si_uid)) !=
4578 sizeof(void __user *));
4579 BUILD_BUG_ON(offsetofend(struct siginfo, si_pid) !=
4580 offsetof(struct siginfo, si_uid));
4581 }
4582#ifdef CONFIG_COMPAT
4583 BUILD_BUG_ON(offsetof(struct compat_siginfo, si_pid) !=
4584 offsetof(struct compat_siginfo, si_addr));
4585 BUILD_BUG_ON(sizeof_field(struct compat_siginfo, si_pid) !=
4586 sizeof(compat_uptr_t));
4587 BUILD_BUG_ON(sizeof_field(struct compat_siginfo, si_pid) !=
4588 sizeof_field(struct siginfo, si_pid));
4589#endif
4475} 4590}
4476 4591
4477void __init signals_init(void) 4592void __init signals_init(void)
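
Much of the signal.c churn follows from force_sig_info() taking a single
kernel_siginfo argument and always acting on current, with the signal number
read from si_signo. A caller-side sketch of the new convention
(deliver_fault() and its field values are illustrative only, not from a real
fault path):

static void deliver_fault(void __user *addr)
{
	struct kernel_siginfo info;

	clear_siginfo(&info);
	info.si_signo = SIGSEGV;
	info.si_errno = 0;
	info.si_code  = SEGV_MAPERR;
	info.si_addr  = addr;

	/* Before this series: force_sig_info(SIGSEGV, &info, current); */
	force_sig_info(&info);
}

Callers that need a different target task go through helpers such as
force_sig_fault_to_task() instead.
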
diff --git a/kernel/smp.c b/kernel/smp.c
index f4cf1b0bb3b8..616d4d114847 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * Generic helpers for smp ipi calls 3 * Generic helpers for smp ipi calls
3 * 4 *
@@ -33,7 +34,7 @@ struct call_function_data {
33 cpumask_var_t cpumask_ipi; 34 cpumask_var_t cpumask_ipi;
34}; 35};
35 36
36static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data); 37static DEFINE_PER_CPU_ALIGNED(struct call_function_data, cfd_data);
37 38
38static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue); 39static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue);
39 40
@@ -486,13 +487,11 @@ EXPORT_SYMBOL(smp_call_function_many);
486 * You must not call this function with disabled interrupts or from a 487 * You must not call this function with disabled interrupts or from a
487 * hardware interrupt handler or from a bottom half handler. 488 * hardware interrupt handler or from a bottom half handler.
488 */ 489 */
489int smp_call_function(smp_call_func_t func, void *info, int wait) 490void smp_call_function(smp_call_func_t func, void *info, int wait)
490{ 491{
491 preempt_disable(); 492 preempt_disable();
492 smp_call_function_many(cpu_online_mask, func, info, wait); 493 smp_call_function_many(cpu_online_mask, func, info, wait);
493 preempt_enable(); 494 preempt_enable();
494
495 return 0;
496} 495}
497EXPORT_SYMBOL(smp_call_function); 496EXPORT_SYMBOL(smp_call_function);
498 497
@@ -593,18 +592,16 @@ void __init smp_init(void)
593 * early_boot_irqs_disabled is set. Use local_irq_save/restore() instead 592 * early_boot_irqs_disabled is set. Use local_irq_save/restore() instead
594 * of local_irq_disable/enable(). 593 * of local_irq_disable/enable().
595 */ 594 */
596int on_each_cpu(void (*func) (void *info), void *info, int wait) 595void on_each_cpu(void (*func) (void *info), void *info, int wait)
597{ 596{
598 unsigned long flags; 597 unsigned long flags;
599 int ret = 0;
600 598
601 preempt_disable(); 599 preempt_disable();
602 ret = smp_call_function(func, info, wait); 600 smp_call_function(func, info, wait);
603 local_irq_save(flags); 601 local_irq_save(flags);
604 func(info); 602 func(info);
605 local_irq_restore(flags); 603 local_irq_restore(flags);
606 preempt_enable(); 604 preempt_enable();
607 return ret;
608} 605}
609EXPORT_SYMBOL(on_each_cpu); 606EXPORT_SYMBOL(on_each_cpu);
610 607
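
smp_call_function() and on_each_cpu() used to return a hard-coded 0; they now
return void, so callers simply drop the value. A sketch of an adapted caller
(flush_one() and flush_all() are made-up names for illustration):

static void flush_one(void *info)
{
	/* per-CPU work runs here on every online CPU, and locally */
}

static void flush_all(void)
{
	/* Before this series: int ret = on_each_cpu(flush_one, NULL, 1); */
	on_each_cpu(flush_one, NULL, 1);
}
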
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index c230c2dd48e1..2efe1e206167 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * Common SMP CPU bringup/teardown functions 3 * Common SMP CPU bringup/teardown functions
3 */ 4 */
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 2c3382378d94..0427a86743a4 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -1,10 +1,9 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * linux/kernel/softirq.c 3 * linux/kernel/softirq.c
3 * 4 *
4 * Copyright (C) 1992 Linus Torvalds 5 * Copyright (C) 1992 Linus Torvalds
5 * 6 *
6 * Distribute under GPLv2.
7 *
8 * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) 7 * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903)
9 */ 8 */
10 9
@@ -650,7 +649,7 @@ static int takeover_tasklets(unsigned int cpu)
650 /* Find end, append list for that CPU. */ 649 /* Find end, append list for that CPU. */
651 if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) { 650 if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) {
652 *__this_cpu_read(tasklet_vec.tail) = per_cpu(tasklet_vec, cpu).head; 651 *__this_cpu_read(tasklet_vec.tail) = per_cpu(tasklet_vec, cpu).head;
653 this_cpu_write(tasklet_vec.tail, per_cpu(tasklet_vec, cpu).tail); 652 __this_cpu_write(tasklet_vec.tail, per_cpu(tasklet_vec, cpu).tail);
654 per_cpu(tasklet_vec, cpu).head = NULL; 653 per_cpu(tasklet_vec, cpu).head = NULL;
655 per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head; 654 per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head;
656 } 655 }
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index 27bafc1e271e..e6a02b274b73 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * kernel/stacktrace.c 3 * kernel/stacktrace.c
3 * 4 *
@@ -206,7 +207,7 @@ int stack_trace_save_tsk_reliable(struct task_struct *tsk, unsigned long *store,
206 207
207 ret = arch_stack_walk_reliable(consume_entry, &c, tsk); 208 ret = arch_stack_walk_reliable(consume_entry, &c, tsk);
208 put_task_stack(tsk); 209 put_task_stack(tsk);
209 return ret; 210 return ret ? ret : c.len;
210} 211}
211#endif 212#endif
212 213
@@ -227,7 +228,7 @@ unsigned int stack_trace_save_user(unsigned long *store, unsigned int size)
227 }; 228 };
228 229
229 /* Trace user stack if not a kernel thread */ 230 /* Trace user stack if not a kernel thread */
230 if (!current->mm) 231 if (current->flags & PF_KTHREAD)
231 return 0; 232 return 0;
232 233
233 arch_stack_walk_user(consume_entry, &c, task_pt_regs(current)); 234 arch_stack_walk_user(consume_entry, &c, task_pt_regs(current));
@@ -254,14 +255,6 @@ save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace)
254 WARN_ONCE(1, KERN_INFO "save_stack_trace_regs() not implemented yet.\n"); 255 WARN_ONCE(1, KERN_INFO "save_stack_trace_regs() not implemented yet.\n");
255} 256}
256 257
257__weak int
258save_stack_trace_tsk_reliable(struct task_struct *tsk,
259 struct stack_trace *trace)
260{
261 WARN_ONCE(1, KERN_INFO "save_stack_tsk_reliable() not implemented yet.\n");
262 return -ENOSYS;
263}
264
265/** 258/**
266 * stack_trace_save - Save a stack trace into a storage array 259 * stack_trace_save - Save a stack trace into a storage array
267 * @store: Pointer to storage array 260 * @store: Pointer to storage array
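
stack_trace_save_tsk_reliable() now returns the number of saved entries on
success rather than 0, and the user-stack variant tests PF_KTHREAD instead of
current->mm. A sketch of a caller using the returned count (dump_reliable()
is a made-up helper):

static void dump_reliable(struct task_struct *tsk)
{
	unsigned long entries[32];
	int nr;

	nr = stack_trace_save_tsk_reliable(tsk, entries, ARRAY_SIZE(entries));
	if (nr < 0)
		return;		/* the stack could not be walked reliably */
	stack_trace_print(entries, nr, 0);
}
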
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 7231fb5953fc..b4f83f7bdf86 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
1/* 2/*
2 * kernel/stop_machine.c 3 * kernel/stop_machine.c
3 * 4 *
@@ -5,8 +6,6 @@
5 * Copyright (C) 2008, 2005 Rusty Russell rusty@rustcorp.com.au 6 * Copyright (C) 2008, 2005 Rusty Russell rusty@rustcorp.com.au
6 * Copyright (C) 2010 SUSE Linux Products GmbH 7 * Copyright (C) 2010 SUSE Linux Products GmbH
7 * Copyright (C) 2010 Tejun Heo <tj@kernel.org> 8 * Copyright (C) 2010 Tejun Heo <tj@kernel.org>
8 *
9 * This file is released under the GPLv2 and any later version.
10 */ 9 */
11#include <linux/completion.h> 10#include <linux/completion.h>
12#include <linux/cpu.h> 11#include <linux/cpu.h>
@@ -178,12 +177,18 @@ static void ack_state(struct multi_stop_data *msdata)
178 set_state(msdata, msdata->state + 1); 177 set_state(msdata, msdata->state + 1);
179} 178}
180 179
180void __weak stop_machine_yield(const struct cpumask *cpumask)
181{
182 cpu_relax();
183}
184
181/* This is the cpu_stop function which stops the CPU. */ 185/* This is the cpu_stop function which stops the CPU. */
182static int multi_cpu_stop(void *data) 186static int multi_cpu_stop(void *data)
183{ 187{
184 struct multi_stop_data *msdata = data; 188 struct multi_stop_data *msdata = data;
185 enum multi_stop_state curstate = MULTI_STOP_NONE; 189 enum multi_stop_state curstate = MULTI_STOP_NONE;
186 int cpu = smp_processor_id(), err = 0; 190 int cpu = smp_processor_id(), err = 0;
191 const struct cpumask *cpumask;
187 unsigned long flags; 192 unsigned long flags;
188 bool is_active; 193 bool is_active;
189 194
@@ -193,15 +198,18 @@ static int multi_cpu_stop(void *data)
193 */ 198 */
194 local_save_flags(flags); 199 local_save_flags(flags);
195 200
196 if (!msdata->active_cpus) 201 if (!msdata->active_cpus) {
197 is_active = cpu == cpumask_first(cpu_online_mask); 202 cpumask = cpu_online_mask;
198 else 203 is_active = cpu == cpumask_first(cpumask);
199 is_active = cpumask_test_cpu(cpu, msdata->active_cpus); 204 } else {
205 cpumask = msdata->active_cpus;
206 is_active = cpumask_test_cpu(cpu, cpumask);
207 }
200 208
201 /* Simple state machine */ 209 /* Simple state machine */
202 do { 210 do {
203 /* Chill out and ensure we re-read multi_stop_state. */ 211 /* Chill out and ensure we re-read multi_stop_state. */
204 cpu_relax_yield(); 212 stop_machine_yield(cpumask);
205 if (msdata->state != curstate) { 213 if (msdata->state != curstate) {
206 curstate = msdata->state; 214 curstate = msdata->state;
207 switch (curstate) { 215 switch (curstate) {
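
The new __weak stop_machine_yield() hook above lets an architecture replace the plain cpu_relax() spin with a directed yield toward the CPUs taking part in the stop-machine operation, which multi_cpu_stop() now passes down. A hedged sketch of what an arch override could look like; smp_yield_to() is an invented placeholder for an arch- or hypervisor-specific yield hint, not a real API.

	void stop_machine_yield(const struct cpumask *cpumask)
	{
		int next = cpumask_next(smp_processor_id(), cpumask);

		if (next >= nr_cpu_ids)
			next = cpumask_first(cpumask);
		smp_yield_to(next);	/* invented: hint "run that CPU instead" */
	}
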
diff --git a/kernel/sys.c b/kernel/sys.c
index bdbfe8d37418..2969304c29fe 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1882,13 +1882,14 @@ exit_err:
1882} 1882}
1883 1883
1884/* 1884/*
1885 * Check arithmetic relations of passed addresses.
1886 *
1885 * WARNING: we don't require any capability here so be very careful 1887 * WARNING: we don't require any capability here so be very careful
1886 * in what is allowed for modification from userspace. 1888 * in what is allowed for modification from userspace.
1887 */ 1889 */
1888static int validate_prctl_map(struct prctl_mm_map *prctl_map) 1890static int validate_prctl_map_addr(struct prctl_mm_map *prctl_map)
1889{ 1891{
1890 unsigned long mmap_max_addr = TASK_SIZE; 1892 unsigned long mmap_max_addr = TASK_SIZE;
1891 struct mm_struct *mm = current->mm;
1892 int error = -EINVAL, i; 1893 int error = -EINVAL, i;
1893 1894
1894 static const unsigned char offsets[] = { 1895 static const unsigned char offsets[] = {
@@ -1949,24 +1950,6 @@ static int validate_prctl_map(struct prctl_mm_map *prctl_map)
1949 prctl_map->start_data)) 1950 prctl_map->start_data))
1950 goto out; 1951 goto out;
1951 1952
1952 /*
1953 * Someone is trying to cheat the auxv vector.
1954 */
1955 if (prctl_map->auxv_size) {
1956 if (!prctl_map->auxv || prctl_map->auxv_size > sizeof(mm->saved_auxv))
1957 goto out;
1958 }
1959
1960 /*
1961 * Finally, make sure the caller has the rights to
1962 * change /proc/pid/exe link: only local sys admin should
1963 * be allowed to.
1964 */
1965 if (prctl_map->exe_fd != (u32)-1) {
1966 if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN))
1967 goto out;
1968 }
1969
1970 error = 0; 1953 error = 0;
1971out: 1954out:
1972 return error; 1955 return error;
@@ -1993,11 +1976,18 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data
1993 if (copy_from_user(&prctl_map, addr, sizeof(prctl_map))) 1976 if (copy_from_user(&prctl_map, addr, sizeof(prctl_map)))
1994 return -EFAULT; 1977 return -EFAULT;
1995 1978
1996 error = validate_prctl_map(&prctl_map); 1979 error = validate_prctl_map_addr(&prctl_map);
1997 if (error) 1980 if (error)
1998 return error; 1981 return error;
1999 1982
2000 if (prctl_map.auxv_size) { 1983 if (prctl_map.auxv_size) {
1984 /*
1985 * Someone is trying to cheat the auxv vector.
1986 */
1987 if (!prctl_map.auxv ||
1988 prctl_map.auxv_size > sizeof(mm->saved_auxv))
1989 return -EINVAL;
1990
2001 memset(user_auxv, 0, sizeof(user_auxv)); 1991 memset(user_auxv, 0, sizeof(user_auxv));
2002 if (copy_from_user(user_auxv, 1992 if (copy_from_user(user_auxv,
2003 (const void __user *)prctl_map.auxv, 1993 (const void __user *)prctl_map.auxv,
@@ -2010,6 +2000,14 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data
2010 } 2000 }
2011 2001
2012 if (prctl_map.exe_fd != (u32)-1) { 2002 if (prctl_map.exe_fd != (u32)-1) {
2003 /*
2004 * Make sure the caller has the rights to
2005 * change /proc/pid/exe link: only local sys admin should
2006 * be allowed to.
2007 */
2008 if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN))
2009 return -EINVAL;
2010
2013 error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd); 2011 error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd);
2014 if (error) 2012 if (error)
2015 return error; 2013 return error;
@@ -2097,7 +2095,11 @@ static int prctl_set_mm(int opt, unsigned long addr,
2097 unsigned long arg4, unsigned long arg5) 2095 unsigned long arg4, unsigned long arg5)
2098{ 2096{
2099 struct mm_struct *mm = current->mm; 2097 struct mm_struct *mm = current->mm;
2100 struct prctl_mm_map prctl_map; 2098 struct prctl_mm_map prctl_map = {
2099 .auxv = NULL,
2100 .auxv_size = 0,
2101 .exe_fd = -1,
2102 };
2101 struct vm_area_struct *vma; 2103 struct vm_area_struct *vma;
2102 int error; 2104 int error;
2103 2105
@@ -2125,9 +2127,15 @@ static int prctl_set_mm(int opt, unsigned long addr,
2125 2127
2126 error = -EINVAL; 2128 error = -EINVAL;
2127 2129
2128 down_write(&mm->mmap_sem); 2130 /*
2131	 * arg_lock protects concurrent updates of arg boundaries; we need
2132 * mmap_sem for a) concurrent sys_brk, b) finding VMA for addr
2133 * validation.
2134 */
2135 down_read(&mm->mmap_sem);
2129 vma = find_vma(mm, addr); 2136 vma = find_vma(mm, addr);
2130 2137
2138 spin_lock(&mm->arg_lock);
2131 prctl_map.start_code = mm->start_code; 2139 prctl_map.start_code = mm->start_code;
2132 prctl_map.end_code = mm->end_code; 2140 prctl_map.end_code = mm->end_code;
2133 prctl_map.start_data = mm->start_data; 2141 prctl_map.start_data = mm->start_data;
@@ -2139,9 +2147,6 @@ static int prctl_set_mm(int opt, unsigned long addr,
2139 prctl_map.arg_end = mm->arg_end; 2147 prctl_map.arg_end = mm->arg_end;
2140 prctl_map.env_start = mm->env_start; 2148 prctl_map.env_start = mm->env_start;
2141 prctl_map.env_end = mm->env_end; 2149 prctl_map.env_end = mm->env_end;
2142 prctl_map.auxv = NULL;
2143 prctl_map.auxv_size = 0;
2144 prctl_map.exe_fd = -1;
2145 2150
2146 switch (opt) { 2151 switch (opt) {
2147 case PR_SET_MM_START_CODE: 2152 case PR_SET_MM_START_CODE:
@@ -2181,7 +2186,7 @@ static int prctl_set_mm(int opt, unsigned long addr,
2181 goto out; 2186 goto out;
2182 } 2187 }
2183 2188
2184 error = validate_prctl_map(&prctl_map); 2189 error = validate_prctl_map_addr(&prctl_map);
2185 if (error) 2190 if (error)
2186 goto out; 2191 goto out;
2187 2192
@@ -2218,7 +2223,8 @@ static int prctl_set_mm(int opt, unsigned long addr,
2218 2223
2219 error = 0; 2224 error = 0;
2220out: 2225out:
2221 up_write(&mm->mmap_sem); 2226 spin_unlock(&mm->arg_lock);
2227 up_read(&mm->mmap_sem);
2222 return error; 2228 return error;
2223} 2229}
2224 2230
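
After this split, validate_prctl_map_addr() only checks the arithmetic relations of the passed addresses, while the auxv size and the CAP_SYS_ADMIN check for exe_fd happen in prctl_set_mm_map(), and prctl_set_mm() takes mmap_sem for read plus the new arg_lock. A hedged userspace sketch of the PR_SET_MM_MAP path (available when the kernel has checkpoint/restore support; the wrapper name is invented):

	#include <sys/prctl.h>
	#include <linux/prctl.h>

	static int install_mm_map(const struct prctl_mm_map *map)
	{
		/* Address ordering is validated by validate_prctl_map_addr();
		 * auxv and exe_fd permissions are checked afterwards. */
		return prctl(PR_SET_MM, PR_SET_MM_MAP, (unsigned long)map,
			     sizeof(*map), 0);
	}
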
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 4d9ae5ea6caf..34b76895b81e 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -137,6 +137,8 @@ COND_SYSCALL(capset);
137/* kernel/exit.c */ 137/* kernel/exit.c */
138 138
139/* kernel/fork.c */ 139/* kernel/fork.c */
140/* __ARCH_WANT_SYS_CLONE3 */
141COND_SYSCALL(clone3);
140 142
141/* kernel/futex.c */ 143/* kernel/futex.c */
142COND_SYSCALL(futex); 144COND_SYSCALL(futex);
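
COND_SYSCALL(clone3) provides a weak stub returning -ENOSYS so that architectures which do not select __ARCH_WANT_SYS_CLONE3 still link. Roughly, and only as a simplified sketch (the generic macro lives in include/linux/syscalls.h and some architectures override it):

	asmlinkage long __weak sys_clone3(void)
	{
		return sys_ni_syscall();
	}
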
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ba158f61aab4..1c1ad1e14f21 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * sysctl.c: General linux system control interface 3 * sysctl.c: General linux system control interface
3 * 4 *
@@ -229,11 +230,6 @@ static int proc_dostring_coredump(struct ctl_table *table, int write,
229#endif 230#endif
230static int proc_dopipe_max_size(struct ctl_table *table, int write, 231static int proc_dopipe_max_size(struct ctl_table *table, int write,
231 void __user *buffer, size_t *lenp, loff_t *ppos); 232 void __user *buffer, size_t *lenp, loff_t *ppos);
232#ifdef CONFIG_BPF_SYSCALL
233static int proc_dointvec_minmax_bpf_stats(struct ctl_table *table, int write,
234 void __user *buffer, size_t *lenp,
235 loff_t *ppos);
236#endif
237 233
238#ifdef CONFIG_MAGIC_SYSRQ 234#ifdef CONFIG_MAGIC_SYSRQ
239/* Note: sysrq code uses its own private copy */ 235/* Note: sysrq code uses its own private copy */
@@ -456,6 +452,22 @@ static struct ctl_table kern_table[] = {
456 .mode = 0644, 452 .mode = 0644,
457 .proc_handler = sched_rr_handler, 453 .proc_handler = sched_rr_handler,
458 }, 454 },
455#ifdef CONFIG_UCLAMP_TASK
456 {
457 .procname = "sched_util_clamp_min",
458 .data = &sysctl_sched_uclamp_util_min,
459 .maxlen = sizeof(unsigned int),
460 .mode = 0644,
461 .proc_handler = sysctl_sched_uclamp_handler,
462 },
463 {
464 .procname = "sched_util_clamp_max",
465 .data = &sysctl_sched_uclamp_util_max,
466 .maxlen = sizeof(unsigned int),
467 .mode = 0644,
468 .proc_handler = sysctl_sched_uclamp_handler,
469 },
470#endif
459#ifdef CONFIG_SCHED_AUTOGROUP 471#ifdef CONFIG_SCHED_AUTOGROUP
460 { 472 {
461 .procname = "sched_autogroup_enabled", 473 .procname = "sched_autogroup_enabled",
@@ -1252,12 +1264,10 @@ static struct ctl_table kern_table[] = {
1252 }, 1264 },
1253 { 1265 {
1254 .procname = "bpf_stats_enabled", 1266 .procname = "bpf_stats_enabled",
1255 .data = &sysctl_bpf_stats_enabled, 1267 .data = &bpf_stats_enabled_key.key,
1256 .maxlen = sizeof(sysctl_bpf_stats_enabled), 1268 .maxlen = sizeof(bpf_stats_enabled_key),
1257 .mode = 0644, 1269 .mode = 0644,
1258 .proc_handler = proc_dointvec_minmax_bpf_stats, 1270 .proc_handler = proc_do_static_key,
1259 .extra1 = &zero,
1260 .extra2 = &one,
1261 }, 1271 },
1262#endif 1272#endif
1263#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) 1273#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU)
@@ -2886,8 +2896,10 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
2886 if (neg) 2896 if (neg)
2887 continue; 2897 continue;
2888 val = convmul * val / convdiv; 2898 val = convmul * val / convdiv;
2889 if ((min && val < *min) || (max && val > *max)) 2899 if ((min && val < *min) || (max && val > *max)) {
2890 continue; 2900 err = -EINVAL;
2901 break;
2902 }
2891 *i = val; 2903 *i = val;
2892 } else { 2904 } else {
2893 val = convdiv * (*i) / convmul; 2905 val = convdiv * (*i) / convmul;
@@ -3170,17 +3182,19 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
3170 3182
3171 if (write) { 3183 if (write) {
3172 char *kbuf, *p; 3184 char *kbuf, *p;
3185 size_t skipped = 0;
3173 3186
3174 if (left > PAGE_SIZE - 1) 3187 if (left > PAGE_SIZE - 1) {
3175 left = PAGE_SIZE - 1; 3188 left = PAGE_SIZE - 1;
3189 /* How much of the buffer we'll skip this pass */
3190 skipped = *lenp - left;
3191 }
3176 3192
3177 p = kbuf = memdup_user_nul(buffer, left); 3193 p = kbuf = memdup_user_nul(buffer, left);
3178 if (IS_ERR(kbuf)) 3194 if (IS_ERR(kbuf))
3179 return PTR_ERR(kbuf); 3195 return PTR_ERR(kbuf);
3180 3196
3181 tmp_bitmap = kcalloc(BITS_TO_LONGS(bitmap_len), 3197 tmp_bitmap = bitmap_zalloc(bitmap_len, GFP_KERNEL);
3182 sizeof(unsigned long),
3183 GFP_KERNEL);
3184 if (!tmp_bitmap) { 3198 if (!tmp_bitmap) {
3185 kfree(kbuf); 3199 kfree(kbuf);
3186 return -ENOMEM; 3200 return -ENOMEM;
@@ -3189,9 +3203,22 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
3189 while (!err && left) { 3203 while (!err && left) {
3190 unsigned long val_a, val_b; 3204 unsigned long val_a, val_b;
3191 bool neg; 3205 bool neg;
3206 size_t saved_left;
3192 3207
3208 /* In case we stop parsing mid-number, we can reset */
3209 saved_left = left;
3193 err = proc_get_long(&p, &left, &val_a, &neg, tr_a, 3210 err = proc_get_long(&p, &left, &val_a, &neg, tr_a,
3194 sizeof(tr_a), &c); 3211 sizeof(tr_a), &c);
3212 /*
3213 * If we consumed the entirety of a truncated buffer or
3214 * only one char is left (may be a "-"), then stop here,
3215 * reset, & come back for more.
3216 */
3217 if ((left <= 1) && skipped) {
3218 left = saved_left;
3219 break;
3220 }
3221
3195 if (err) 3222 if (err)
3196 break; 3223 break;
3197 if (val_a >= bitmap_len || neg) { 3224 if (val_a >= bitmap_len || neg) {
@@ -3209,6 +3236,15 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
3209 err = proc_get_long(&p, &left, &val_b, 3236 err = proc_get_long(&p, &left, &val_b,
3210 &neg, tr_b, sizeof(tr_b), 3237 &neg, tr_b, sizeof(tr_b),
3211 &c); 3238 &c);
3239 /*
3240				 * If we consumed all of a truncated buffer,
3241				 * then stop here, reset, & come back for more.
3242 */
3243 if (!left && skipped) {
3244 left = saved_left;
3245 break;
3246 }
3247
3212 if (err) 3248 if (err)
3213 break; 3249 break;
3214 if (val_b >= bitmap_len || neg || 3250 if (val_b >= bitmap_len || neg ||
@@ -3227,6 +3263,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
3227 proc_skip_char(&p, &left, '\n'); 3263 proc_skip_char(&p, &left, '\n');
3228 } 3264 }
3229 kfree(kbuf); 3265 kfree(kbuf);
3266 left += skipped;
3230 } else { 3267 } else {
3231 unsigned long bit_a, bit_b = 0; 3268 unsigned long bit_a, bit_b = 0;
3232 3269
@@ -3271,7 +3308,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
3271 *ppos += *lenp; 3308 *ppos += *lenp;
3272 } 3309 }
3273 3310
3274 kfree(tmp_bitmap); 3311 bitmap_free(tmp_bitmap);
3275 return err; 3312 return err;
3276} 3313}
3277 3314
@@ -3346,26 +3383,35 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
3346 3383
3347#endif /* CONFIG_PROC_SYSCTL */ 3384#endif /* CONFIG_PROC_SYSCTL */
3348 3385
3349#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_SYSCTL) 3386#if defined(CONFIG_SYSCTL)
3350static int proc_dointvec_minmax_bpf_stats(struct ctl_table *table, int write, 3387int proc_do_static_key(struct ctl_table *table, int write,
3351 void __user *buffer, size_t *lenp, 3388 void __user *buffer, size_t *lenp,
3352 loff_t *ppos) 3389 loff_t *ppos)
3353{ 3390{
3354 int ret, bpf_stats = *(int *)table->data; 3391 struct static_key *key = (struct static_key *)table->data;
3355 struct ctl_table tmp = *table; 3392 static DEFINE_MUTEX(static_key_mutex);
3393 int val, ret;
3394 struct ctl_table tmp = {
3395 .data = &val,
3396 .maxlen = sizeof(val),
3397 .mode = table->mode,
3398 .extra1 = &zero,
3399 .extra2 = &one,
3400 };
3356 3401
3357 if (write && !capable(CAP_SYS_ADMIN)) 3402 if (write && !capable(CAP_SYS_ADMIN))
3358 return -EPERM; 3403 return -EPERM;
3359 3404
3360 tmp.data = &bpf_stats; 3405 mutex_lock(&static_key_mutex);
3406 val = static_key_enabled(key);
3361 ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); 3407 ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
3362 if (write && !ret) { 3408 if (write && !ret) {
3363 *(int *)table->data = bpf_stats; 3409 if (val)
3364 if (bpf_stats) 3410 static_key_enable(key);
3365 static_branch_enable(&bpf_stats_enabled_key);
3366 else 3411 else
3367 static_branch_disable(&bpf_stats_enabled_key); 3412 static_key_disable(key);
3368 } 3413 }
3414 mutex_unlock(&static_key_mutex);
3369 return ret; 3415 return ret;
3370} 3416}
3371#endif 3417#endif
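
proc_do_static_key() generalizes the old bpf_stats-only handler: a write of 0 or 1 disables or enables the static key under a local mutex, and any key can now be exposed by pointing a ctl_table entry's .data at it, as the bpf_stats_enabled entry above does. A hedged sketch of reusing it for another key; the key name and procname are invented for illustration.

	static DEFINE_STATIC_KEY_FALSE(example_feature_key);	/* invented */

	static struct ctl_table example_table[] = {
		{
			.procname	= "example_feature_enabled",
			.data		= &example_feature_key.key,
			.maxlen		= sizeof(example_feature_key),
			.mode		= 0644,
			.proc_handler	= proc_do_static_key,
		},
		{ }
	};
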
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 5f852b8f59f7..13a0f2e6ebc2 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -1,19 +1,9 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
1/* 2/*
2 * taskstats.c - Export per-task statistics to userland 3 * taskstats.c - Export per-task statistics to userland
3 * 4 *
4 * Copyright (C) Shailabh Nagar, IBM Corp. 2006 5 * Copyright (C) Shailabh Nagar, IBM Corp. 2006
5 * (C) Balbir Singh, IBM Corp. 2006 6 * (C) Balbir Singh, IBM Corp. 2006
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 */ 7 */
18 8
19#include <linux/kernel.h> 9#include <linux/kernel.h>
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c
index 7bca480151b0..76c997fdbc9d 100644
--- a/kernel/test_kprobes.c
+++ b/kernel/test_kprobes.c
@@ -1,17 +1,8 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
1/* 2/*
2 * test_kprobes.c - simple sanity test for *probes 3 * test_kprobes.c - simple sanity test for *probes
3 * 4 *
4 * Copyright IBM Corp. 2008 5 * Copyright IBM Corp. 2008
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it would be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
14 * the GNU General Public License for more details.
15 */ 6 */
16 7
17#define pr_fmt(fmt) "Kprobe smoke test: " fmt 8#define pr_fmt(fmt) "Kprobe smoke test: " fmt
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index e2c038d6c13c..fcc42353f125 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -1,3 +1,4 @@
1# SPDX-License-Identifier: GPL-2.0-only
1# 2#
2# Timer subsystem related configuration options 3# Timer subsystem related configuration options
3# 4#
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index f1e46f338a9c..1867044800bb 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -16,5 +16,6 @@ ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y)
16endif 16endif
17obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o 17obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o
18obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o tick-sched.o 18obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o tick-sched.o
19obj-$(CONFIG_HAVE_GENERIC_VDSO) += vsyscall.o
19obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o 20obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o
20obj-$(CONFIG_TEST_UDELAY) += test_udelay.o 21obj-$(CONFIG_TEST_UDELAY) += test_udelay.o
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 0519a8805aab..57518efc3810 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -233,7 +233,6 @@ EXPORT_SYMBOL_GPL(alarm_expires_remaining);
233/** 233/**
234 * alarmtimer_suspend - Suspend time callback 234 * alarmtimer_suspend - Suspend time callback
235 * @dev: unused 235 * @dev: unused
236 * @state: unused
237 * 236 *
238 * When we are going into suspend, we look through the bases 237 * When we are going into suspend, we look through the bases
239 * to see which is the soonest timer to expire. We then 238 * to see which is the soonest timer to expire. We then
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 3bcc19ceb073..fff5f64981c6 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -105,12 +105,12 @@ static DEFINE_SPINLOCK(watchdog_lock);
105static int watchdog_running; 105static int watchdog_running;
106static atomic_t watchdog_reset_pending; 106static atomic_t watchdog_reset_pending;
107 107
108static void inline clocksource_watchdog_lock(unsigned long *flags) 108static inline void clocksource_watchdog_lock(unsigned long *flags)
109{ 109{
110 spin_lock_irqsave(&watchdog_lock, *flags); 110 spin_lock_irqsave(&watchdog_lock, *flags);
111} 111}
112 112
113static void inline clocksource_watchdog_unlock(unsigned long *flags) 113static inline void clocksource_watchdog_unlock(unsigned long *flags)
114{ 114{
115 spin_unlock_irqrestore(&watchdog_lock, *flags); 115 spin_unlock_irqrestore(&watchdog_lock, *flags);
116} 116}
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 41dfff23c1f9..5ee77f1a8a92 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -30,7 +30,6 @@
30#include <linux/syscalls.h> 30#include <linux/syscalls.h>
31#include <linux/interrupt.h> 31#include <linux/interrupt.h>
32#include <linux/tick.h> 32#include <linux/tick.h>
33#include <linux/seq_file.h>
34#include <linux/err.h> 33#include <linux/err.h>
35#include <linux/debugobjects.h> 34#include <linux/debugobjects.h>
36#include <linux/sched/signal.h> 35#include <linux/sched/signal.h>
@@ -1115,9 +1114,10 @@ EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
1115 * @timer: hrtimer to stop 1114 * @timer: hrtimer to stop
1116 * 1115 *
1117 * Returns: 1116 * Returns:
1118 * 0 when the timer was not active 1117 *
1119 * 1 when the timer was active 1118 * * 0 when the timer was not active
1120 * -1 when the timer is currently executing the callback function and 1119 * * 1 when the timer was active
1120 * * -1 when the timer is currently executing the callback function and
1121 * cannot be stopped 1121 * cannot be stopped
1122 */ 1122 */
1123int hrtimer_try_to_cancel(struct hrtimer *timer) 1123int hrtimer_try_to_cancel(struct hrtimer *timer)
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index ac5555e25733..65eb796610dc 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -43,6 +43,7 @@ static u64 tick_length_base;
43#define MAX_TICKADJ 500LL /* usecs */ 43#define MAX_TICKADJ 500LL /* usecs */
44#define MAX_TICKADJ_SCALED \ 44#define MAX_TICKADJ_SCALED \
45 (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) 45 (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ)
46#define MAX_TAI_OFFSET 100000
46 47
47/* 48/*
48 * phase-lock loop variables 49 * phase-lock loop variables
@@ -691,7 +692,8 @@ static inline void process_adjtimex_modes(const struct __kernel_timex *txc,
691 time_constant = max(time_constant, 0l); 692 time_constant = max(time_constant, 0l);
692 } 693 }
693 694
694 if (txc->modes & ADJ_TAI && txc->constant > 0) 695 if (txc->modes & ADJ_TAI &&
696 txc->constant >= 0 && txc->constant <= MAX_TAI_OFFSET)
695 *time_tai = txc->constant; 697 *time_tai = txc->constant;
696 698
697 if (txc->modes & ADJ_OFFSET) 699 if (txc->modes & ADJ_OFFSET)
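
The ADJ_TAI path now bounds the requested TAI offset to [0, MAX_TAI_OFFSET] instead of merely requiring a positive value; out-of-range requests are ignored rather than applied. A hedged userspace sketch of setting the offset (header and wrapper name are illustrative):

	#include <sys/timex.h>

	static int set_tai_offset(long tai_offset)
	{
		struct timex tx = {
			.modes    = ADJ_TAI,
			.constant = tai_offset,	/* applied only if 0..100000 */
		};

		return adjtimex(&tx);
	}
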
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 29176635991f..d7f2d91acdac 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -980,23 +980,16 @@ retry_delete:
980 */ 980 */
981static void itimer_delete(struct k_itimer *timer) 981static void itimer_delete(struct k_itimer *timer)
982{ 982{
983 unsigned long flags;
984
985retry_delete: 983retry_delete:
986 spin_lock_irqsave(&timer->it_lock, flags); 984 spin_lock_irq(&timer->it_lock);
987 985
988 if (timer_delete_hook(timer) == TIMER_RETRY) { 986 if (timer_delete_hook(timer) == TIMER_RETRY) {
989 unlock_timer(timer, flags); 987 spin_unlock_irq(&timer->it_lock);
990 goto retry_delete; 988 goto retry_delete;
991 } 989 }
992 list_del(&timer->list); 990 list_del(&timer->list);
993 /*
994 * This keeps any tasks waiting on the spin lock from thinking
995 * they got something (see the lock code above).
996 */
997 timer->it_signal = NULL;
998 991
999 unlock_timer(timer, flags); 992 spin_unlock_irq(&timer->it_lock);
1000 release_posix_timer(timer, IT_ID_SET); 993 release_posix_timer(timer, IT_ID_SET);
1001} 994}
1002 995
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f4ee1a3428ae..be9707f68024 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -782,7 +782,6 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
782 */ 782 */
783 if (!ts->tick_stopped) { 783 if (!ts->tick_stopped) {
784 calc_load_nohz_start(); 784 calc_load_nohz_start();
785 cpu_load_update_nohz_start();
786 quiet_vmstat(); 785 quiet_vmstat();
787 786
788 ts->last_tick = hrtimer_get_expires(&ts->sched_timer); 787 ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
@@ -829,7 +828,6 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
829{ 828{
830 /* Update jiffies first */ 829 /* Update jiffies first */
831 tick_do_update_jiffies64(now); 830 tick_do_update_jiffies64(now);
832 cpu_load_update_nohz_stop();
833 831
834 /* 832 /*
835 * Clear the timer idle flag, so we avoid IPIs on remote queueing and 833 * Clear the timer idle flag, so we avoid IPIs on remote queueing and
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 7f7d6914ddd5..5c54ca632d08 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -251,6 +251,10 @@ COMPAT_SYSCALL_DEFINE2(settimeofday, struct old_timeval32 __user *, tv,
251 if (tv) { 251 if (tv) {
252 if (compat_get_timeval(&user_tv, tv)) 252 if (compat_get_timeval(&user_tv, tv))
253 return -EFAULT; 253 return -EFAULT;
254
255 if (!timeval_valid(&user_tv))
256 return -EINVAL;
257
254 new_ts.tv_sec = user_tv.tv_sec; 258 new_ts.tv_sec = user_tv.tv_sec;
255 new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC; 259 new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC;
256 } 260 }
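
The compat settimeofday() path now rejects invalid timevals before converting, matching the native syscall. For reference, timeval_valid() boils down to the following (a simplified sketch, not the exact header text):

	static inline bool timeval_valid(const struct timeval *tv)
	{
		/* no dates before the epoch, no out-of-range microseconds */
		return tv->tv_sec >= 0 &&
		       tv->tv_usec >= 0 && tv->tv_usec < USEC_PER_SEC;
	}
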
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 85f5912d8f70..d911c8470149 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -808,17 +808,18 @@ ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs)
808 struct timekeeper *tk = &tk_core.timekeeper; 808 struct timekeeper *tk = &tk_core.timekeeper;
809 unsigned int seq; 809 unsigned int seq;
810 ktime_t base, *offset = offsets[offs]; 810 ktime_t base, *offset = offsets[offs];
811 u64 nsecs;
811 812
812 WARN_ON(timekeeping_suspended); 813 WARN_ON(timekeeping_suspended);
813 814
814 do { 815 do {
815 seq = read_seqcount_begin(&tk_core.seq); 816 seq = read_seqcount_begin(&tk_core.seq);
816 base = ktime_add(tk->tkr_mono.base, *offset); 817 base = ktime_add(tk->tkr_mono.base, *offset);
818 nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
817 819
818 } while (read_seqcount_retry(&tk_core.seq, seq)); 820 } while (read_seqcount_retry(&tk_core.seq, seq));
819 821
820 return base; 822 return ktime_add_ns(base, nsecs);
821
822} 823}
823EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset); 824EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset);
824 825
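
The hunk above fixes ktime_get_coarse_with_offset(): previously it returned only the base time and dropped the nanoseconds accumulated since the last timekeeping update, so coarse reads could lag by the sub-second amount held in xtime_nsec. In effect the result changes from

	tk->tkr_mono.base + *offset

to

	(tk->tkr_mono.base + *offset) + (tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift)

with both values sampled inside the same tk_core.seq read section.
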
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 98ba50dcb1b2..acb326f5f50a 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -282,23 +282,6 @@ static inline void timer_list_header(struct seq_file *m, u64 now)
282 SEQ_printf(m, "\n"); 282 SEQ_printf(m, "\n");
283} 283}
284 284
285static int timer_list_show(struct seq_file *m, void *v)
286{
287 struct timer_list_iter *iter = v;
288
289 if (iter->cpu == -1 && !iter->second_pass)
290 timer_list_header(m, iter->now);
291 else if (!iter->second_pass)
292 print_cpu(m, iter->cpu, iter->now);
293#ifdef CONFIG_GENERIC_CLOCKEVENTS
294 else if (iter->cpu == -1 && iter->second_pass)
295 timer_list_show_tickdevices_header(m);
296 else
297 print_tickdevice(m, tick_get_device(iter->cpu), iter->cpu);
298#endif
299 return 0;
300}
301
302void sysrq_timer_list_show(void) 285void sysrq_timer_list_show(void)
303{ 286{
304 u64 now = ktime_to_ns(ktime_get()); 287 u64 now = ktime_to_ns(ktime_get());
@@ -317,6 +300,24 @@ void sysrq_timer_list_show(void)
317 return; 300 return;
318} 301}
319 302
303#ifdef CONFIG_PROC_FS
304static int timer_list_show(struct seq_file *m, void *v)
305{
306 struct timer_list_iter *iter = v;
307
308 if (iter->cpu == -1 && !iter->second_pass)
309 timer_list_header(m, iter->now);
310 else if (!iter->second_pass)
311 print_cpu(m, iter->cpu, iter->now);
312#ifdef CONFIG_GENERIC_CLOCKEVENTS
313 else if (iter->cpu == -1 && iter->second_pass)
314 timer_list_show_tickdevices_header(m);
315 else
316 print_tickdevice(m, tick_get_device(iter->cpu), iter->cpu);
317#endif
318 return 0;
319}
320
320static void *move_iter(struct timer_list_iter *iter, loff_t offset) 321static void *move_iter(struct timer_list_iter *iter, loff_t offset)
321{ 322{
322 for (; offset; offset--) { 323 for (; offset; offset--) {
@@ -376,3 +377,4 @@ static int __init init_timer_list_procfs(void)
376 return 0; 377 return 0;
377} 378}
378__initcall(init_timer_list_procfs); 379__initcall(init_timer_list_procfs);
380#endif
diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c
new file mode 100644
index 000000000000..8cf3596a4ce6
--- /dev/null
+++ b/kernel/time/vsyscall.c
@@ -0,0 +1,129 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright 2019 ARM Ltd.
4 *
5 * Generic implementation of update_vsyscall and update_vsyscall_tz.
6 *
7 * Based on the x86 specific implementation.
8 */
9
10#include <linux/hrtimer.h>
11#include <linux/timekeeper_internal.h>
12#include <vdso/datapage.h>
13#include <vdso/helpers.h>
14#include <vdso/vsyscall.h>
15
16static inline void update_vdso_data(struct vdso_data *vdata,
17 struct timekeeper *tk)
18{
19 struct vdso_timestamp *vdso_ts;
20 u64 nsec;
21
22 vdata[CS_HRES_COARSE].cycle_last = tk->tkr_mono.cycle_last;
23 vdata[CS_HRES_COARSE].mask = tk->tkr_mono.mask;
24 vdata[CS_HRES_COARSE].mult = tk->tkr_mono.mult;
25 vdata[CS_HRES_COARSE].shift = tk->tkr_mono.shift;
26 vdata[CS_RAW].cycle_last = tk->tkr_raw.cycle_last;
27 vdata[CS_RAW].mask = tk->tkr_raw.mask;
28 vdata[CS_RAW].mult = tk->tkr_raw.mult;
29 vdata[CS_RAW].shift = tk->tkr_raw.shift;
30
31 /* CLOCK_REALTIME */
32 vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME];
33 vdso_ts->sec = tk->xtime_sec;
34 vdso_ts->nsec = tk->tkr_mono.xtime_nsec;
35
36 /* CLOCK_MONOTONIC */
37 vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC];
38 vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
39
40 nsec = tk->tkr_mono.xtime_nsec;
41 nsec += ((u64)tk->wall_to_monotonic.tv_nsec << tk->tkr_mono.shift);
42 while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) {
43 nsec -= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift);
44 vdso_ts->sec++;
45 }
46 vdso_ts->nsec = nsec;
47
48 /* CLOCK_MONOTONIC_RAW */
49 vdso_ts = &vdata[CS_RAW].basetime[CLOCK_MONOTONIC_RAW];
50 vdso_ts->sec = tk->raw_sec;
51 vdso_ts->nsec = tk->tkr_raw.xtime_nsec;
52
53 /* CLOCK_BOOTTIME */
54 vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_BOOTTIME];
55 vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
56 nsec = tk->tkr_mono.xtime_nsec;
57 nsec += ((u64)(tk->wall_to_monotonic.tv_nsec +
58 ktime_to_ns(tk->offs_boot)) << tk->tkr_mono.shift);
59 while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) {
60 nsec -= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift);
61 vdso_ts->sec++;
62 }
63 vdso_ts->nsec = nsec;
64
65 /* CLOCK_TAI */
66 vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_TAI];
67 vdso_ts->sec = tk->xtime_sec + (s64)tk->tai_offset;
68 vdso_ts->nsec = tk->tkr_mono.xtime_nsec;
69
70 /*
71 * Read without the seqlock held by clock_getres().
72 * Note: No need to have a second copy.
73 */
74 WRITE_ONCE(vdata[CS_HRES_COARSE].hrtimer_res, hrtimer_resolution);
75}
76
77void update_vsyscall(struct timekeeper *tk)
78{
79 struct vdso_data *vdata = __arch_get_k_vdso_data();
80 struct vdso_timestamp *vdso_ts;
81 u64 nsec;
82
83 if (__arch_update_vdso_data()) {
84 /*
85 * Some architectures might want to skip the update of the
86 * data page.
87 */
88 return;
89 }
90
91 /* copy vsyscall data */
92 vdso_write_begin(vdata);
93
94 vdata[CS_HRES_COARSE].clock_mode = __arch_get_clock_mode(tk);
95 vdata[CS_RAW].clock_mode = __arch_get_clock_mode(tk);
96
97 /* CLOCK_REALTIME_COARSE */
98 vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME_COARSE];
99 vdso_ts->sec = tk->xtime_sec;
100 vdso_ts->nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
101
102 /* CLOCK_MONOTONIC_COARSE */
103 vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC_COARSE];
104 vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
105 nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
106 nsec = nsec + tk->wall_to_monotonic.tv_nsec;
107 vdso_ts->sec += __iter_div_u64_rem(nsec, NSEC_PER_SEC, &vdso_ts->nsec);
108
109 if (__arch_use_vsyscall(vdata))
110 update_vdso_data(vdata, tk);
111
112 __arch_update_vsyscall(vdata, tk);
113
114 vdso_write_end(vdata);
115
116 __arch_sync_vdso_data(vdata);
117}
118
119void update_vsyscall_tz(void)
120{
121 struct vdso_data *vdata = __arch_get_k_vdso_data();
122
123 if (__arch_use_vsyscall(vdata)) {
124 vdata[CS_HRES_COARSE].tz_minuteswest = sys_tz.tz_minuteswest;
125 vdata[CS_HRES_COARSE].tz_dsttime = sys_tz.tz_dsttime;
126 }
127
128 __arch_sync_vdso_data(vdata);
129}
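
The generic update_vsyscall() above relies on a small set of per-architecture hooks: __arch_get_k_vdso_data(), __arch_update_vdso_data(), __arch_get_clock_mode(), __arch_use_vsyscall(), __arch_update_vsyscall() and __arch_sync_vdso_data(). A hedged sketch of minimal hook implementations an architecture might provide; the bodies and the vdso_data symbol below are illustrative only, not copied from any in-tree port.

	static __always_inline struct vdso_data *__arch_get_k_vdso_data(void)
	{
		return vdso_data;	/* kernel-side view of the vDSO data page */
	}

	static __always_inline bool __arch_update_vdso_data(void)
	{
		return false;		/* false: do update the data page */
	}

	static __always_inline int __arch_get_clock_mode(struct timekeeper *tk)
	{
		return 0;		/* arch-defined clocksource mode */
	}

	static __always_inline bool __arch_use_vsyscall(struct vdso_data *vdata)
	{
		return true;		/* vDSO time reads are usable */
	}

	static __always_inline void __arch_update_vsyscall(struct vdso_data *vdata,
							   struct timekeeper *tk) { }

	static __always_inline void __arch_sync_vdso_data(struct vdso_data *vdata) { }
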
diff --git a/kernel/torture.c b/kernel/torture.c
index 17b2be9bde12..a8d9bdfba7c3 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -570,6 +570,7 @@ static void torture_shutdown_cleanup(void)
570static struct task_struct *stutter_task; 570static struct task_struct *stutter_task;
571static int stutter_pause_test; 571static int stutter_pause_test;
572static int stutter; 572static int stutter;
573static int stutter_gap;
573 574
574/* 575/*
575 * Block until the stutter interval ends. This must be called periodically 576 * Block until the stutter interval ends. This must be called periodically
@@ -578,10 +579,12 @@ static int stutter;
578bool stutter_wait(const char *title) 579bool stutter_wait(const char *title)
579{ 580{
580 int spt; 581 int spt;
582 bool ret = false;
581 583
582 cond_resched_tasks_rcu_qs(); 584 cond_resched_tasks_rcu_qs();
583 spt = READ_ONCE(stutter_pause_test); 585 spt = READ_ONCE(stutter_pause_test);
584 for (; spt; spt = READ_ONCE(stutter_pause_test)) { 586 for (; spt; spt = READ_ONCE(stutter_pause_test)) {
587 ret = true;
585 if (spt == 1) { 588 if (spt == 1) {
586 schedule_timeout_interruptible(1); 589 schedule_timeout_interruptible(1);
587 } else if (spt == 2) { 590 } else if (spt == 2) {
@@ -592,7 +595,7 @@ bool stutter_wait(const char *title)
592 } 595 }
593 torture_shutdown_absorb(title); 596 torture_shutdown_absorb(title);
594 } 597 }
595 return !!spt; 598 return ret;
596} 599}
597EXPORT_SYMBOL_GPL(stutter_wait); 600EXPORT_SYMBOL_GPL(stutter_wait);
598 601
@@ -602,17 +605,24 @@ EXPORT_SYMBOL_GPL(stutter_wait);
602 */ 605 */
603static int torture_stutter(void *arg) 606static int torture_stutter(void *arg)
604{ 607{
608 int wtime;
609
605 VERBOSE_TOROUT_STRING("torture_stutter task started"); 610 VERBOSE_TOROUT_STRING("torture_stutter task started");
606 do { 611 do {
607 if (!torture_must_stop() && stutter > 1) { 612 if (!torture_must_stop() && stutter > 1) {
608 WRITE_ONCE(stutter_pause_test, 1); 613 wtime = stutter;
609 schedule_timeout_interruptible(stutter - 1); 614 if (stutter > HZ + 1) {
615 WRITE_ONCE(stutter_pause_test, 1);
616 wtime = stutter - HZ - 1;
617 schedule_timeout_interruptible(wtime);
618 wtime = HZ + 1;
619 }
610 WRITE_ONCE(stutter_pause_test, 2); 620 WRITE_ONCE(stutter_pause_test, 2);
611 schedule_timeout_interruptible(1); 621 schedule_timeout_interruptible(wtime);
612 } 622 }
613 WRITE_ONCE(stutter_pause_test, 0); 623 WRITE_ONCE(stutter_pause_test, 0);
614 if (!torture_must_stop()) 624 if (!torture_must_stop())
615 schedule_timeout_interruptible(stutter); 625 schedule_timeout_interruptible(stutter_gap);
616 torture_shutdown_absorb("torture_stutter"); 626 torture_shutdown_absorb("torture_stutter");
617 } while (!torture_must_stop()); 627 } while (!torture_must_stop());
618 torture_kthread_stopping("torture_stutter"); 628 torture_kthread_stopping("torture_stutter");
@@ -622,9 +632,10 @@ static int torture_stutter(void *arg)
622/* 632/*
623 * Initialize and kick off the torture_stutter kthread. 633 * Initialize and kick off the torture_stutter kthread.
624 */ 634 */
625int torture_stutter_init(const int s) 635int torture_stutter_init(const int s, const int sgap)
626{ 636{
627 stutter = s; 637 stutter = s;
638 stutter_gap = sgap;
628 return torture_create_kthread(torture_stutter, NULL, stutter_task); 639 return torture_create_kthread(torture_stutter, NULL, stutter_task);
629} 640}
630EXPORT_SYMBOL_GPL(torture_stutter_init); 641EXPORT_SYMBOL_GPL(torture_stutter_init);
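
torture_stutter_init() now takes a second argument giving the gap between stutter periods, so existing callers need a matching update. A hedged caller-side sketch; the variable names and the choice of passing the same value twice are illustrative, not taken from any in-tree torture test.

	/* old: firsterr = torture_stutter_init(stutter * HZ); */
	firsterr = torture_stutter_init(stutter * HZ, stutter * HZ);
	if (firsterr)
		goto unwind;
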
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 5d965cef6c77..564e5fdb025f 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -1,3 +1,4 @@
1# SPDX-License-Identifier: GPL-2.0-only
1# 2#
2# Architectures that offer an FUNCTION_TRACER implementation should 3# Architectures that offer an FUNCTION_TRACER implementation should
3# select HAVE_FUNCTION_TRACER: 4# select HAVE_FUNCTION_TRACER:
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index e1c6d79fb4cc..2d6e93ab0478 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -512,8 +512,6 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
512 dir = debugfs_lookup(buts->name, blk_debugfs_root); 512 dir = debugfs_lookup(buts->name, blk_debugfs_root);
513 if (!dir) 513 if (!dir)
514 bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root); 514 bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root);
515 if (!dir)
516 goto err;
517 515
518 bt->dev = dev; 516 bt->dev = dev;
519 atomic_set(&bt->dropped, 0); 517 atomic_set(&bt->dropped, 0);
@@ -522,12 +520,8 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
522 ret = -EIO; 520 ret = -EIO;
523 bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt, 521 bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt,
524 &blk_dropped_fops); 522 &blk_dropped_fops);
525 if (!bt->dropped_file)
526 goto err;
527 523
528 bt->msg_file = debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops); 524 bt->msg_file = debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops);
529 if (!bt->msg_file)
530 goto err;
531 525
532 bt->rchan = relay_open("trace", dir, buts->buf_size, 526 bt->rchan = relay_open("trace", dir, buts->buf_size,
533 buts->buf_nr, &blk_relay_callbacks, bt); 527 buts->buf_nr, &blk_relay_callbacks, bt);
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index b496ffdf5f36..ca1255d14576 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -19,6 +19,9 @@
19#include "trace_probe.h" 19#include "trace_probe.h"
20#include "trace.h" 20#include "trace.h"
21 21
22#define bpf_event_rcu_dereference(p) \
23 rcu_dereference_protected(p, lockdep_is_held(&bpf_event_mutex))
24
22#ifdef CONFIG_MODULES 25#ifdef CONFIG_MODULES
23struct bpf_trace_module { 26struct bpf_trace_module {
24 struct module *module; 27 struct module *module;
@@ -410,8 +413,6 @@ static const struct bpf_func_proto bpf_perf_event_read_value_proto = {
410 .arg4_type = ARG_CONST_SIZE, 413 .arg4_type = ARG_CONST_SIZE,
411}; 414};
412 415
413static DEFINE_PER_CPU(struct perf_sample_data, bpf_trace_sd);
414
415static __always_inline u64 416static __always_inline u64
416__bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, 417__bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
417 u64 flags, struct perf_sample_data *sd) 418 u64 flags, struct perf_sample_data *sd)
@@ -442,24 +443,50 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
442 return perf_event_output(event, sd, regs); 443 return perf_event_output(event, sd, regs);
443} 444}
444 445
446/*
447 * Support executing tracepoints in normal, irq, and nmi context that each call
448 * bpf_perf_event_output
449 */
450struct bpf_trace_sample_data {
451 struct perf_sample_data sds[3];
452};
453
454static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_trace_sds);
455static DEFINE_PER_CPU(int, bpf_trace_nest_level);
445BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, 456BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
446 u64, flags, void *, data, u64, size) 457 u64, flags, void *, data, u64, size)
447{ 458{
448 struct perf_sample_data *sd = this_cpu_ptr(&bpf_trace_sd); 459 struct bpf_trace_sample_data *sds = this_cpu_ptr(&bpf_trace_sds);
460 int nest_level = this_cpu_inc_return(bpf_trace_nest_level);
449 struct perf_raw_record raw = { 461 struct perf_raw_record raw = {
450 .frag = { 462 .frag = {
451 .size = size, 463 .size = size,
452 .data = data, 464 .data = data,
453 }, 465 },
454 }; 466 };
467 struct perf_sample_data *sd;
468 int err;
455 469
456 if (unlikely(flags & ~(BPF_F_INDEX_MASK))) 470 if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(sds->sds))) {
457 return -EINVAL; 471 err = -EBUSY;
472 goto out;
473 }
474
475 sd = &sds->sds[nest_level - 1];
476
477 if (unlikely(flags & ~(BPF_F_INDEX_MASK))) {
478 err = -EINVAL;
479 goto out;
480 }
458 481
459 perf_sample_data_init(sd, 0, 0); 482 perf_sample_data_init(sd, 0, 0);
460 sd->raw = &raw; 483 sd->raw = &raw;
461 484
462 return __bpf_perf_event_output(regs, map, flags, sd); 485 err = __bpf_perf_event_output(regs, map, flags, sd);
486
487out:
488 this_cpu_dec(bpf_trace_nest_level);
489 return err;
463} 490}
464 491
465static const struct bpf_func_proto bpf_perf_event_output_proto = { 492static const struct bpf_func_proto bpf_perf_event_output_proto = {
@@ -567,6 +594,69 @@ static const struct bpf_func_proto bpf_probe_read_str_proto = {
567 .arg3_type = ARG_ANYTHING, 594 .arg3_type = ARG_ANYTHING,
568}; 595};
569 596
597struct send_signal_irq_work {
598 struct irq_work irq_work;
599 struct task_struct *task;
600 u32 sig;
601};
602
603static DEFINE_PER_CPU(struct send_signal_irq_work, send_signal_work);
604
605static void do_bpf_send_signal(struct irq_work *entry)
606{
607 struct send_signal_irq_work *work;
608
609 work = container_of(entry, struct send_signal_irq_work, irq_work);
610 group_send_sig_info(work->sig, SEND_SIG_PRIV, work->task, PIDTYPE_TGID);
611}
612
613BPF_CALL_1(bpf_send_signal, u32, sig)
614{
615 struct send_signal_irq_work *work = NULL;
616
617 /* Similar to bpf_probe_write_user, task needs to be
618 * in a sound condition and kernel memory access be
619 * permitted in order to send signal to the current
620 * task.
621 */
622 if (unlikely(current->flags & (PF_KTHREAD | PF_EXITING)))
623 return -EPERM;
624 if (unlikely(uaccess_kernel()))
625 return -EPERM;
626 if (unlikely(!nmi_uaccess_okay()))
627 return -EPERM;
628
629 if (in_nmi()) {
630 /* Do an early check on signal validity. Otherwise,
631 * the error is lost in deferred irq_work.
632 */
633 if (unlikely(!valid_signal(sig)))
634 return -EINVAL;
635
636 work = this_cpu_ptr(&send_signal_work);
637 if (work->irq_work.flags & IRQ_WORK_BUSY)
638 return -EBUSY;
639
640 /* Add the current task, which is the target of sending signal,
641 * to the irq_work. The current task may change when queued
642 * irq works get executed.
643 */
644 work->task = current;
645 work->sig = sig;
646 irq_work_queue(&work->irq_work);
647 return 0;
648 }
649
650 return group_send_sig_info(sig, SEND_SIG_PRIV, current, PIDTYPE_TGID);
651}
652
653static const struct bpf_func_proto bpf_send_signal_proto = {
654 .func = bpf_send_signal,
655 .gpl_only = false,
656 .ret_type = RET_INTEGER,
657 .arg1_type = ARG_ANYTHING,
658};
659
570static const struct bpf_func_proto * 660static const struct bpf_func_proto *
571tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 661tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
572{ 662{
@@ -617,6 +707,8 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
617 case BPF_FUNC_get_current_cgroup_id: 707 case BPF_FUNC_get_current_cgroup_id:
618 return &bpf_get_current_cgroup_id_proto; 708 return &bpf_get_current_cgroup_id_proto;
619#endif 709#endif
710 case BPF_FUNC_send_signal:
711 return &bpf_send_signal_proto;
620 default: 712 default:
621 return NULL; 713 return NULL;
622 } 714 }
@@ -822,16 +914,48 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
822/* 914/*
823 * bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp 915 * bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp
824 * to avoid potential recursive reuse issue when/if tracepoints are added 916 * to avoid potential recursive reuse issue when/if tracepoints are added
825 * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack 917 * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack.
918 *
919 * Since raw tracepoints run despite bpf_prog_active, support concurrent usage
920 * in normal, irq, and nmi context.
826 */ 921 */
827static DEFINE_PER_CPU(struct pt_regs, bpf_raw_tp_regs); 922struct bpf_raw_tp_regs {
923 struct pt_regs regs[3];
924};
925static DEFINE_PER_CPU(struct bpf_raw_tp_regs, bpf_raw_tp_regs);
926static DEFINE_PER_CPU(int, bpf_raw_tp_nest_level);
927static struct pt_regs *get_bpf_raw_tp_regs(void)
928{
929 struct bpf_raw_tp_regs *tp_regs = this_cpu_ptr(&bpf_raw_tp_regs);
930 int nest_level = this_cpu_inc_return(bpf_raw_tp_nest_level);
931
932 if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(tp_regs->regs))) {
933 this_cpu_dec(bpf_raw_tp_nest_level);
934 return ERR_PTR(-EBUSY);
935 }
936
937 return &tp_regs->regs[nest_level - 1];
938}
939
940static void put_bpf_raw_tp_regs(void)
941{
942 this_cpu_dec(bpf_raw_tp_nest_level);
943}
944
828BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args, 945BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args,
829 struct bpf_map *, map, u64, flags, void *, data, u64, size) 946 struct bpf_map *, map, u64, flags, void *, data, u64, size)
830{ 947{
831 struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs); 948 struct pt_regs *regs = get_bpf_raw_tp_regs();
949 int ret;
950
951 if (IS_ERR(regs))
952 return PTR_ERR(regs);
832 953
833 perf_fetch_caller_regs(regs); 954 perf_fetch_caller_regs(regs);
834 return ____bpf_perf_event_output(regs, map, flags, data, size); 955 ret = ____bpf_perf_event_output(regs, map, flags, data, size);
956
957 put_bpf_raw_tp_regs();
958 return ret;
835} 959}
836 960
837static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = { 961static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = {
@@ -848,12 +972,18 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = {
848BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args, 972BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args,
849 struct bpf_map *, map, u64, flags) 973 struct bpf_map *, map, u64, flags)
850{ 974{
851 struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs); 975 struct pt_regs *regs = get_bpf_raw_tp_regs();
976 int ret;
977
978 if (IS_ERR(regs))
979 return PTR_ERR(regs);
852 980
853 perf_fetch_caller_regs(regs); 981 perf_fetch_caller_regs(regs);
854 /* similar to bpf_perf_event_output_tp, but pt_regs fetched differently */ 982 /* similar to bpf_perf_event_output_tp, but pt_regs fetched differently */
855 return bpf_get_stackid((unsigned long) regs, (unsigned long) map, 983 ret = bpf_get_stackid((unsigned long) regs, (unsigned long) map,
856 flags, 0, 0); 984 flags, 0, 0);
985 put_bpf_raw_tp_regs();
986 return ret;
857} 987}
858 988
859static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = { 989static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = {
@@ -868,11 +998,17 @@ static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = {
868BPF_CALL_4(bpf_get_stack_raw_tp, struct bpf_raw_tracepoint_args *, args, 998BPF_CALL_4(bpf_get_stack_raw_tp, struct bpf_raw_tracepoint_args *, args,
869 void *, buf, u32, size, u64, flags) 999 void *, buf, u32, size, u64, flags)
870{ 1000{
871 struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs); 1001 struct pt_regs *regs = get_bpf_raw_tp_regs();
1002 int ret;
1003
1004 if (IS_ERR(regs))
1005 return PTR_ERR(regs);
872 1006
873 perf_fetch_caller_regs(regs); 1007 perf_fetch_caller_regs(regs);
874 return bpf_get_stack((unsigned long) regs, (unsigned long) buf, 1008 ret = bpf_get_stack((unsigned long) regs, (unsigned long) buf,
875 (unsigned long) size, flags, 0); 1009 (unsigned long) size, flags, 0);
1010 put_bpf_raw_tp_regs();
1011 return ret;
876} 1012}
877 1013
878static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = { 1014static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = {
@@ -1034,7 +1170,7 @@ static DEFINE_MUTEX(bpf_event_mutex);
1034int perf_event_attach_bpf_prog(struct perf_event *event, 1170int perf_event_attach_bpf_prog(struct perf_event *event,
1035 struct bpf_prog *prog) 1171 struct bpf_prog *prog)
1036{ 1172{
1037 struct bpf_prog_array __rcu *old_array; 1173 struct bpf_prog_array *old_array;
1038 struct bpf_prog_array *new_array; 1174 struct bpf_prog_array *new_array;
1039 int ret = -EEXIST; 1175 int ret = -EEXIST;
1040 1176
@@ -1052,7 +1188,7 @@ int perf_event_attach_bpf_prog(struct perf_event *event,
1052 if (event->prog) 1188 if (event->prog)
1053 goto unlock; 1189 goto unlock;
1054 1190
1055 old_array = event->tp_event->prog_array; 1191 old_array = bpf_event_rcu_dereference(event->tp_event->prog_array);
1056 if (old_array && 1192 if (old_array &&
1057 bpf_prog_array_length(old_array) >= BPF_TRACE_MAX_PROGS) { 1193 bpf_prog_array_length(old_array) >= BPF_TRACE_MAX_PROGS) {
1058 ret = -E2BIG; 1194 ret = -E2BIG;
@@ -1075,7 +1211,7 @@ unlock:
1075 1211
1076void perf_event_detach_bpf_prog(struct perf_event *event) 1212void perf_event_detach_bpf_prog(struct perf_event *event)
1077{ 1213{
1078 struct bpf_prog_array __rcu *old_array; 1214 struct bpf_prog_array *old_array;
1079 struct bpf_prog_array *new_array; 1215 struct bpf_prog_array *new_array;
1080 int ret; 1216 int ret;
1081 1217
@@ -1084,7 +1220,7 @@ void perf_event_detach_bpf_prog(struct perf_event *event)
1084 if (!event->prog) 1220 if (!event->prog)
1085 goto unlock; 1221 goto unlock;
1086 1222
1087 old_array = event->tp_event->prog_array; 1223 old_array = bpf_event_rcu_dereference(event->tp_event->prog_array);
1088 ret = bpf_prog_array_copy(old_array, event->prog, NULL, &new_array); 1224 ret = bpf_prog_array_copy(old_array, event->prog, NULL, &new_array);
1089 if (ret == -ENOENT) 1225 if (ret == -ENOENT)
1090 goto unlock; 1226 goto unlock;
@@ -1106,6 +1242,7 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info)
1106{ 1242{
1107 struct perf_event_query_bpf __user *uquery = info; 1243 struct perf_event_query_bpf __user *uquery = info;
1108 struct perf_event_query_bpf query = {}; 1244 struct perf_event_query_bpf query = {};
1245 struct bpf_prog_array *progs;
1109 u32 *ids, prog_cnt, ids_len; 1246 u32 *ids, prog_cnt, ids_len;
1110 int ret; 1247 int ret;
1111 1248
@@ -1130,10 +1267,8 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info)
1130 */ 1267 */
1131 1268
1132 mutex_lock(&bpf_event_mutex); 1269 mutex_lock(&bpf_event_mutex);
1133 ret = bpf_prog_array_copy_info(event->tp_event->prog_array, 1270 progs = bpf_event_rcu_dereference(event->tp_event->prog_array);
1134 ids, 1271 ret = bpf_prog_array_copy_info(progs, ids, ids_len, &prog_cnt);
1135 ids_len,
1136 &prog_cnt);
1137 mutex_unlock(&bpf_event_mutex); 1272 mutex_unlock(&bpf_event_mutex);
1138 1273
1139 if (copy_to_user(&uquery->prog_cnt, &prog_cnt, sizeof(prog_cnt)) || 1274 if (copy_to_user(&uquery->prog_cnt, &prog_cnt, sizeof(prog_cnt)) ||
@@ -1296,8 +1431,23 @@ int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
1296 return err; 1431 return err;
1297} 1432}
1298 1433
1434static int __init send_signal_irq_work_init(void)
1435{
1436 int cpu;
1437 struct send_signal_irq_work *work;
1438
1439 for_each_possible_cpu(cpu) {
1440 work = per_cpu_ptr(&send_signal_work, cpu);
1441 init_irq_work(&work->irq_work, do_bpf_send_signal);
1442 }
1443 return 0;
1444}
1445
1446subsys_initcall(send_signal_irq_work_init);
1447
1299#ifdef CONFIG_MODULES 1448#ifdef CONFIG_MODULES
1300int bpf_event_notify(struct notifier_block *nb, unsigned long op, void *module) 1449static int bpf_event_notify(struct notifier_block *nb, unsigned long op,
1450 void *module)
1301{ 1451{
1302 struct bpf_trace_module *btm, *tmp; 1452 struct bpf_trace_module *btm, *tmp;
1303 struct module *mod = module; 1453 struct module *mod = module;
@@ -1336,7 +1486,7 @@ static struct notifier_block bpf_module_nb = {
1336 .notifier_call = bpf_event_notify, 1486 .notifier_call = bpf_event_notify,
1337}; 1487};
1338 1488
1339int __init bpf_event_init(void) 1489static int __init bpf_event_init(void)
1340{ 1490{
1341 register_module_notifier(&bpf_module_nb); 1491 register_module_notifier(&bpf_module_nb);
1342 return 0; 1492 return 0;
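
Besides the nesting protection for bpf_perf_event_output() and the raw-tracepoint pt_regs, this diff adds the bpf_send_signal() helper, which signals the current task and defers delivery through the per-CPU irq_work when called from NMI context. A hedged BPF-side usage sketch; the include, section name, and signal number are illustrative.

	#include <linux/bpf.h>
	#include "bpf_helpers.h"	/* selftests-style helper declarations */

	SEC("tracepoint/syscalls/sys_enter_kill")
	int probe(void *ctx)
	{
		bpf_send_signal(10);	/* SIGUSR1 on most architectures */
		return 0;
	}

	char _license[] SEC("license") = "GPL";
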
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index b920358dd8f7..576c41644e77 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -70,12 +70,8 @@
70#define INIT_OPS_HASH(opsname) \ 70#define INIT_OPS_HASH(opsname) \
71 .func_hash = &opsname.local_hash, \ 71 .func_hash = &opsname.local_hash, \
72 .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock), 72 .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock),
73#define ASSIGN_OPS_HASH(opsname, val) \
74 .func_hash = val, \
75 .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock),
76#else 73#else
77#define INIT_OPS_HASH(opsname) 74#define INIT_OPS_HASH(opsname)
78#define ASSIGN_OPS_HASH(opsname, val)
79#endif 75#endif
80 76
81enum { 77enum {
@@ -2939,14 +2935,13 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
2939 p = &pg->records[i]; 2935 p = &pg->records[i];
2940 p->flags = rec_flags; 2936 p->flags = rec_flags;
2941 2937
2942#ifndef CC_USING_NOP_MCOUNT
2943 /* 2938 /*
2944 * Do the initial record conversion from mcount jump 2939 * Do the initial record conversion from mcount jump
2945 * to the NOP instructions. 2940 * to the NOP instructions.
2946 */ 2941 */
2947 if (!ftrace_code_disable(mod, p)) 2942 if (!__is_defined(CC_USING_NOP_MCOUNT) &&
2943 !ftrace_code_disable(mod, p))
2948 break; 2944 break;
2949#endif
2950 2945
2951 update_cnt++; 2946 update_cnt++;
2952 } 2947 }
@@ -3880,7 +3875,7 @@ static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops,
3880static bool module_exists(const char *module) 3875static bool module_exists(const char *module)
3881{ 3876{
3882 /* All modules have the symbol __this_module */ 3877 /* All modules have the symbol __this_module */
3883 const char this_mod[] = "__this_module"; 3878 static const char this_mod[] = "__this_module";
3884 char modname[MAX_PARAM_PREFIX_LEN + sizeof(this_mod) + 2]; 3879 char modname[MAX_PARAM_PREFIX_LEN + sizeof(this_mod) + 2];
3885 unsigned long val; 3880 unsigned long val;
3886 int n; 3881 int n;
@@ -4225,10 +4220,13 @@ void free_ftrace_func_mapper(struct ftrace_func_mapper *mapper,
4225 struct ftrace_func_entry *entry; 4220 struct ftrace_func_entry *entry;
4226 struct ftrace_func_map *map; 4221 struct ftrace_func_map *map;
4227 struct hlist_head *hhd; 4222 struct hlist_head *hhd;
4228 int size = 1 << mapper->hash.size_bits; 4223 int size, i;
4229 int i; 4224
4225 if (!mapper)
4226 return;
4230 4227
4231 if (free_func && mapper->hash.count) { 4228 if (free_func && mapper->hash.count) {
4229 size = 1 << mapper->hash.size_bits;
4232 for (i = 0; i < size; i++) { 4230 for (i = 0; i < size; i++) {
4233 hhd = &mapper->hash.buckets[i]; 4231 hhd = &mapper->hash.buckets[i];
4234 hlist_for_each_entry(entry, hhd, hlist) { 4232 hlist_for_each_entry(entry, hhd, hlist) {
@@ -6265,6 +6263,9 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
6265 preempt_disable_notrace(); 6263 preempt_disable_notrace();
6266 6264
6267 do_for_each_ftrace_op(op, ftrace_ops_list) { 6265 do_for_each_ftrace_op(op, ftrace_ops_list) {
6266 /* Stub functions don't need to be called nor tested */
6267 if (op->flags & FTRACE_OPS_FL_STUB)
6268 continue;
6268 /* 6269 /*
6269 * Check the following for each ops before calling their func: 6270 * Check the following for each ops before calling their func:
6270 * if RCU flag is set, then rcu_is_watching() must be true 6271 * if RCU flag is set, then rcu_is_watching() must be true
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 4ee8d8aa3d0f..05b0b3139ebc 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -4979,7 +4979,7 @@ static __init int rb_write_something(struct rb_test_data *data, bool nested)
4979 cnt = data->cnt + (nested ? 27 : 0); 4979 cnt = data->cnt + (nested ? 27 : 0);
4980 4980
4981 /* Multiply cnt by ~e, to make some unique increment */ 4981 /* Multiply cnt by ~e, to make some unique increment */
4982 size = (data->cnt * 68 / 25) % (sizeof(rb_string) - 1); 4982 size = (cnt * 68 / 25) % (sizeof(rb_string) - 1);
4983 4983
4984 len = size + sizeof(struct rb_item); 4984 len = size + sizeof(struct rb_item);
4985 4985
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index ffba6789c0e2..0564f6db0561 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -362,7 +362,7 @@ static void ring_buffer_producer(void)
362 hit--; /* make it non zero */ 362 hit--; /* make it non zero */
363 } 363 }
364 364
365 /* Caculate the average time in nanosecs */ 365 /* Calculate the average time in nanosecs */
366 avg = NSEC_PER_MSEC / (hit + missed); 366 avg = NSEC_PER_MSEC / (hit + missed);
367 trace_printk("%ld ns per entry\n", avg); 367 trace_printk("%ld ns per entry\n", avg);
368 } 368 }
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ec439999f387..c90c687cf950 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1727,6 +1727,10 @@ static __init int init_trace_selftests(void)
1727 pr_info("Running postponed tracer tests:\n"); 1727 pr_info("Running postponed tracer tests:\n");
1728 1728
1729 list_for_each_entry_safe(p, n, &postponed_selftests, list) { 1729 list_for_each_entry_safe(p, n, &postponed_selftests, list) {
1730 /* This loop can take minutes when sanitizers are enabled, so
 1731 * let's make sure we allow RCU processing.
1732 */
1733 cond_resched();
1730 ret = run_tracer_selftest(p->type); 1734 ret = run_tracer_selftest(p->type);
1731 /* If the test fails, then warn and remove from available_tracers */ 1735 /* If the test fails, then warn and remove from available_tracers */
1732 if (ret < 0) { 1736 if (ret < 0) {
@@ -3045,6 +3049,7 @@ void trace_printk_init_buffers(void)
3045 if (global_trace.trace_buffer.buffer) 3049 if (global_trace.trace_buffer.buffer)
3046 tracing_start_cmdline_record(); 3050 tracing_start_cmdline_record();
3047} 3051}
3052EXPORT_SYMBOL_GPL(trace_printk_init_buffers);
3048 3053
3049void trace_printk_start_comm(void) 3054void trace_printk_start_comm(void)
3050{ 3055{
@@ -3205,6 +3210,7 @@ int trace_array_printk(struct trace_array *tr,
3205 va_end(ap); 3210 va_end(ap);
3206 return ret; 3211 return ret;
3207} 3212}
3213EXPORT_SYMBOL_GPL(trace_array_printk);
3208 3214
3209__printf(3, 4) 3215__printf(3, 4)
3210int trace_array_printk_buf(struct ring_buffer *buffer, 3216int trace_array_printk_buf(struct ring_buffer *buffer,
@@ -3483,33 +3489,68 @@ static void s_stop(struct seq_file *m, void *p)
3483} 3489}
3484 3490
3485static void 3491static void
3492get_total_entries_cpu(struct trace_buffer *buf, unsigned long *total,
3493 unsigned long *entries, int cpu)
3494{
3495 unsigned long count;
3496
3497 count = ring_buffer_entries_cpu(buf->buffer, cpu);
3498 /*
3499 * If this buffer has skipped entries, then we hold all
3500 * entries for the trace and we need to ignore the
3501 * ones before the time stamp.
3502 */
3503 if (per_cpu_ptr(buf->data, cpu)->skipped_entries) {
3504 count -= per_cpu_ptr(buf->data, cpu)->skipped_entries;
3505 /* total is the same as the entries */
3506 *total = count;
3507 } else
3508 *total = count +
3509 ring_buffer_overrun_cpu(buf->buffer, cpu);
3510 *entries = count;
3511}
3512
3513static void
3486get_total_entries(struct trace_buffer *buf, 3514get_total_entries(struct trace_buffer *buf,
3487 unsigned long *total, unsigned long *entries) 3515 unsigned long *total, unsigned long *entries)
3488{ 3516{
3489 unsigned long count; 3517 unsigned long t, e;
3490 int cpu; 3518 int cpu;
3491 3519
3492 *total = 0; 3520 *total = 0;
3493 *entries = 0; 3521 *entries = 0;
3494 3522
3495 for_each_tracing_cpu(cpu) { 3523 for_each_tracing_cpu(cpu) {
3496 count = ring_buffer_entries_cpu(buf->buffer, cpu); 3524 get_total_entries_cpu(buf, &t, &e, cpu);
3497 /* 3525 *total += t;
3498 * If this buffer has skipped entries, then we hold all 3526 *entries += e;
3499 * entries for the trace and we need to ignore the
3500 * ones before the time stamp.
3501 */
3502 if (per_cpu_ptr(buf->data, cpu)->skipped_entries) {
3503 count -= per_cpu_ptr(buf->data, cpu)->skipped_entries;
3504 /* total is the same as the entries */
3505 *total += count;
3506 } else
3507 *total += count +
3508 ring_buffer_overrun_cpu(buf->buffer, cpu);
3509 *entries += count;
3510 } 3527 }
3511} 3528}
3512 3529
3530unsigned long trace_total_entries_cpu(struct trace_array *tr, int cpu)
3531{
3532 unsigned long total, entries;
3533
3534 if (!tr)
3535 tr = &global_trace;
3536
3537 get_total_entries_cpu(&tr->trace_buffer, &total, &entries, cpu);
3538
3539 return entries;
3540}
3541
3542unsigned long trace_total_entries(struct trace_array *tr)
3543{
3544 unsigned long total, entries;
3545
3546 if (!tr)
3547 tr = &global_trace;
3548
3549 get_total_entries(&tr->trace_buffer, &total, &entries);
3550
3551 return entries;
3552}
3553
3513static void print_lat_help_header(struct seq_file *m) 3554static void print_lat_help_header(struct seq_file *m)
3514{ 3555{
3515 seq_puts(m, "# _------=> CPU# \n" 3556 seq_puts(m, "# _------=> CPU# \n"
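
The new trace_total_entries_cpu()/trace_total_entries() helpers above give other kernel code a way to read entry counts without walking the ring buffer itself. A minimal sketch of a caller, assuming the top-level trace array is wanted (NULL selects it, as the helpers show) and that trace.h plus the printk helpers are already in scope; report_trace_usage() is a made-up name:

	/* Sketch only: query the global trace array's entry counts. */
	static void report_trace_usage(void)
	{
		unsigned long all  = trace_total_entries(NULL);
		unsigned long cpu0 = trace_total_entries_cpu(NULL, 0);

		pr_info("trace buffer: %lu entries total, %lu on CPU 0\n",
			all, cpu0);
	}
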
@@ -3548,25 +3589,18 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file
3548 unsigned int flags) 3589 unsigned int flags)
3549{ 3590{
3550 bool tgid = flags & TRACE_ITER_RECORD_TGID; 3591 bool tgid = flags & TRACE_ITER_RECORD_TGID;
3551 const char tgid_space[] = " "; 3592 const char *space = " ";
3552 const char space[] = " "; 3593 int prec = tgid ? 10 : 2;
3553 3594
3554 print_event_info(buf, m); 3595 print_event_info(buf, m);
3555 3596
3556 seq_printf(m, "# %s _-----=> irqs-off\n", 3597 seq_printf(m, "# %.*s _-----=> irqs-off\n", prec, space);
3557 tgid ? tgid_space : space); 3598 seq_printf(m, "# %.*s / _----=> need-resched\n", prec, space);
3558 seq_printf(m, "# %s / _----=> need-resched\n", 3599 seq_printf(m, "# %.*s| / _---=> hardirq/softirq\n", prec, space);
3559 tgid ? tgid_space : space); 3600 seq_printf(m, "# %.*s|| / _--=> preempt-depth\n", prec, space);
3560 seq_printf(m, "# %s| / _---=> hardirq/softirq\n", 3601 seq_printf(m, "# %.*s||| / delay\n", prec, space);
3561 tgid ? tgid_space : space); 3602 seq_printf(m, "# TASK-PID %.*sCPU# |||| TIMESTAMP FUNCTION\n", prec, " TGID ");
3562 seq_printf(m, "# %s|| / _--=> preempt-depth\n", 3603 seq_printf(m, "# | | %.*s | |||| | |\n", prec, " | ");
3563 tgid ? tgid_space : space);
3564 seq_printf(m, "# %s||| / delay\n",
3565 tgid ? tgid_space : space);
3566 seq_printf(m, "# TASK-PID %sCPU# |||| TIMESTAMP FUNCTION\n",
3567 tgid ? " TGID " : space);
3568 seq_printf(m, "# | | %s | |||| | |\n",
3569 tgid ? " | " : space);
3570} 3604}
3571 3605
3572void 3606void
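
The header rewrite above leans on printf string precision: "%.*s" emits at most prec characters of its string argument, so one long padding string covers both the 10-column TGID layout and the 2-column default. A small stand-alone illustration of the idiom (user-space C, names invented):

	#include <stdio.h>

	int main(void)
	{
		const char *space = "          ";	/* longer than either width */
		int tgid = 1;
		int prec = tgid ? 10 : 2;

		/* prints exactly 'prec' leading spaces */
		printf("# %.*s _-----=> irqs-off\n", prec, space);
		return 0;
	}
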
@@ -4692,6 +4726,7 @@ static const char readme_msg[] =
4692 " trace_pipe\t\t- A consuming read to see the contents of the buffer\n" 4726 " trace_pipe\t\t- A consuming read to see the contents of the buffer\n"
4693 " current_tracer\t- function and latency tracers\n" 4727 " current_tracer\t- function and latency tracers\n"
4694 " available_tracers\t- list of configured tracers for current_tracer\n" 4728 " available_tracers\t- list of configured tracers for current_tracer\n"
4729 " error_log\t- error log for failed commands (that support it)\n"
4695 " buffer_size_kb\t- view and modify size of per cpu buffer\n" 4730 " buffer_size_kb\t- view and modify size of per cpu buffer\n"
4696 " buffer_total_size_kb - view total size of all cpu buffers\n\n" 4731 " buffer_total_size_kb - view total size of all cpu buffers\n\n"
4697 " trace_clock\t\t-change the clock used to order events\n" 4732 " trace_clock\t\t-change the clock used to order events\n"
@@ -4712,7 +4747,7 @@ static const char readme_msg[] =
4712 " instances\t\t- Make sub-buffers with: mkdir instances/foo\n" 4747 " instances\t\t- Make sub-buffers with: mkdir instances/foo\n"
4713 "\t\t\t Remove sub-buffer with rmdir\n" 4748 "\t\t\t Remove sub-buffer with rmdir\n"
4714 " trace_options\t\t- Set format or modify how tracing happens\n" 4749 " trace_options\t\t- Set format or modify how tracing happens\n"
4715 "\t\t\t Disable an option by adding a suffix 'no' to the\n" 4750 "\t\t\t Disable an option by prefixing 'no' to the\n"
4716 "\t\t\t option name\n" 4751 "\t\t\t option name\n"
4717 " saved_cmdlines_size\t- echo command number in here to store comm-pid list\n" 4752 " saved_cmdlines_size\t- echo command number in here to store comm-pid list\n"
4718#ifdef CONFIG_DYNAMIC_FTRACE 4753#ifdef CONFIG_DYNAMIC_FTRACE
@@ -6296,13 +6331,13 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
6296 struct ring_buffer *buffer; 6331 struct ring_buffer *buffer;
6297 struct print_entry *entry; 6332 struct print_entry *entry;
6298 unsigned long irq_flags; 6333 unsigned long irq_flags;
6299 const char faulted[] = "<faulted>";
6300 ssize_t written; 6334 ssize_t written;
6301 int size; 6335 int size;
6302 int len; 6336 int len;
6303 6337
6304/* Used in tracing_mark_raw_write() as well */ 6338/* Used in tracing_mark_raw_write() as well */
6305#define FAULTED_SIZE (sizeof(faulted) - 1) /* '\0' is already accounted for */ 6339#define FAULTED_STR "<faulted>"
6340#define FAULTED_SIZE (sizeof(FAULTED_STR) - 1) /* '\0' is already accounted for */
6306 6341
6307 if (tracing_disabled) 6342 if (tracing_disabled)
6308 return -EINVAL; 6343 return -EINVAL;
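
FAULTED_SIZE works because sizeof() on a string literal counts the terminating NUL; subtracting one leaves the nine visible characters of "<faulted>". A quick stand-alone check of that arithmetic:

	#include <stdio.h>

	#define FAULTED_STR "<faulted>"
	#define FAULTED_SIZE (sizeof(FAULTED_STR) - 1)

	int main(void)
	{
		/* sizeof(FAULTED_STR) == 10, FAULTED_SIZE == 9 */
		printf("%zu %zu\n", sizeof(FAULTED_STR), FAULTED_SIZE);
		return 0;
	}
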
@@ -6334,7 +6369,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
6334 6369
6335 len = __copy_from_user_inatomic(&entry->buf, ubuf, cnt); 6370 len = __copy_from_user_inatomic(&entry->buf, ubuf, cnt);
6336 if (len) { 6371 if (len) {
6337 memcpy(&entry->buf, faulted, FAULTED_SIZE); 6372 memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE);
6338 cnt = FAULTED_SIZE; 6373 cnt = FAULTED_SIZE;
6339 written = -EFAULT; 6374 written = -EFAULT;
6340 } else 6375 } else
@@ -6375,7 +6410,6 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
6375 struct ring_buffer_event *event; 6410 struct ring_buffer_event *event;
6376 struct ring_buffer *buffer; 6411 struct ring_buffer *buffer;
6377 struct raw_data_entry *entry; 6412 struct raw_data_entry *entry;
6378 const char faulted[] = "<faulted>";
6379 unsigned long irq_flags; 6413 unsigned long irq_flags;
6380 ssize_t written; 6414 ssize_t written;
6381 int size; 6415 int size;
@@ -6415,7 +6449,7 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
6415 len = __copy_from_user_inatomic(&entry->id, ubuf, cnt); 6449 len = __copy_from_user_inatomic(&entry->id, ubuf, cnt);
6416 if (len) { 6450 if (len) {
6417 entry->id = -1; 6451 entry->id = -1;
6418 memcpy(&entry->buf, faulted, FAULTED_SIZE); 6452 memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE);
6419 written = -EFAULT; 6453 written = -EFAULT;
6420 } else 6454 } else
6421 written = cnt; 6455 written = cnt;
@@ -6685,11 +6719,13 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
6685 break; 6719 break;
6686 } 6720 }
6687#endif 6721#endif
6688 if (!tr->allocated_snapshot) { 6722 if (tr->allocated_snapshot)
6723 ret = resize_buffer_duplicate_size(&tr->max_buffer,
6724 &tr->trace_buffer, iter->cpu_file);
6725 else
6689 ret = tracing_alloc_snapshot_instance(tr); 6726 ret = tracing_alloc_snapshot_instance(tr);
6690 if (ret < 0) 6727 if (ret < 0)
6691 break; 6728 break;
6692 }
6693 local_irq_disable(); 6729 local_irq_disable();
6694 /* Now, we're going to swap */ 6730 /* Now, we're going to swap */
6695 if (iter->cpu_file == RING_BUFFER_ALL_CPUS) 6731 if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
@@ -6868,6 +6904,250 @@ static const struct file_operations snapshot_raw_fops = {
6868 6904
6869#endif /* CONFIG_TRACER_SNAPSHOT */ 6905#endif /* CONFIG_TRACER_SNAPSHOT */
6870 6906
6907#define TRACING_LOG_ERRS_MAX 8
6908#define TRACING_LOG_LOC_MAX 128
6909
6910#define CMD_PREFIX " Command: "
6911
6912struct err_info {
6913 const char **errs; /* ptr to loc-specific array of err strings */
6914 u8 type; /* index into errs -> specific err string */
6915 u8 pos; /* MAX_FILTER_STR_VAL = 256 */
6916 u64 ts;
6917};
6918
6919struct tracing_log_err {
6920 struct list_head list;
6921 struct err_info info;
6922 char loc[TRACING_LOG_LOC_MAX]; /* err location */
6923 char cmd[MAX_FILTER_STR_VAL]; /* what caused err */
6924};
6925
6926static DEFINE_MUTEX(tracing_err_log_lock);
6927
6928static struct tracing_log_err *get_tracing_log_err(struct trace_array *tr)
6929{
6930 struct tracing_log_err *err;
6931
6932 if (tr->n_err_log_entries < TRACING_LOG_ERRS_MAX) {
6933 err = kzalloc(sizeof(*err), GFP_KERNEL);
6934 if (!err)
6935 err = ERR_PTR(-ENOMEM);
6936 tr->n_err_log_entries++;
6937
6938 return err;
6939 }
6940
6941 err = list_first_entry(&tr->err_log, struct tracing_log_err, list);
6942 list_del(&err->list);
6943
6944 return err;
6945}
6946
6947/**
6948 * err_pos - find the position of a string within a command for error careting
6949 * @cmd: The tracing command that caused the error
6950 * @str: The string to position the caret at within @cmd
6951 *
 6952 * Finds the position of the first occurrence of @str within @cmd. The
6953 * return value can be passed to tracing_log_err() for caret placement
6954 * within @cmd.
6955 *
 6956 * Returns the index within @cmd of the first occurrence of @str or 0
6957 * if @str was not found.
6958 */
6959unsigned int err_pos(char *cmd, const char *str)
6960{
6961 char *found;
6962
6963 if (WARN_ON(!strlen(cmd)))
6964 return 0;
6965
6966 found = strstr(cmd, str);
6967 if (found)
6968 return found - cmd;
6969
6970 return 0;
6971}
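
Concretely, err_pos() just returns a byte offset that later becomes the caret column. A hedged sketch with an invented hist command (demo_caret_pos() is not part of the diff):

	/* Sketch: locate an offending token for caret placement. */
	static unsigned int demo_caret_pos(void)
	{
		char cmd[] = "keys=common_pid:onmax(bogus)";

		/* "bogus" starts at offset 22; 0 would mean "not found" */
		return err_pos(cmd, "bogus");
	}
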
6972
6973/**
6974 * tracing_log_err - write an error to the tracing error log
6975 * @tr: The associated trace array for the error (NULL for top level array)
6976 * @loc: A string describing where the error occurred
6977 * @cmd: The tracing command that caused the error
6978 * @errs: The array of loc-specific static error strings
6979 * @type: The index into errs[], which produces the specific static err string
6980 * @pos: The position the caret should be placed in the cmd
6981 *
6982 * Writes an error into tracing/error_log of the form:
6983 *
6984 * <loc>: error: <text>
6985 * Command: <cmd>
6986 * ^
6987 *
6988 * tracing/error_log is a small log file containing the last
6989 * TRACING_LOG_ERRS_MAX errors (8). Memory for errors isn't allocated
6990 * unless there has been a tracing error, and the error log can be
6991 * cleared and have its memory freed by writing the empty string in
 6992 * truncation mode to it, i.e. echo > tracing/error_log.
6993 *
 6994 * NOTE: the @errs array and the @type param are used to
6995 * produce a static error string - this string is not copied and saved
6996 * when the error is logged - only a pointer to it is saved. See
6997 * existing callers for examples of how static strings are typically
6998 * defined for use with tracing_log_err().
6999 */
7000void tracing_log_err(struct trace_array *tr,
7001 const char *loc, const char *cmd,
7002 const char **errs, u8 type, u8 pos)
7003{
7004 struct tracing_log_err *err;
7005
7006 if (!tr)
7007 tr = &global_trace;
7008
7009 mutex_lock(&tracing_err_log_lock);
7010 err = get_tracing_log_err(tr);
7011 if (PTR_ERR(err) == -ENOMEM) {
7012 mutex_unlock(&tracing_err_log_lock);
7013 return;
7014 }
7015
7016 snprintf(err->loc, TRACING_LOG_LOC_MAX, "%s: error: ", loc);
 7017 snprintf(err->cmd, MAX_FILTER_STR_VAL, "\n" CMD_PREFIX "%s\n", cmd);
7018
7019 err->info.errs = errs;
7020 err->info.type = type;
7021 err->info.pos = pos;
7022 err->info.ts = local_clock();
7023
7024 list_add_tail(&err->list, &tr->err_log);
7025 mutex_unlock(&tracing_err_log_lock);
7026}
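
Callers keep their error strings static and select one with @type; the filter and hist changes later in this series build those arrays with a C() x-macro, along the lines of this hedged sketch (DEMO_ERRORS, demo_errs, demo_log and the strings are invented):

	#define DEMO_ERRORS				\
		C(NONE,      "No error"),		\
		C(NOT_FOUND, "Couldn't find field")

	#undef C
	#define C(a, b)	DEMO_ERR_##a
	enum { DEMO_ERRORS };

	#undef C
	#define C(a, b)	b
	static const char *demo_errs[] = { DEMO_ERRORS };

	/* Logs "<loc>: error: Couldn't find field", the command and a caret. */
	static void demo_log(struct trace_array *tr, char *cmd, const char *field)
	{
		tracing_log_err(tr, "demo: system.event", cmd, demo_errs,
				DEMO_ERR_NOT_FOUND, err_pos(cmd, field));
	}
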
7027
7028static void clear_tracing_err_log(struct trace_array *tr)
7029{
7030 struct tracing_log_err *err, *next;
7031
7032 mutex_lock(&tracing_err_log_lock);
7033 list_for_each_entry_safe(err, next, &tr->err_log, list) {
7034 list_del(&err->list);
7035 kfree(err);
7036 }
7037
7038 tr->n_err_log_entries = 0;
7039 mutex_unlock(&tracing_err_log_lock);
7040}
7041
7042static void *tracing_err_log_seq_start(struct seq_file *m, loff_t *pos)
7043{
7044 struct trace_array *tr = m->private;
7045
7046 mutex_lock(&tracing_err_log_lock);
7047
7048 return seq_list_start(&tr->err_log, *pos);
7049}
7050
7051static void *tracing_err_log_seq_next(struct seq_file *m, void *v, loff_t *pos)
7052{
7053 struct trace_array *tr = m->private;
7054
7055 return seq_list_next(v, &tr->err_log, pos);
7056}
7057
7058static void tracing_err_log_seq_stop(struct seq_file *m, void *v)
7059{
7060 mutex_unlock(&tracing_err_log_lock);
7061}
7062
7063static void tracing_err_log_show_pos(struct seq_file *m, u8 pos)
7064{
7065 u8 i;
7066
7067 for (i = 0; i < sizeof(CMD_PREFIX) - 1; i++)
7068 seq_putc(m, ' ');
7069 for (i = 0; i < pos; i++)
7070 seq_putc(m, ' ');
7071 seq_puts(m, "^\n");
7072}
7073
7074static int tracing_err_log_seq_show(struct seq_file *m, void *v)
7075{
7076 struct tracing_log_err *err = v;
7077
7078 if (err) {
7079 const char *err_text = err->info.errs[err->info.type];
7080 u64 sec = err->info.ts;
7081 u32 nsec;
7082
7083 nsec = do_div(sec, NSEC_PER_SEC);
7084 seq_printf(m, "[%5llu.%06u] %s%s", sec, nsec / 1000,
7085 err->loc, err_text);
7086 seq_printf(m, "%s", err->cmd);
7087 tracing_err_log_show_pos(m, err->info.pos);
7088 }
7089
7090 return 0;
7091}
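
The timestamp handling above is the usual 64-bit split: do_div() leaves the quotient (seconds) in its first argument and returns the remainder (nanoseconds), which is then printed as microseconds. A hedged sketch of the same arithmetic with a made-up value:

	/* Sketch: format a local_clock() value the way error_log does. */
	static void demo_stamp(struct seq_file *m)
	{
		u64 sec = 1234567891234ULL;		/* example timestamp in ns */
		u32 nsec = do_div(sec, NSEC_PER_SEC);	/* sec = 1234, nsec = 567891234 */

		seq_printf(m, "[%5llu.%06u] ", sec, nsec / 1000);	/* "[ 1234.567891] " */
	}
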
7092
7093static const struct seq_operations tracing_err_log_seq_ops = {
7094 .start = tracing_err_log_seq_start,
7095 .next = tracing_err_log_seq_next,
7096 .stop = tracing_err_log_seq_stop,
7097 .show = tracing_err_log_seq_show
7098};
7099
7100static int tracing_err_log_open(struct inode *inode, struct file *file)
7101{
7102 struct trace_array *tr = inode->i_private;
7103 int ret = 0;
7104
7105 if (trace_array_get(tr) < 0)
7106 return -ENODEV;
7107
7108 /* If this file was opened for write, then erase contents */
7109 if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC))
7110 clear_tracing_err_log(tr);
7111
7112 if (file->f_mode & FMODE_READ) {
7113 ret = seq_open(file, &tracing_err_log_seq_ops);
7114 if (!ret) {
7115 struct seq_file *m = file->private_data;
7116 m->private = tr;
7117 } else {
7118 trace_array_put(tr);
7119 }
7120 }
7121 return ret;
7122}
7123
7124static ssize_t tracing_err_log_write(struct file *file,
7125 const char __user *buffer,
7126 size_t count, loff_t *ppos)
7127{
7128 return count;
7129}
7130
7131static int tracing_err_log_release(struct inode *inode, struct file *file)
7132{
7133 struct trace_array *tr = inode->i_private;
7134
7135 trace_array_put(tr);
7136
7137 if (file->f_mode & FMODE_READ)
7138 seq_release(inode, file);
7139
7140 return 0;
7141}
7142
7143static const struct file_operations tracing_err_log_fops = {
7144 .open = tracing_err_log_open,
7145 .write = tracing_err_log_write,
7146 .read = seq_read,
7147 .llseek = seq_lseek,
7148 .release = tracing_err_log_release,
7149};
7150
6871static int tracing_buffers_open(struct inode *inode, struct file *filp) 7151static int tracing_buffers_open(struct inode *inode, struct file *filp)
6872{ 7152{
6873 struct trace_array *tr = inode->i_private; 7153 struct trace_array *tr = inode->i_private;
@@ -7926,7 +8206,7 @@ static const struct file_operations buffer_percent_fops = {
7926 .llseek = default_llseek, 8206 .llseek = default_llseek,
7927}; 8207};
7928 8208
7929struct dentry *trace_instance_dir; 8209static struct dentry *trace_instance_dir;
7930 8210
7931static void 8211static void
7932init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer); 8212init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer);
@@ -8033,7 +8313,7 @@ static void update_tracer_options(struct trace_array *tr)
8033 mutex_unlock(&trace_types_lock); 8313 mutex_unlock(&trace_types_lock);
8034} 8314}
8035 8315
8036static int instance_mkdir(const char *name) 8316struct trace_array *trace_array_create(const char *name)
8037{ 8317{
8038 struct trace_array *tr; 8318 struct trace_array *tr;
8039 int ret; 8319 int ret;
@@ -8072,6 +8352,7 @@ static int instance_mkdir(const char *name)
8072 INIT_LIST_HEAD(&tr->systems); 8352 INIT_LIST_HEAD(&tr->systems);
8073 INIT_LIST_HEAD(&tr->events); 8353 INIT_LIST_HEAD(&tr->events);
8074 INIT_LIST_HEAD(&tr->hist_vars); 8354 INIT_LIST_HEAD(&tr->hist_vars);
8355 INIT_LIST_HEAD(&tr->err_log);
8075 8356
8076 if (allocate_trace_buffers(tr, trace_buf_size) < 0) 8357 if (allocate_trace_buffers(tr, trace_buf_size) < 0)
8077 goto out_free_tr; 8358 goto out_free_tr;
@@ -8097,7 +8378,7 @@ static int instance_mkdir(const char *name)
8097 mutex_unlock(&trace_types_lock); 8378 mutex_unlock(&trace_types_lock);
8098 mutex_unlock(&event_mutex); 8379 mutex_unlock(&event_mutex);
8099 8380
8100 return 0; 8381 return tr;
8101 8382
8102 out_free_tr: 8383 out_free_tr:
8103 free_trace_buffers(tr); 8384 free_trace_buffers(tr);
@@ -8109,33 +8390,21 @@ static int instance_mkdir(const char *name)
8109 mutex_unlock(&trace_types_lock); 8390 mutex_unlock(&trace_types_lock);
8110 mutex_unlock(&event_mutex); 8391 mutex_unlock(&event_mutex);
8111 8392
8112 return ret; 8393 return ERR_PTR(ret);
8394}
8395EXPORT_SYMBOL_GPL(trace_array_create);
8113 8396
8397static int instance_mkdir(const char *name)
8398{
8399 return PTR_ERR_OR_ZERO(trace_array_create(name));
8114} 8400}
8115 8401
8116static int instance_rmdir(const char *name) 8402static int __remove_instance(struct trace_array *tr)
8117{ 8403{
8118 struct trace_array *tr;
8119 int found = 0;
8120 int ret;
8121 int i; 8404 int i;
8122 8405
8123 mutex_lock(&event_mutex);
8124 mutex_lock(&trace_types_lock);
8125
8126 ret = -ENODEV;
8127 list_for_each_entry(tr, &ftrace_trace_arrays, list) {
8128 if (tr->name && strcmp(tr->name, name) == 0) {
8129 found = 1;
8130 break;
8131 }
8132 }
8133 if (!found)
8134 goto out_unlock;
8135
8136 ret = -EBUSY;
8137 if (tr->ref || (tr->current_trace && tr->current_trace->ref)) 8406 if (tr->ref || (tr->current_trace && tr->current_trace->ref))
8138 goto out_unlock; 8407 return -EBUSY;
8139 8408
8140 list_del(&tr->list); 8409 list_del(&tr->list);
8141 8410
@@ -8161,10 +8430,46 @@ static int instance_rmdir(const char *name)
8161 free_cpumask_var(tr->tracing_cpumask); 8430 free_cpumask_var(tr->tracing_cpumask);
8162 kfree(tr->name); 8431 kfree(tr->name);
8163 kfree(tr); 8432 kfree(tr);
8433 tr = NULL;
8164 8434
8165 ret = 0; 8435 return 0;
8436}
8437
8438int trace_array_destroy(struct trace_array *tr)
8439{
8440 int ret;
8441
8442 if (!tr)
8443 return -EINVAL;
8444
8445 mutex_lock(&event_mutex);
8446 mutex_lock(&trace_types_lock);
8447
8448 ret = __remove_instance(tr);
8449
8450 mutex_unlock(&trace_types_lock);
8451 mutex_unlock(&event_mutex);
8452
8453 return ret;
8454}
8455EXPORT_SYMBOL_GPL(trace_array_destroy);
8456
8457static int instance_rmdir(const char *name)
8458{
8459 struct trace_array *tr;
8460 int ret;
8461
8462 mutex_lock(&event_mutex);
8463 mutex_lock(&trace_types_lock);
8464
8465 ret = -ENODEV;
8466 list_for_each_entry(tr, &ftrace_trace_arrays, list) {
8467 if (tr->name && strcmp(tr->name, name) == 0) {
8468 ret = __remove_instance(tr);
8469 break;
8470 }
8471 }
8166 8472
8167 out_unlock:
8168 mutex_unlock(&trace_types_lock); 8473 mutex_unlock(&trace_types_lock);
8169 mutex_unlock(&event_mutex); 8474 mutex_unlock(&event_mutex);
8170 8475
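
With trace_array_create(), trace_array_destroy() and trace_array_printk() exported, a module can drive its own tracing instance without going through instances/ in tracefs. A minimal hedged sketch (module and message names invented; which header carries the new prototypes is an assumption, adjust to wherever they are declared):

	#include <linux/module.h>
	#include <linux/kernel.h>
	#include <linux/err.h>
	#include <linux/trace.h>	/* assumed home of the new prototypes */

	static struct trace_array *demo_tr;

	static int __init demo_init(void)
	{
		demo_tr = trace_array_create("demo_instance");
		if (IS_ERR(demo_tr))
			return PTR_ERR(demo_tr);

		trace_array_printk(demo_tr, _THIS_IP_, "demo instance created\n");
		return 0;
	}

	static void __exit demo_exit(void)
	{
		trace_array_destroy(demo_tr);
	}

	module_init(demo_init);
	module_exit(demo_exit);
	MODULE_LICENSE("GPL");
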
@@ -8254,6 +8559,9 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
8254 tr, &snapshot_fops); 8559 tr, &snapshot_fops);
8255#endif 8560#endif
8256 8561
8562 trace_create_file("error_log", 0644, d_tracer,
8563 tr, &tracing_err_log_fops);
8564
8257 for_each_tracing_cpu(cpu) 8565 for_each_tracing_cpu(cpu)
8258 tracing_init_tracefs_percpu(tr, cpu); 8566 tracing_init_tracefs_percpu(tr, cpu);
8259 8567
@@ -8310,10 +8618,6 @@ struct dentry *tracing_init_dentry(void)
8310 */ 8618 */
8311 tr->dir = debugfs_create_automount("tracing", NULL, 8619 tr->dir = debugfs_create_automount("tracing", NULL,
8312 trace_automount, NULL); 8620 trace_automount, NULL);
8313 if (!tr->dir) {
8314 pr_warn_once("Could not create debugfs directory 'tracing'\n");
8315 return ERR_PTR(-ENOMEM);
8316 }
8317 8621
8318 return NULL; 8622 return NULL;
8319} 8623}
@@ -8616,12 +8920,8 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
8616 8920
8617 cnt++; 8921 cnt++;
8618 8922
8619 /* reset all but tr, trace, and overruns */ 8923 trace_iterator_reset(&iter);
8620 memset(&iter.seq, 0,
8621 sizeof(struct trace_iterator) -
8622 offsetof(struct trace_iterator, seq));
8623 iter.iter_flags |= TRACE_FILE_LAT_FMT; 8924 iter.iter_flags |= TRACE_FILE_LAT_FMT;
8624 iter.pos = -1;
8625 8925
8626 if (trace_find_next_entry_inc(&iter) != NULL) { 8926 if (trace_find_next_entry_inc(&iter) != NULL) {
8627 int ret; 8927 int ret;
@@ -8839,6 +9139,7 @@ __init static int tracer_alloc_buffers(void)
8839 INIT_LIST_HEAD(&global_trace.systems); 9139 INIT_LIST_HEAD(&global_trace.systems);
8840 INIT_LIST_HEAD(&global_trace.events); 9140 INIT_LIST_HEAD(&global_trace.events);
8841 INIT_LIST_HEAD(&global_trace.hist_vars); 9141 INIT_LIST_HEAD(&global_trace.hist_vars);
9142 INIT_LIST_HEAD(&global_trace.err_log);
8842 list_add(&global_trace.list, &ftrace_trace_arrays); 9143 list_add(&global_trace.list, &ftrace_trace_arrays);
8843 9144
8844 apply_trace_boot_options(); 9145 apply_trace_boot_options();
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 639047b259d7..005f08629b8b 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -15,7 +15,6 @@
15#include <linux/trace_seq.h> 15#include <linux/trace_seq.h>
16#include <linux/trace_events.h> 16#include <linux/trace_events.h>
17#include <linux/compiler.h> 17#include <linux/compiler.h>
18#include <linux/trace_seq.h>
19#include <linux/glob.h> 18#include <linux/glob.h>
20 19
21#ifdef CONFIG_FTRACE_SYSCALLS 20#ifdef CONFIG_FTRACE_SYSCALLS
@@ -293,11 +292,13 @@ struct trace_array {
293 int nr_topts; 292 int nr_topts;
294 bool clear_trace; 293 bool clear_trace;
295 int buffer_percent; 294 int buffer_percent;
295 unsigned int n_err_log_entries;
296 struct tracer *current_trace; 296 struct tracer *current_trace;
297 unsigned int trace_flags; 297 unsigned int trace_flags;
298 unsigned char trace_flags_index[TRACE_FLAGS_MAX_SIZE]; 298 unsigned char trace_flags_index[TRACE_FLAGS_MAX_SIZE];
299 unsigned int flags; 299 unsigned int flags;
300 raw_spinlock_t start_lock; 300 raw_spinlock_t start_lock;
301 struct list_head err_log;
301 struct dentry *dir; 302 struct dentry *dir;
302 struct dentry *options; 303 struct dentry *options;
303 struct dentry *percpu_dir; 304 struct dentry *percpu_dir;
@@ -719,6 +720,9 @@ void trace_init_global_iter(struct trace_iterator *iter);
719 720
720void tracing_iter_reset(struct trace_iterator *iter, int cpu); 721void tracing_iter_reset(struct trace_iterator *iter, int cpu);
721 722
723unsigned long trace_total_entries_cpu(struct trace_array *tr, int cpu);
724unsigned long trace_total_entries(struct trace_array *tr);
725
722void trace_function(struct trace_array *tr, 726void trace_function(struct trace_array *tr,
723 unsigned long ip, 727 unsigned long ip,
724 unsigned long parent_ip, 728 unsigned long parent_ip,
@@ -1545,7 +1549,8 @@ extern int apply_subsystem_event_filter(struct trace_subsystem_dir *dir,
1545extern void print_subsystem_event_filter(struct event_subsystem *system, 1549extern void print_subsystem_event_filter(struct event_subsystem *system,
1546 struct trace_seq *s); 1550 struct trace_seq *s);
1547extern int filter_assign_type(const char *type); 1551extern int filter_assign_type(const char *type);
1548extern int create_event_filter(struct trace_event_call *call, 1552extern int create_event_filter(struct trace_array *tr,
1553 struct trace_event_call *call,
1549 char *filter_str, bool set_str, 1554 char *filter_str, bool set_str,
1550 struct event_filter **filterp); 1555 struct event_filter **filterp);
1551extern void free_event_filter(struct event_filter *filter); 1556extern void free_event_filter(struct event_filter *filter);
@@ -1876,6 +1881,11 @@ extern ssize_t trace_parse_run_command(struct file *file,
1876 const char __user *buffer, size_t count, loff_t *ppos, 1881 const char __user *buffer, size_t count, loff_t *ppos,
1877 int (*createfn)(int, char**)); 1882 int (*createfn)(int, char**));
1878 1883
1884extern unsigned int err_pos(char *cmd, const char *str);
1885extern void tracing_log_err(struct trace_array *tr,
1886 const char *loc, const char *cmd,
1887 const char **errs, u8 type, u8 pos);
1888
1879/* 1889/*
1880 * Normal trace_printk() and friends allocates special buffers 1890 * Normal trace_printk() and friends allocates special buffers
1881 * to do the manipulation, as well as saves the print formats 1891 * to do the manipulation, as well as saves the print formats
@@ -1956,4 +1966,22 @@ static inline void tracer_hardirqs_off(unsigned long a0, unsigned long a1) { }
1956 1966
1957extern struct trace_iterator *tracepoint_print_iter; 1967extern struct trace_iterator *tracepoint_print_iter;
1958 1968
1969/*
1970 * Reset the state of the trace_iterator so that it can read consumed data.
1971 * Normally, the trace_iterator is used for reading the data when it is not
1972 * consumed, and must retain state.
1973 */
1974static __always_inline void trace_iterator_reset(struct trace_iterator *iter)
1975{
1976 const size_t offset = offsetof(struct trace_iterator, seq);
1977
1978 /*
1979 * Keep gcc from complaining about overwriting more than just one
1980 * member in the structure.
1981 */
1982 memset((char *)iter + offset, 0, sizeof(struct trace_iterator) - offset);
1983
1984 iter->pos = -1;
1985}
1986
1959#endif /* _LINUX_KERNEL_TRACE_H */ 1987#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 5b3b0c3c8a47..0ce3db67f556 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -832,6 +832,7 @@ static int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set)
832 832
833 return ret; 833 return ret;
834} 834}
835EXPORT_SYMBOL_GPL(ftrace_set_clr_event);
835 836
836/** 837/**
837 * trace_set_clr_event - enable or disable an event 838 * trace_set_clr_event - enable or disable an event
@@ -1318,9 +1319,6 @@ event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
1318 char buf[32]; 1319 char buf[32];
1319 int len; 1320 int len;
1320 1321
1321 if (*ppos)
1322 return 0;
1323
1324 if (unlikely(!id)) 1322 if (unlikely(!id))
1325 return -ENODEV; 1323 return -ENODEV;
1326 1324
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 05a66493a164..5079d1db3754 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -66,7 +66,8 @@ static const char * ops[] = { OPS };
66 C(INVALID_FILTER, "Meaningless filter expression"), \ 66 C(INVALID_FILTER, "Meaningless filter expression"), \
67 C(IP_FIELD_ONLY, "Only 'ip' field is supported for function trace"), \ 67 C(IP_FIELD_ONLY, "Only 'ip' field is supported for function trace"), \
68 C(INVALID_VALUE, "Invalid value (did you forget quotes)?"), \ 68 C(INVALID_VALUE, "Invalid value (did you forget quotes)?"), \
69 C(NO_FILTER, "No filter found"), 69 C(ERRNO, "Error"), \
70 C(NO_FILTER, "No filter found")
70 71
71#undef C 72#undef C
72#define C(a, b) FILT_ERR_##a 73#define C(a, b) FILT_ERR_##a
@@ -76,7 +77,7 @@ enum { ERRORS };
76#undef C 77#undef C
77#define C(a, b) b 78#define C(a, b) b
78 79
79static char *err_text[] = { ERRORS }; 80static const char *err_text[] = { ERRORS };
80 81
81/* Called after a '!' character but "!=" and "!~" are not "not"s */ 82/* Called after a '!' character but "!=" and "!~" are not "not"s */
82static bool is_not(const char *str) 83static bool is_not(const char *str)
@@ -427,7 +428,7 @@ predicate_parse(const char *str, int nr_parens, int nr_preds,
427 op_stack = kmalloc_array(nr_parens, sizeof(*op_stack), GFP_KERNEL); 428 op_stack = kmalloc_array(nr_parens, sizeof(*op_stack), GFP_KERNEL);
428 if (!op_stack) 429 if (!op_stack)
429 return ERR_PTR(-ENOMEM); 430 return ERR_PTR(-ENOMEM);
430 prog_stack = kmalloc_array(nr_preds, sizeof(*prog_stack), GFP_KERNEL); 431 prog_stack = kcalloc(nr_preds, sizeof(*prog_stack), GFP_KERNEL);
431 if (!prog_stack) { 432 if (!prog_stack) {
432 parse_error(pe, -ENOMEM, 0); 433 parse_error(pe, -ENOMEM, 0);
433 goto out_free; 434 goto out_free;
@@ -578,7 +579,11 @@ predicate_parse(const char *str, int nr_parens, int nr_preds,
578out_free: 579out_free:
579 kfree(op_stack); 580 kfree(op_stack);
580 kfree(inverts); 581 kfree(inverts);
581 kfree(prog_stack); 582 if (prog_stack) {
583 for (i = 0; prog_stack[i].pred; i++)
584 kfree(prog_stack[i].pred);
585 kfree(prog_stack);
586 }
582 return ERR_PTR(ret); 587 return ERR_PTR(ret);
583} 588}
584 589
@@ -919,7 +924,8 @@ static void remove_filter_string(struct event_filter *filter)
919 filter->filter_string = NULL; 924 filter->filter_string = NULL;
920} 925}
921 926
922static void append_filter_err(struct filter_parse_error *pe, 927static void append_filter_err(struct trace_array *tr,
928 struct filter_parse_error *pe,
923 struct event_filter *filter) 929 struct event_filter *filter)
924{ 930{
925 struct trace_seq *s; 931 struct trace_seq *s;
@@ -947,8 +953,14 @@ static void append_filter_err(struct filter_parse_error *pe,
947 if (pe->lasterr > 0) { 953 if (pe->lasterr > 0) {
948 trace_seq_printf(s, "\n%*s", pos, "^"); 954 trace_seq_printf(s, "\n%*s", pos, "^");
949 trace_seq_printf(s, "\nparse_error: %s\n", err_text[pe->lasterr]); 955 trace_seq_printf(s, "\nparse_error: %s\n", err_text[pe->lasterr]);
956 tracing_log_err(tr, "event filter parse error",
957 filter->filter_string, err_text,
958 pe->lasterr, pe->lasterr_pos);
950 } else { 959 } else {
951 trace_seq_printf(s, "\nError: (%d)\n", pe->lasterr); 960 trace_seq_printf(s, "\nError: (%d)\n", pe->lasterr);
961 tracing_log_err(tr, "event filter parse error",
962 filter->filter_string, err_text,
963 FILT_ERR_ERRNO, 0);
952 } 964 }
953 trace_seq_putc(s, 0); 965 trace_seq_putc(s, 0);
954 buf = kmemdup_nul(s->buffer, s->seq.len, GFP_KERNEL); 966 buf = kmemdup_nul(s->buffer, s->seq.len, GFP_KERNEL);
@@ -1214,30 +1226,30 @@ static int parse_pred(const char *str, void *data,
1214 * (perf doesn't use it) and grab everything. 1226 * (perf doesn't use it) and grab everything.
1215 */ 1227 */
1216 if (strcmp(field->name, "ip") != 0) { 1228 if (strcmp(field->name, "ip") != 0) {
1217 parse_error(pe, FILT_ERR_IP_FIELD_ONLY, pos + i); 1229 parse_error(pe, FILT_ERR_IP_FIELD_ONLY, pos + i);
1218 goto err_free; 1230 goto err_free;
1219 } 1231 }
1220 pred->fn = filter_pred_none; 1232 pred->fn = filter_pred_none;
1221 1233
1222 /* 1234 /*
1223 * Quotes are not required, but if they exist then we need 1235 * Quotes are not required, but if they exist then we need
1224 * to read them till we hit a matching one. 1236 * to read them till we hit a matching one.
1225 */ 1237 */
1226 if (str[i] == '\'' || str[i] == '"') 1238 if (str[i] == '\'' || str[i] == '"')
1227 q = str[i]; 1239 q = str[i];
1228 else 1240 else
1229 q = 0; 1241 q = 0;
1230 1242
1231 for (i++; str[i]; i++) { 1243 for (i++; str[i]; i++) {
1232 if (q && str[i] == q) 1244 if (q && str[i] == q)
1233 break; 1245 break;
1234 if (!q && (str[i] == ')' || str[i] == '&' || 1246 if (!q && (str[i] == ')' || str[i] == '&' ||
1235 str[i] == '|')) 1247 str[i] == '|'))
1236 break; 1248 break;
1237 } 1249 }
1238 /* Skip quotes */ 1250 /* Skip quotes */
1239 if (q) 1251 if (q)
1240 s++; 1252 s++;
1241 len = i - s; 1253 len = i - s;
1242 if (len >= MAX_FILTER_STR_VAL) { 1254 if (len >= MAX_FILTER_STR_VAL) {
1243 parse_error(pe, FILT_ERR_OPERAND_TOO_LONG, pos + i); 1255 parse_error(pe, FILT_ERR_OPERAND_TOO_LONG, pos + i);
@@ -1600,7 +1612,7 @@ static int process_system_preds(struct trace_subsystem_dir *dir,
1600 if (err) { 1612 if (err) {
1601 filter_disable(file); 1613 filter_disable(file);
1602 parse_error(pe, FILT_ERR_BAD_SUBSYS_FILTER, 0); 1614 parse_error(pe, FILT_ERR_BAD_SUBSYS_FILTER, 0);
1603 append_filter_err(pe, filter); 1615 append_filter_err(tr, pe, filter);
1604 } else 1616 } else
1605 event_set_filtered_flag(file); 1617 event_set_filtered_flag(file);
1606 1618
@@ -1712,7 +1724,8 @@ static void create_filter_finish(struct filter_parse_error *pe)
1712 * information if @set_str is %true and the caller is responsible for 1724 * information if @set_str is %true and the caller is responsible for
1713 * freeing it. 1725 * freeing it.
1714 */ 1726 */
1715static int create_filter(struct trace_event_call *call, 1727static int create_filter(struct trace_array *tr,
1728 struct trace_event_call *call,
1716 char *filter_string, bool set_str, 1729 char *filter_string, bool set_str,
1717 struct event_filter **filterp) 1730 struct event_filter **filterp)
1718{ 1731{
@@ -1729,17 +1742,18 @@ static int create_filter(struct trace_event_call *call,
1729 1742
1730 err = process_preds(call, filter_string, *filterp, pe); 1743 err = process_preds(call, filter_string, *filterp, pe);
1731 if (err && set_str) 1744 if (err && set_str)
1732 append_filter_err(pe, *filterp); 1745 append_filter_err(tr, pe, *filterp);
1733 create_filter_finish(pe); 1746 create_filter_finish(pe);
1734 1747
1735 return err; 1748 return err;
1736} 1749}
1737 1750
1738int create_event_filter(struct trace_event_call *call, 1751int create_event_filter(struct trace_array *tr,
1752 struct trace_event_call *call,
1739 char *filter_str, bool set_str, 1753 char *filter_str, bool set_str,
1740 struct event_filter **filterp) 1754 struct event_filter **filterp)
1741{ 1755{
1742 return create_filter(call, filter_str, set_str, filterp); 1756 return create_filter(tr, call, filter_str, set_str, filterp);
1743} 1757}
1744 1758
1745/** 1759/**
@@ -1766,7 +1780,7 @@ static int create_system_filter(struct trace_subsystem_dir *dir,
1766 kfree((*filterp)->filter_string); 1780 kfree((*filterp)->filter_string);
1767 (*filterp)->filter_string = NULL; 1781 (*filterp)->filter_string = NULL;
1768 } else { 1782 } else {
1769 append_filter_err(pe, *filterp); 1783 append_filter_err(tr, pe, *filterp);
1770 } 1784 }
1771 } 1785 }
1772 create_filter_finish(pe); 1786 create_filter_finish(pe);
@@ -1797,7 +1811,7 @@ int apply_event_filter(struct trace_event_file *file, char *filter_string)
1797 return 0; 1811 return 0;
1798 } 1812 }
1799 1813
1800 err = create_filter(call, filter_string, true, &filter); 1814 err = create_filter(file->tr, call, filter_string, true, &filter);
1801 1815
1802 /* 1816 /*
1803 * Always swap the call filter with the new filter 1817 * Always swap the call filter with the new filter
@@ -2053,7 +2067,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
2053 if (event->filter) 2067 if (event->filter)
2054 goto out_unlock; 2068 goto out_unlock;
2055 2069
2056 err = create_filter(call, filter_str, false, &filter); 2070 err = create_filter(NULL, call, filter_str, false, &filter);
2057 if (err) 2071 if (err)
2058 goto free_filter; 2072 goto free_filter;
2059 2073
@@ -2202,8 +2216,8 @@ static __init int ftrace_test_event_filter(void)
2202 struct test_filter_data_t *d = &test_filter_data[i]; 2216 struct test_filter_data_t *d = &test_filter_data[i];
2203 int err; 2217 int err;
2204 2218
2205 err = create_filter(&event_ftrace_test_filter, d->filter, 2219 err = create_filter(NULL, &event_ftrace_test_filter,
2206 false, &filter); 2220 d->filter, false, &filter);
2207 if (err) { 2221 if (err) {
2208 printk(KERN_INFO 2222 printk(KERN_INFO
2209 "Failed to get filter for '%s', err %d\n", 2223 "Failed to get filter for '%s', err %d\n",
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index a1d20421f4b0..ca6b0dff60c5 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -22,6 +22,57 @@
22 22
23#define STR_VAR_LEN_MAX 32 /* must be multiple of sizeof(u64) */ 23#define STR_VAR_LEN_MAX 32 /* must be multiple of sizeof(u64) */
24 24
25#define ERRORS \
26 C(NONE, "No error"), \
27 C(DUPLICATE_VAR, "Variable already defined"), \
28 C(VAR_NOT_UNIQUE, "Variable name not unique, need to use fully qualified name (subsys.event.var) for variable"), \
29 C(TOO_MANY_VARS, "Too many variables defined"), \
30 C(MALFORMED_ASSIGNMENT, "Malformed assignment"), \
31 C(NAMED_MISMATCH, "Named hist trigger doesn't match existing named trigger (includes variables)"), \
32 C(TRIGGER_EEXIST, "Hist trigger already exists"), \
33 C(TRIGGER_ENOENT_CLEAR, "Can't clear or continue a nonexistent hist trigger"), \
34 C(SET_CLOCK_FAIL, "Couldn't set trace_clock"), \
35 C(BAD_FIELD_MODIFIER, "Invalid field modifier"), \
36 C(TOO_MANY_SUBEXPR, "Too many subexpressions (3 max)"), \
37 C(TIMESTAMP_MISMATCH, "Timestamp units in expression don't match"), \
38 C(TOO_MANY_FIELD_VARS, "Too many field variables defined"), \
39 C(EVENT_FILE_NOT_FOUND, "Event file not found"), \
40 C(HIST_NOT_FOUND, "Matching event histogram not found"), \
41 C(HIST_CREATE_FAIL, "Couldn't create histogram for field"), \
42 C(SYNTH_VAR_NOT_FOUND, "Couldn't find synthetic variable"), \
43 C(SYNTH_EVENT_NOT_FOUND,"Couldn't find synthetic event"), \
44 C(SYNTH_TYPE_MISMATCH, "Param type doesn't match synthetic event field type"), \
45 C(SYNTH_COUNT_MISMATCH, "Param count doesn't match synthetic event field count"), \
46 C(FIELD_VAR_PARSE_FAIL, "Couldn't parse field variable"), \
47 C(VAR_CREATE_FIND_FAIL, "Couldn't create or find variable"), \
48 C(ONX_NOT_VAR, "For onmax(x) or onchange(x), x must be a variable"), \
49 C(ONX_VAR_NOT_FOUND, "Couldn't find onmax or onchange variable"), \
50 C(ONX_VAR_CREATE_FAIL, "Couldn't create onmax or onchange variable"), \
51 C(FIELD_VAR_CREATE_FAIL,"Couldn't create field variable"), \
52 C(TOO_MANY_PARAMS, "Too many action params"), \
53 C(PARAM_NOT_FOUND, "Couldn't find param"), \
54 C(INVALID_PARAM, "Invalid action param"), \
55 C(ACTION_NOT_FOUND, "No action found"), \
56 C(NO_SAVE_PARAMS, "No params found for save()"), \
57 C(TOO_MANY_SAVE_ACTIONS,"Can't have more than one save() action per hist"), \
58 C(ACTION_MISMATCH, "Handler doesn't support action"), \
59 C(NO_CLOSING_PAREN, "No closing paren found"), \
60 C(SUBSYS_NOT_FOUND, "Missing subsystem"), \
61 C(INVALID_SUBSYS_EVENT, "Invalid subsystem or event name"), \
62 C(INVALID_REF_KEY, "Using variable references in keys not supported"), \
63 C(VAR_NOT_FOUND, "Couldn't find variable"), \
64 C(FIELD_NOT_FOUND, "Couldn't find field"),
65
66#undef C
67#define C(a, b) HIST_ERR_##a
68
69enum { ERRORS };
70
71#undef C
72#define C(a, b) b
73
74static const char *err_text[] = { ERRORS };
75
25struct hist_field; 76struct hist_field;
26 77
27typedef u64 (*hist_field_fn_t) (struct hist_field *field, 78typedef u64 (*hist_field_fn_t) (struct hist_field *field,
@@ -535,62 +586,49 @@ static struct track_data *track_data_alloc(unsigned int key_len,
535 return data; 586 return data;
536} 587}
537 588
538static char last_hist_cmd[MAX_FILTER_STR_VAL]; 589static char last_cmd[MAX_FILTER_STR_VAL];
539static char hist_err_str[MAX_FILTER_STR_VAL]; 590static char last_cmd_loc[MAX_FILTER_STR_VAL];
540 591
541static void last_cmd_set(char *str) 592static int errpos(char *str)
542{ 593{
543 if (!str) 594 return err_pos(last_cmd, str);
544 return;
545
546 strncpy(last_hist_cmd, str, MAX_FILTER_STR_VAL - 1);
547} 595}
548 596
549static void hist_err(char *str, char *var) 597static void last_cmd_set(struct trace_event_file *file, char *str)
550{ 598{
551 int maxlen = MAX_FILTER_STR_VAL - 1; 599 const char *system = NULL, *name = NULL;
600 struct trace_event_call *call;
552 601
553 if (!str) 602 if (!str)
554 return; 603 return;
555 604
556 if (strlen(hist_err_str)) 605 strncpy(last_cmd, str, MAX_FILTER_STR_VAL - 1);
557 return;
558 606
559 if (!var) 607 if (file) {
560 var = ""; 608 call = file->event_call;
561 609
562 if (strlen(hist_err_str) + strlen(str) + strlen(var) > maxlen) 610 system = call->class->system;
563 return; 611 if (system) {
612 name = trace_event_name(call);
613 if (!name)
614 system = NULL;
615 }
616 }
564 617
565 strcat(hist_err_str, str); 618 if (system)
566 strcat(hist_err_str, var); 619 snprintf(last_cmd_loc, MAX_FILTER_STR_VAL, "hist:%s:%s", system, name);
567} 620}
568 621
569static void hist_err_event(char *str, char *system, char *event, char *var) 622static void hist_err(struct trace_array *tr, u8 err_type, u8 err_pos)
570{ 623{
571 char err[MAX_FILTER_STR_VAL]; 624 tracing_log_err(tr, last_cmd_loc, last_cmd, err_text,
572 625 err_type, err_pos);
573 if (system && var)
574 snprintf(err, MAX_FILTER_STR_VAL, "%s.%s.%s", system, event, var);
575 else if (system)
576 snprintf(err, MAX_FILTER_STR_VAL, "%s.%s", system, event);
577 else
578 strscpy(err, var, MAX_FILTER_STR_VAL);
579
580 hist_err(str, err);
581} 626}
582 627
583static void hist_err_clear(void) 628static void hist_err_clear(void)
584{ 629{
585 hist_err_str[0] = '\0'; 630 last_cmd[0] = '\0';
586} 631 last_cmd_loc[0] = '\0';
587
588static bool have_hist_err(void)
589{
590 if (strlen(hist_err_str))
591 return true;
592
593 return false;
594} 632}
595 633
596struct synth_trace_event { 634struct synth_trace_event {
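
After this rewrite a hist trigger failure ends up in the shared error_log: last_cmd_set() stores the command plus a "hist:system:event" location, errpos() finds the caret column inside that stored command, and hist_err() hands both to tracing_log_err(). A hedged sketch of the reporting path for a missing field (demo_report_bad_field() is invented; the field string must appear in the recorded command for the caret to line up):

	static void demo_report_bad_field(struct trace_event_file *file,
					  char *trigger, char *field_name)
	{
		last_cmd_set(file, trigger);	/* remember cmd + "hist:sys:event" loc */
		hist_err(file->tr, HIST_ERR_FIELD_NOT_FOUND, errpos(field_name));
	}

In tracing/error_log this surfaces roughly as "hist:sched:sched_switch: error: Couldn't find field", followed by the command and a caret under the offending name, per the tracing_log_err() kernel-doc earlier in this diff.
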
@@ -1719,7 +1757,7 @@ static struct trace_event_file *find_var_file(struct trace_array *tr,
1719 1757
1720 if (find_var_field(var_hist_data, var_name)) { 1758 if (find_var_field(var_hist_data, var_name)) {
1721 if (found) { 1759 if (found) {
1722 hist_err_event("Variable name not unique, need to use fully qualified name (subsys.event.var) for variable: ", system, event_name, var_name); 1760 hist_err(tr, HIST_ERR_VAR_NOT_UNIQUE, errpos(var_name));
1723 return NULL; 1761 return NULL;
1724 } 1762 }
1725 1763
@@ -1770,7 +1808,8 @@ find_match_var(struct hist_trigger_data *hist_data, char *var_name)
1770 hist_field = find_file_var(file, var_name); 1808 hist_field = find_file_var(file, var_name);
1771 if (hist_field) { 1809 if (hist_field) {
1772 if (found) { 1810 if (found) {
1773 hist_err_event("Variable name not unique, need to use fully qualified name (subsys.event.var) for variable: ", system, event_name, var_name); 1811 hist_err(tr, HIST_ERR_VAR_NOT_UNIQUE,
1812 errpos(var_name));
1774 return ERR_PTR(-EINVAL); 1813 return ERR_PTR(-EINVAL);
1775 } 1814 }
1776 1815
@@ -1815,6 +1854,9 @@ static u64 hist_field_var_ref(struct hist_field *hist_field,
1815 struct hist_elt_data *elt_data; 1854 struct hist_elt_data *elt_data;
1816 u64 var_val = 0; 1855 u64 var_val = 0;
1817 1856
1857 if (WARN_ON_ONCE(!elt))
1858 return var_val;
1859
1818 elt_data = elt->private_data; 1860 elt_data = elt->private_data;
1819 var_val = elt_data->var_ref_vals[hist_field->var_ref_idx]; 1861 var_val = elt_data->var_ref_vals[hist_field->var_ref_idx];
1820 1862
@@ -2002,11 +2044,11 @@ static int parse_action(char *str, struct hist_trigger_attrs *attrs)
2002 attrs->n_actions++; 2044 attrs->n_actions++;
2003 ret = 0; 2045 ret = 0;
2004 } 2046 }
2005
2006 return ret; 2047 return ret;
2007} 2048}
2008 2049
2009static int parse_assignment(char *str, struct hist_trigger_attrs *attrs) 2050static int parse_assignment(struct trace_array *tr,
2051 char *str, struct hist_trigger_attrs *attrs)
2010{ 2052{
2011 int ret = 0; 2053 int ret = 0;
2012 2054
@@ -2062,7 +2104,7 @@ static int parse_assignment(char *str, struct hist_trigger_attrs *attrs)
2062 char *assignment; 2104 char *assignment;
2063 2105
2064 if (attrs->n_assignments == TRACING_MAP_VARS_MAX) { 2106 if (attrs->n_assignments == TRACING_MAP_VARS_MAX) {
2065 hist_err("Too many variables defined: ", str); 2107 hist_err(tr, HIST_ERR_TOO_MANY_VARS, errpos(str));
2066 ret = -EINVAL; 2108 ret = -EINVAL;
2067 goto out; 2109 goto out;
2068 } 2110 }
@@ -2079,7 +2121,8 @@ static int parse_assignment(char *str, struct hist_trigger_attrs *attrs)
2079 return ret; 2121 return ret;
2080} 2122}
2081 2123
2082static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str) 2124static struct hist_trigger_attrs *
2125parse_hist_trigger_attrs(struct trace_array *tr, char *trigger_str)
2083{ 2126{
2084 struct hist_trigger_attrs *attrs; 2127 struct hist_trigger_attrs *attrs;
2085 int ret = 0; 2128 int ret = 0;
@@ -2092,7 +2135,7 @@ static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str)
2092 char *str = strsep(&trigger_str, ":"); 2135 char *str = strsep(&trigger_str, ":");
2093 2136
2094 if (strchr(str, '=')) { 2137 if (strchr(str, '=')) {
2095 ret = parse_assignment(str, attrs); 2138 ret = parse_assignment(tr, str, attrs);
2096 if (ret) 2139 if (ret)
2097 goto free; 2140 goto free;
2098 } else if (strcmp(str, "pause") == 0) 2141 } else if (strcmp(str, "pause") == 0)
@@ -2648,6 +2691,7 @@ static struct hist_field *parse_var_ref(struct hist_trigger_data *hist_data,
2648 char *var_name) 2691 char *var_name)
2649{ 2692{
2650 struct hist_field *var_field = NULL, *ref_field = NULL; 2693 struct hist_field *var_field = NULL, *ref_field = NULL;
2694 struct trace_array *tr = hist_data->event_file->tr;
2651 2695
2652 if (!is_var_ref(var_name)) 2696 if (!is_var_ref(var_name))
2653 return NULL; 2697 return NULL;
@@ -2660,8 +2704,7 @@ static struct hist_field *parse_var_ref(struct hist_trigger_data *hist_data,
2660 system, event_name); 2704 system, event_name);
2661 2705
2662 if (!ref_field) 2706 if (!ref_field)
2663 hist_err_event("Couldn't find variable: $", 2707 hist_err(tr, HIST_ERR_VAR_NOT_FOUND, errpos(var_name));
2664 system, event_name, var_name);
2665 2708
2666 return ref_field; 2709 return ref_field;
2667} 2710}
@@ -2672,6 +2715,7 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,
2672{ 2715{
2673 struct ftrace_event_field *field = NULL; 2716 struct ftrace_event_field *field = NULL;
2674 char *field_name, *modifier, *str; 2717 char *field_name, *modifier, *str;
2718 struct trace_array *tr = file->tr;
2675 2719
2676 modifier = str = kstrdup(field_str, GFP_KERNEL); 2720 modifier = str = kstrdup(field_str, GFP_KERNEL);
2677 if (!modifier) 2721 if (!modifier)
@@ -2695,7 +2739,7 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,
2695 else if (strcmp(modifier, "usecs") == 0) 2739 else if (strcmp(modifier, "usecs") == 0)
2696 *flags |= HIST_FIELD_FL_TIMESTAMP_USECS; 2740 *flags |= HIST_FIELD_FL_TIMESTAMP_USECS;
2697 else { 2741 else {
2698 hist_err("Invalid field modifier: ", modifier); 2742 hist_err(tr, HIST_ERR_BAD_FIELD_MODIFIER, errpos(modifier));
2699 field = ERR_PTR(-EINVAL); 2743 field = ERR_PTR(-EINVAL);
2700 goto out; 2744 goto out;
2701 } 2745 }
@@ -2711,7 +2755,7 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,
2711 else { 2755 else {
2712 field = trace_find_event_field(file->event_call, field_name); 2756 field = trace_find_event_field(file->event_call, field_name);
2713 if (!field || !field->size) { 2757 if (!field || !field->size) {
2714 hist_err("Couldn't find field: ", field_name); 2758 hist_err(tr, HIST_ERR_FIELD_NOT_FOUND, errpos(field_name));
2715 field = ERR_PTR(-EINVAL); 2759 field = ERR_PTR(-EINVAL);
2716 goto out; 2760 goto out;
2717 } 2761 }
@@ -2773,7 +2817,8 @@ static struct hist_field *parse_atom(struct hist_trigger_data *hist_data,
2773 2817
2774 s = local_field_var_ref(hist_data, ref_system, ref_event, ref_var); 2818 s = local_field_var_ref(hist_data, ref_system, ref_event, ref_var);
2775 if (!s) { 2819 if (!s) {
2776 hist_field = parse_var_ref(hist_data, ref_system, ref_event, ref_var); 2820 hist_field = parse_var_ref(hist_data, ref_system,
2821 ref_event, ref_var);
2777 if (hist_field) { 2822 if (hist_field) {
2778 if (var_name) { 2823 if (var_name) {
2779 hist_field = create_alias(hist_data, hist_field, var_name); 2824 hist_field = create_alias(hist_data, hist_field, var_name);
@@ -2822,7 +2867,7 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data,
2822 /* we support only -(xxx) i.e. explicit parens required */ 2867 /* we support only -(xxx) i.e. explicit parens required */
2823 2868
2824 if (level > 3) { 2869 if (level > 3) {
2825 hist_err("Too many subexpressions (3 max): ", str); 2870 hist_err(file->tr, HIST_ERR_TOO_MANY_SUBEXPR, errpos(str));
2826 ret = -EINVAL; 2871 ret = -EINVAL;
2827 goto free; 2872 goto free;
2828 } 2873 }
@@ -2877,7 +2922,8 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data,
2877 return ERR_PTR(ret); 2922 return ERR_PTR(ret);
2878} 2923}
2879 2924
2880static int check_expr_operands(struct hist_field *operand1, 2925static int check_expr_operands(struct trace_array *tr,
2926 struct hist_field *operand1,
2881 struct hist_field *operand2) 2927 struct hist_field *operand2)
2882{ 2928{
2883 unsigned long operand1_flags = operand1->flags; 2929 unsigned long operand1_flags = operand1->flags;
@@ -2905,7 +2951,7 @@ static int check_expr_operands(struct hist_field *operand1,
2905 2951
2906 if ((operand1_flags & HIST_FIELD_FL_TIMESTAMP_USECS) != 2952 if ((operand1_flags & HIST_FIELD_FL_TIMESTAMP_USECS) !=
2907 (operand2_flags & HIST_FIELD_FL_TIMESTAMP_USECS)) { 2953 (operand2_flags & HIST_FIELD_FL_TIMESTAMP_USECS)) {
2908 hist_err("Timestamp units in expression don't match", NULL); 2954 hist_err(tr, HIST_ERR_TIMESTAMP_MISMATCH, 0);
2909 return -EINVAL; 2955 return -EINVAL;
2910 } 2956 }
2911 2957
@@ -2923,7 +2969,7 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
2923 char *sep, *operand1_str; 2969 char *sep, *operand1_str;
2924 2970
2925 if (level > 3) { 2971 if (level > 3) {
2926 hist_err("Too many subexpressions (3 max): ", str); 2972 hist_err(file->tr, HIST_ERR_TOO_MANY_SUBEXPR, errpos(str));
2927 return ERR_PTR(-EINVAL); 2973 return ERR_PTR(-EINVAL);
2928 } 2974 }
2929 2975
@@ -2968,7 +3014,7 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
2968 goto free; 3014 goto free;
2969 } 3015 }
2970 3016
2971 ret = check_expr_operands(operand1, operand2); 3017 ret = check_expr_operands(file->tr, operand1, operand2);
2972 if (ret) 3018 if (ret)
2973 goto free; 3019 goto free;
2974 3020
@@ -3161,16 +3207,14 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data,
3161 int ret; 3207 int ret;
3162 3208
3163 if (target_hist_data->n_field_var_hists >= SYNTH_FIELDS_MAX) { 3209 if (target_hist_data->n_field_var_hists >= SYNTH_FIELDS_MAX) {
3164 hist_err_event("trace action: Too many field variables defined: ", 3210 hist_err(tr, HIST_ERR_TOO_MANY_FIELD_VARS, errpos(field_name));
3165 subsys_name, event_name, field_name);
3166 return ERR_PTR(-EINVAL); 3211 return ERR_PTR(-EINVAL);
3167 } 3212 }
3168 3213
3169 file = event_file(tr, subsys_name, event_name); 3214 file = event_file(tr, subsys_name, event_name);
3170 3215
3171 if (IS_ERR(file)) { 3216 if (IS_ERR(file)) {
3172 hist_err_event("trace action: Event file not found: ", 3217 hist_err(tr, HIST_ERR_EVENT_FILE_NOT_FOUND, errpos(field_name));
3173 subsys_name, event_name, field_name);
3174 ret = PTR_ERR(file); 3218 ret = PTR_ERR(file);
3175 return ERR_PTR(ret); 3219 return ERR_PTR(ret);
3176 } 3220 }
@@ -3183,8 +3227,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data,
3183 */ 3227 */
3184 hist_data = find_compatible_hist(target_hist_data, file); 3228 hist_data = find_compatible_hist(target_hist_data, file);
3185 if (!hist_data) { 3229 if (!hist_data) {
3186 hist_err_event("trace action: Matching event histogram not found: ", 3230 hist_err(tr, HIST_ERR_HIST_NOT_FOUND, errpos(field_name));
3187 subsys_name, event_name, field_name);
3188 return ERR_PTR(-EINVAL); 3231 return ERR_PTR(-EINVAL);
3189 } 3232 }
3190 3233
@@ -3245,8 +3288,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data,
3245 kfree(cmd); 3288 kfree(cmd);
3246 kfree(var_hist->cmd); 3289 kfree(var_hist->cmd);
3247 kfree(var_hist); 3290 kfree(var_hist);
3248 hist_err_event("trace action: Couldn't create histogram for field: ", 3291 hist_err(tr, HIST_ERR_HIST_CREATE_FAIL, errpos(field_name));
3249 subsys_name, event_name, field_name);
3250 return ERR_PTR(ret); 3292 return ERR_PTR(ret);
3251 } 3293 }
3252 3294
@@ -3258,8 +3300,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data,
3258 if (IS_ERR_OR_NULL(event_var)) { 3300 if (IS_ERR_OR_NULL(event_var)) {
3259 kfree(var_hist->cmd); 3301 kfree(var_hist->cmd);
3260 kfree(var_hist); 3302 kfree(var_hist);
3261 hist_err_event("trace action: Couldn't find synthetic variable: ", 3303 hist_err(tr, HIST_ERR_SYNTH_VAR_NOT_FOUND, errpos(field_name));
3262 subsys_name, event_name, field_name);
3263 return ERR_PTR(-EINVAL); 3304 return ERR_PTR(-EINVAL);
3264 } 3305 }
3265 3306
@@ -3392,25 +3433,26 @@ static struct field_var *create_field_var(struct hist_trigger_data *hist_data,
3392{ 3433{
3393 struct hist_field *val = NULL, *var = NULL; 3434 struct hist_field *val = NULL, *var = NULL;
3394 unsigned long flags = HIST_FIELD_FL_VAR; 3435 unsigned long flags = HIST_FIELD_FL_VAR;
3436 struct trace_array *tr = file->tr;
3395 struct field_var *field_var; 3437 struct field_var *field_var;
3396 int ret = 0; 3438 int ret = 0;
3397 3439
3398 if (hist_data->n_field_vars >= SYNTH_FIELDS_MAX) { 3440 if (hist_data->n_field_vars >= SYNTH_FIELDS_MAX) {
3399 hist_err("Too many field variables defined: ", field_name); 3441 hist_err(tr, HIST_ERR_TOO_MANY_FIELD_VARS, errpos(field_name));
3400 ret = -EINVAL; 3442 ret = -EINVAL;
3401 goto err; 3443 goto err;
3402 } 3444 }
3403 3445
3404 val = parse_atom(hist_data, file, field_name, &flags, NULL); 3446 val = parse_atom(hist_data, file, field_name, &flags, NULL);
3405 if (IS_ERR(val)) { 3447 if (IS_ERR(val)) {
3406 hist_err("Couldn't parse field variable: ", field_name); 3448 hist_err(tr, HIST_ERR_FIELD_VAR_PARSE_FAIL, errpos(field_name));
3407 ret = PTR_ERR(val); 3449 ret = PTR_ERR(val);
3408 goto err; 3450 goto err;
3409 } 3451 }
3410 3452
3411 var = create_var(hist_data, file, field_name, val->size, val->type); 3453 var = create_var(hist_data, file, field_name, val->size, val->type);
3412 if (IS_ERR(var)) { 3454 if (IS_ERR(var)) {
3413 hist_err("Couldn't create or find variable: ", field_name); 3455 hist_err(tr, HIST_ERR_VAR_CREATE_FIND_FAIL, errpos(field_name));
3414 kfree(val); 3456 kfree(val);
3415 ret = PTR_ERR(var); 3457 ret = PTR_ERR(var);
3416 goto err; 3458 goto err;
@@ -3543,14 +3585,20 @@ static bool cond_snapshot_update(struct trace_array *tr, void *cond_data)
3543 struct track_data *track_data = tr->cond_snapshot->cond_data; 3585 struct track_data *track_data = tr->cond_snapshot->cond_data;
3544 struct hist_elt_data *elt_data, *track_elt_data; 3586 struct hist_elt_data *elt_data, *track_elt_data;
3545 struct snapshot_context *context = cond_data; 3587 struct snapshot_context *context = cond_data;
3588 struct action_data *action;
3546 u64 track_val; 3589 u64 track_val;
3547 3590
3548 if (!track_data) 3591 if (!track_data)
3549 return false; 3592 return false;
3550 3593
3594 action = track_data->action_data;
3595
3551 track_val = get_track_val(track_data->hist_data, context->elt, 3596 track_val = get_track_val(track_data->hist_data, context->elt,
3552 track_data->action_data); 3597 track_data->action_data);
3553 3598
3599 if (!action->track_data.check_val(track_data->track_val, track_val))
3600 return false;
3601
3554 track_data->track_val = track_val; 3602 track_data->track_val = track_val;
3555 memcpy(track_data->key, context->key, track_data->key_len); 3603 memcpy(track_data->key, context->key, track_data->key_len);
3556 3604
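
cond_snapshot_update() now consults the handler's check_val() callback before accepting a new tracked value, so a conditional snapshot is refreshed only when the onmax or onchange test passes. The two predicates it selects between look roughly like this (a plain-C sketch; the u64-style signatures are assumed from how the diff calls them):

#include <stdbool.h>
#include <stdint.h>

static bool check_track_val_max(uint64_t track_val, uint64_t var_val)
{
        return var_val > track_val;     /* onmax: only a new maximum counts */
}

static bool check_track_val_changed(uint64_t track_val, uint64_t var_val)
{
        return var_val != track_val;    /* onchange: any different value counts */
}

int main(void)
{
        return check_track_val_max(10, 12) &&
               check_track_val_changed(10, 12) ? 0 : 1;
}
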
@@ -3737,19 +3785,20 @@ static int track_data_create(struct hist_trigger_data *hist_data,
3737{ 3785{
3738 struct hist_field *var_field, *ref_field, *track_var = NULL; 3786 struct hist_field *var_field, *ref_field, *track_var = NULL;
3739 struct trace_event_file *file = hist_data->event_file; 3787 struct trace_event_file *file = hist_data->event_file;
3788 struct trace_array *tr = file->tr;
3740 char *track_data_var_str; 3789 char *track_data_var_str;
3741 int ret = 0; 3790 int ret = 0;
3742 3791
3743 track_data_var_str = data->track_data.var_str; 3792 track_data_var_str = data->track_data.var_str;
3744 if (track_data_var_str[0] != '$') { 3793 if (track_data_var_str[0] != '$') {
3745 hist_err("For onmax(x) or onchange(x), x must be a variable: ", track_data_var_str); 3794 hist_err(tr, HIST_ERR_ONX_NOT_VAR, errpos(track_data_var_str));
3746 return -EINVAL; 3795 return -EINVAL;
3747 } 3796 }
3748 track_data_var_str++; 3797 track_data_var_str++;
3749 3798
3750 var_field = find_target_event_var(hist_data, NULL, NULL, track_data_var_str); 3799 var_field = find_target_event_var(hist_data, NULL, NULL, track_data_var_str);
3751 if (!var_field) { 3800 if (!var_field) {
3752 hist_err("Couldn't find onmax or onchange variable: ", track_data_var_str); 3801 hist_err(tr, HIST_ERR_ONX_VAR_NOT_FOUND, errpos(track_data_var_str));
3753 return -EINVAL; 3802 return -EINVAL;
3754 } 3803 }
3755 3804
@@ -3762,7 +3811,7 @@ static int track_data_create(struct hist_trigger_data *hist_data,
3762 if (data->handler == HANDLER_ONMAX) 3811 if (data->handler == HANDLER_ONMAX)
3763 track_var = create_var(hist_data, file, "__max", sizeof(u64), "u64"); 3812 track_var = create_var(hist_data, file, "__max", sizeof(u64), "u64");
3764 if (IS_ERR(track_var)) { 3813 if (IS_ERR(track_var)) {
3765 hist_err("Couldn't create onmax variable: ", "__max"); 3814 hist_err(tr, HIST_ERR_ONX_VAR_CREATE_FAIL, 0);
3766 ret = PTR_ERR(track_var); 3815 ret = PTR_ERR(track_var);
3767 goto out; 3816 goto out;
3768 } 3817 }
@@ -3770,7 +3819,7 @@ static int track_data_create(struct hist_trigger_data *hist_data,
3770 if (data->handler == HANDLER_ONCHANGE) 3819 if (data->handler == HANDLER_ONCHANGE)
3771 track_var = create_var(hist_data, file, "__change", sizeof(u64), "u64"); 3820 track_var = create_var(hist_data, file, "__change", sizeof(u64), "u64");
3772 if (IS_ERR(track_var)) { 3821 if (IS_ERR(track_var)) {
3773 hist_err("Couldn't create onchange variable: ", "__change"); 3822 hist_err(tr, HIST_ERR_ONX_VAR_CREATE_FAIL, 0);
3774 ret = PTR_ERR(track_var); 3823 ret = PTR_ERR(track_var);
3775 goto out; 3824 goto out;
3776 } 3825 }
@@ -3781,7 +3830,8 @@ static int track_data_create(struct hist_trigger_data *hist_data,
3781 return ret; 3830 return ret;
3782} 3831}
3783 3832
3784static int parse_action_params(char *params, struct action_data *data) 3833static int parse_action_params(struct trace_array *tr, char *params,
3834 struct action_data *data)
3785{ 3835{
3786 char *param, *saved_param; 3836 char *param, *saved_param;
3787 bool first_param = true; 3837 bool first_param = true;
@@ -3789,20 +3839,20 @@ static int parse_action_params(char *params, struct action_data *data)
3789 3839
3790 while (params) { 3840 while (params) {
3791 if (data->n_params >= SYNTH_FIELDS_MAX) { 3841 if (data->n_params >= SYNTH_FIELDS_MAX) {
3792 hist_err("Too many action params", ""); 3842 hist_err(tr, HIST_ERR_TOO_MANY_PARAMS, 0);
3793 goto out; 3843 goto out;
3794 } 3844 }
3795 3845
3796 param = strsep(&params, ","); 3846 param = strsep(&params, ",");
3797 if (!param) { 3847 if (!param) {
3798 hist_err("No action param found", ""); 3848 hist_err(tr, HIST_ERR_PARAM_NOT_FOUND, 0);
3799 ret = -EINVAL; 3849 ret = -EINVAL;
3800 goto out; 3850 goto out;
3801 } 3851 }
3802 3852
3803 param = strstrip(param); 3853 param = strstrip(param);
3804 if (strlen(param) < 2) { 3854 if (strlen(param) < 2) {
3805 hist_err("Invalid action param: ", param); 3855 hist_err(tr, HIST_ERR_INVALID_PARAM, errpos(param));
3806 ret = -EINVAL; 3856 ret = -EINVAL;
3807 goto out; 3857 goto out;
3808 } 3858 }
@@ -3826,7 +3876,7 @@ static int parse_action_params(char *params, struct action_data *data)
3826 return ret; 3876 return ret;
3827} 3877}
3828 3878
3829static int action_parse(char *str, struct action_data *data, 3879static int action_parse(struct trace_array *tr, char *str, struct action_data *data,
3830 enum handler_id handler) 3880 enum handler_id handler)
3831{ 3881{
3832 char *action_name; 3882 char *action_name;
@@ -3834,14 +3884,14 @@ static int action_parse(char *str, struct action_data *data,
3834 3884
3835 strsep(&str, "."); 3885 strsep(&str, ".");
3836 if (!str) { 3886 if (!str) {
3837 hist_err("action parsing: No action found", ""); 3887 hist_err(tr, HIST_ERR_ACTION_NOT_FOUND, 0);
3838 ret = -EINVAL; 3888 ret = -EINVAL;
3839 goto out; 3889 goto out;
3840 } 3890 }
3841 3891
3842 action_name = strsep(&str, "("); 3892 action_name = strsep(&str, "(");
3843 if (!action_name || !str) { 3893 if (!action_name || !str) {
3844 hist_err("action parsing: No action found", ""); 3894 hist_err(tr, HIST_ERR_ACTION_NOT_FOUND, 0);
3845 ret = -EINVAL; 3895 ret = -EINVAL;
3846 goto out; 3896 goto out;
3847 } 3897 }
@@ -3850,12 +3900,12 @@ static int action_parse(char *str, struct action_data *data,
3850 char *params = strsep(&str, ")"); 3900 char *params = strsep(&str, ")");
3851 3901
3852 if (!params) { 3902 if (!params) {
3853 hist_err("action parsing: No params found for %s", "save"); 3903 hist_err(tr, HIST_ERR_NO_SAVE_PARAMS, 0);
3854 ret = -EINVAL; 3904 ret = -EINVAL;
3855 goto out; 3905 goto out;
3856 } 3906 }
3857 3907
3858 ret = parse_action_params(params, data); 3908 ret = parse_action_params(tr, params, data);
3859 if (ret) 3909 if (ret)
3860 goto out; 3910 goto out;
3861 3911
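
action_parse() splits a clause of the form handler.action(params) with strsep(), and the new HIST_ERR_* codes name exactly which piece was missing. A self-contained userspace sketch of that split (the trigger string below is only an example):

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>

int main(void)
{
        char buf[] = "onmax($lat).save(comm,pid)";
        char *str = buf, *action_name, *params;

        strsep(&str, ".");                      /* drop "onmax($lat)" */
        if (!str)
                return 1;                       /* HIST_ERR_ACTION_NOT_FOUND */

        action_name = strsep(&str, "(");        /* "save" */
        if (!action_name || !str)
                return 1;                       /* HIST_ERR_ACTION_NOT_FOUND */

        params = strsep(&str, ")");             /* "comm,pid" */
        if (!params)
                return 1;                       /* HIST_ERR_NO_SAVE_PARAMS */

        printf("action=%s params=%s\n", action_name, params);
        return 0;
}
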
@@ -3864,7 +3914,7 @@ static int action_parse(char *str, struct action_data *data,
3864 else if (handler == HANDLER_ONCHANGE) 3914 else if (handler == HANDLER_ONCHANGE)
3865 data->track_data.check_val = check_track_val_changed; 3915 data->track_data.check_val = check_track_val_changed;
3866 else { 3916 else {
3867 hist_err("action parsing: Handler doesn't support action: ", action_name); 3917 hist_err(tr, HIST_ERR_ACTION_MISMATCH, errpos(action_name));
3868 ret = -EINVAL; 3918 ret = -EINVAL;
3869 goto out; 3919 goto out;
3870 } 3920 }
@@ -3876,7 +3926,7 @@ static int action_parse(char *str, struct action_data *data,
3876 char *params = strsep(&str, ")"); 3926 char *params = strsep(&str, ")");
3877 3927
3878 if (!str) { 3928 if (!str) {
3879 hist_err("action parsing: No closing paren found: %s", params); 3929 hist_err(tr, HIST_ERR_NO_CLOSING_PAREN, errpos(params));
3880 ret = -EINVAL; 3930 ret = -EINVAL;
3881 goto out; 3931 goto out;
3882 } 3932 }
@@ -3886,7 +3936,7 @@ static int action_parse(char *str, struct action_data *data,
3886 else if (handler == HANDLER_ONCHANGE) 3936 else if (handler == HANDLER_ONCHANGE)
3887 data->track_data.check_val = check_track_val_changed; 3937 data->track_data.check_val = check_track_val_changed;
3888 else { 3938 else {
3889 hist_err("action parsing: Handler doesn't support action: ", action_name); 3939 hist_err(tr, HIST_ERR_ACTION_MISMATCH, errpos(action_name));
3890 ret = -EINVAL; 3940 ret = -EINVAL;
3891 goto out; 3941 goto out;
3892 } 3942 }
@@ -3901,7 +3951,7 @@ static int action_parse(char *str, struct action_data *data,
3901 data->use_trace_keyword = true; 3951 data->use_trace_keyword = true;
3902 3952
3903 if (params) { 3953 if (params) {
3904 ret = parse_action_params(params, data); 3954 ret = parse_action_params(tr, params, data);
3905 if (ret) 3955 if (ret)
3906 goto out; 3956 goto out;
3907 } 3957 }
@@ -3954,7 +4004,7 @@ static struct action_data *track_data_parse(struct hist_trigger_data *hist_data,
3954 goto free; 4004 goto free;
3955 } 4005 }
3956 4006
3957 ret = action_parse(str, data, handler); 4007 ret = action_parse(hist_data->event_file->tr, str, data, handler);
3958 if (ret) 4008 if (ret)
3959 goto free; 4009 goto free;
3960 out: 4010 out:
@@ -4024,6 +4074,7 @@ trace_action_find_var(struct hist_trigger_data *hist_data,
4024 struct action_data *data, 4074 struct action_data *data,
4025 char *system, char *event, char *var) 4075 char *system, char *event, char *var)
4026{ 4076{
4077 struct trace_array *tr = hist_data->event_file->tr;
4027 struct hist_field *hist_field; 4078 struct hist_field *hist_field;
4028 4079
4029 var++; /* skip '$' */ 4080 var++; /* skip '$' */
@@ -4039,7 +4090,7 @@ trace_action_find_var(struct hist_trigger_data *hist_data,
4039 } 4090 }
4040 4091
4041 if (!hist_field) 4092 if (!hist_field)
4042 hist_err_event("trace action: Couldn't find param: $", system, event, var); 4093 hist_err(tr, HIST_ERR_PARAM_NOT_FOUND, errpos(var));
4043 4094
4044 return hist_field; 4095 return hist_field;
4045} 4096}
@@ -4097,6 +4148,7 @@ trace_action_create_field_var(struct hist_trigger_data *hist_data,
4097static int trace_action_create(struct hist_trigger_data *hist_data, 4148static int trace_action_create(struct hist_trigger_data *hist_data,
4098 struct action_data *data) 4149 struct action_data *data)
4099{ 4150{
4151 struct trace_array *tr = hist_data->event_file->tr;
4100 char *event_name, *param, *system = NULL; 4152 char *event_name, *param, *system = NULL;
4101 struct hist_field *hist_field, *var_ref; 4153 struct hist_field *hist_field, *var_ref;
4102 unsigned int i, var_ref_idx; 4154 unsigned int i, var_ref_idx;
@@ -4114,7 +4166,7 @@ static int trace_action_create(struct hist_trigger_data *hist_data,
4114 4166
4115 event = find_synth_event(synth_event_name); 4167 event = find_synth_event(synth_event_name);
4116 if (!event) { 4168 if (!event) {
4117 hist_err("trace action: Couldn't find synthetic event: ", synth_event_name); 4169 hist_err(tr, HIST_ERR_SYNTH_EVENT_NOT_FOUND, errpos(synth_event_name));
4118 return -EINVAL; 4170 return -EINVAL;
4119 } 4171 }
4120 4172
@@ -4175,15 +4227,14 @@ static int trace_action_create(struct hist_trigger_data *hist_data,
4175 continue; 4227 continue;
4176 } 4228 }
4177 4229
4178 hist_err_event("trace action: Param type doesn't match synthetic event field type: ", 4230 hist_err(tr, HIST_ERR_SYNTH_TYPE_MISMATCH, errpos(param));
4179 system, event_name, param);
4180 kfree(p); 4231 kfree(p);
4181 ret = -EINVAL; 4232 ret = -EINVAL;
4182 goto err; 4233 goto err;
4183 } 4234 }
4184 4235
4185 if (field_pos != event->n_fields) { 4236 if (field_pos != event->n_fields) {
4186 hist_err("trace action: Param count doesn't match synthetic event field count: ", event->name); 4237 hist_err(tr, HIST_ERR_SYNTH_COUNT_MISMATCH, errpos(event->name));
4187 ret = -EINVAL; 4238 ret = -EINVAL;
4188 goto err; 4239 goto err;
4189 } 4240 }
@@ -4202,6 +4253,7 @@ static int action_create(struct hist_trigger_data *hist_data,
4202 struct action_data *data) 4253 struct action_data *data)
4203{ 4254{
4204 struct trace_event_file *file = hist_data->event_file; 4255 struct trace_event_file *file = hist_data->event_file;
4256 struct trace_array *tr = file->tr;
4205 struct track_data *track_data; 4257 struct track_data *track_data;
4206 struct field_var *field_var; 4258 struct field_var *field_var;
4207 unsigned int i; 4259 unsigned int i;
@@ -4229,7 +4281,7 @@ static int action_create(struct hist_trigger_data *hist_data,
4229 if (data->action == ACTION_SAVE) { 4281 if (data->action == ACTION_SAVE) {
4230 if (hist_data->n_save_vars) { 4282 if (hist_data->n_save_vars) {
4231 ret = -EEXIST; 4283 ret = -EEXIST;
4232 hist_err("save action: Can't have more than one save() action per hist", ""); 4284 hist_err(tr, HIST_ERR_TOO_MANY_SAVE_ACTIONS, 0);
4233 goto out; 4285 goto out;
4234 } 4286 }
4235 4287
@@ -4242,7 +4294,8 @@ static int action_create(struct hist_trigger_data *hist_data,
4242 4294
4243 field_var = create_target_field_var(hist_data, NULL, NULL, param); 4295 field_var = create_target_field_var(hist_data, NULL, NULL, param);
4244 if (IS_ERR(field_var)) { 4296 if (IS_ERR(field_var)) {
4245 hist_err("save action: Couldn't create field variable: ", param); 4297 hist_err(tr, HIST_ERR_FIELD_VAR_CREATE_FAIL,
4298 errpos(param));
4246 ret = PTR_ERR(field_var); 4299 ret = PTR_ERR(field_var);
4247 kfree(param); 4300 kfree(param);
4248 goto out; 4301 goto out;
@@ -4276,19 +4329,18 @@ static struct action_data *onmatch_parse(struct trace_array *tr, char *str)
4276 4329
4277 match_event = strsep(&str, ")"); 4330 match_event = strsep(&str, ")");
4278 if (!match_event || !str) { 4331 if (!match_event || !str) {
4279 hist_err("onmatch: Missing closing paren: ", match_event); 4332 hist_err(tr, HIST_ERR_NO_CLOSING_PAREN, errpos(match_event));
4280 goto free; 4333 goto free;
4281 } 4334 }
4282 4335
4283 match_event_system = strsep(&match_event, "."); 4336 match_event_system = strsep(&match_event, ".");
4284 if (!match_event) { 4337 if (!match_event) {
4285 hist_err("onmatch: Missing subsystem for match event: ", match_event_system); 4338 hist_err(tr, HIST_ERR_SUBSYS_NOT_FOUND, errpos(match_event_system));
4286 goto free; 4339 goto free;
4287 } 4340 }
4288 4341
4289 if (IS_ERR(event_file(tr, match_event_system, match_event))) { 4342 if (IS_ERR(event_file(tr, match_event_system, match_event))) {
4290 hist_err_event("onmatch: Invalid subsystem or event name: ", 4343 hist_err(tr, HIST_ERR_INVALID_SUBSYS_EVENT, errpos(match_event));
4291 match_event_system, match_event, NULL);
4292 goto free; 4344 goto free;
4293 } 4345 }
4294 4346
@@ -4304,7 +4356,7 @@ static struct action_data *onmatch_parse(struct trace_array *tr, char *str)
4304 goto free; 4356 goto free;
4305 } 4357 }
4306 4358
4307 ret = action_parse(str, data, HANDLER_ONMATCH); 4359 ret = action_parse(tr, str, data, HANDLER_ONMATCH);
4308 if (ret) 4360 if (ret)
4309 goto free; 4361 goto free;
4310 out: 4362 out:
@@ -4373,13 +4425,14 @@ static int create_var_field(struct hist_trigger_data *hist_data,
4373 struct trace_event_file *file, 4425 struct trace_event_file *file,
4374 char *var_name, char *expr_str) 4426 char *var_name, char *expr_str)
4375{ 4427{
4428 struct trace_array *tr = hist_data->event_file->tr;
4376 unsigned long flags = 0; 4429 unsigned long flags = 0;
4377 4430
4378 if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX + TRACING_MAP_VARS_MAX)) 4431 if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX + TRACING_MAP_VARS_MAX))
4379 return -EINVAL; 4432 return -EINVAL;
4380 4433
4381 if (find_var(hist_data, file, var_name) && !hist_data->remove) { 4434 if (find_var(hist_data, file, var_name) && !hist_data->remove) {
4382 hist_err("Variable already defined: ", var_name); 4435 hist_err(tr, HIST_ERR_DUPLICATE_VAR, errpos(var_name));
4383 return -EINVAL; 4436 return -EINVAL;
4384 } 4437 }
4385 4438
@@ -4436,8 +4489,8 @@ static int create_key_field(struct hist_trigger_data *hist_data,
4436 struct trace_event_file *file, 4489 struct trace_event_file *file,
4437 char *field_str) 4490 char *field_str)
4438{ 4491{
4492 struct trace_array *tr = hist_data->event_file->tr;
4439 struct hist_field *hist_field = NULL; 4493 struct hist_field *hist_field = NULL;
4440
4441 unsigned long flags = 0; 4494 unsigned long flags = 0;
4442 unsigned int key_size; 4495 unsigned int key_size;
4443 int ret = 0; 4496 int ret = 0;
@@ -4459,8 +4512,8 @@ static int create_key_field(struct hist_trigger_data *hist_data,
4459 goto out; 4512 goto out;
4460 } 4513 }
4461 4514
4462 if (hist_field->flags & HIST_FIELD_FL_VAR_REF) { 4515 if (field_has_hist_vars(hist_field, 0)) {
4463 hist_err("Using variable references as keys not supported: ", field_str); 4516 hist_err(tr, HIST_ERR_INVALID_REF_KEY, errpos(field_str));
4464 destroy_hist_field(hist_field, 0); 4517 destroy_hist_field(hist_field, 0);
4465 ret = -EINVAL; 4518 ret = -EINVAL;
4466 goto out; 4519 goto out;
@@ -4561,6 +4614,7 @@ static void free_var_defs(struct hist_trigger_data *hist_data)
4561 4614
4562static int parse_var_defs(struct hist_trigger_data *hist_data) 4615static int parse_var_defs(struct hist_trigger_data *hist_data)
4563{ 4616{
4617 struct trace_array *tr = hist_data->event_file->tr;
4564 char *s, *str, *var_name, *field_str; 4618 char *s, *str, *var_name, *field_str;
4565 unsigned int i, j, n_vars = 0; 4619 unsigned int i, j, n_vars = 0;
4566 int ret = 0; 4620 int ret = 0;
@@ -4574,13 +4628,14 @@ static int parse_var_defs(struct hist_trigger_data *hist_data)
4574 4628
4575 var_name = strsep(&field_str, "="); 4629 var_name = strsep(&field_str, "=");
4576 if (!var_name || !field_str) { 4630 if (!var_name || !field_str) {
4577 hist_err("Malformed assignment: ", var_name); 4631 hist_err(tr, HIST_ERR_MALFORMED_ASSIGNMENT,
4632 errpos(var_name));
4578 ret = -EINVAL; 4633 ret = -EINVAL;
4579 goto free; 4634 goto free;
4580 } 4635 }
4581 4636
4582 if (n_vars == TRACING_MAP_VARS_MAX) { 4637 if (n_vars == TRACING_MAP_VARS_MAX) {
4583 hist_err("Too many variables defined: ", var_name); 4638 hist_err(tr, HIST_ERR_TOO_MANY_VARS, errpos(var_name));
4584 ret = -EINVAL; 4639 ret = -EINVAL;
4585 goto free; 4640 goto free;
4586 } 4641 }
@@ -5431,11 +5486,6 @@ static int hist_show(struct seq_file *m, void *v)
5431 hist_trigger_show(m, data, n++); 5486 hist_trigger_show(m, data, n++);
5432 } 5487 }
5433 5488
5434 if (have_hist_err()) {
5435 seq_printf(m, "\nERROR: %s\n", hist_err_str);
5436 seq_printf(m, " Last command: %s\n", last_hist_cmd);
5437 }
5438
5439 out_unlock: 5489 out_unlock:
5440 mutex_unlock(&event_mutex); 5490 mutex_unlock(&event_mutex);
5441 5491
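
With this hunk, hist_show() stops appending the old global hist_err_str to the hist file; failures are instead reported through the instance's error log, which prints the saved command with a caret under the recorded position. A rough sketch of that rendering (the exact error_log format is not reproduced here):

#include <stdio.h>

static void demo_render_err(const char *cmd, const char *msg, unsigned int pos)
{
        printf("  Command: %s\n", cmd);
        printf("  %*s^\n", (int)pos + 9, "");   /* "  Command: " is 11 columns */
        printf("ERROR: %s\n", msg);
}

int main(void)
{
        /* "bad" starts at column 14 of the command */
        demo_render_err("hist:keys=pid:bad=expr", "Variable already defined", 14);
        return 0;
}
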
@@ -5800,6 +5850,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
5800{ 5850{
5801 struct hist_trigger_data *hist_data = data->private_data; 5851 struct hist_trigger_data *hist_data = data->private_data;
5802 struct event_trigger_data *test, *named_data = NULL; 5852 struct event_trigger_data *test, *named_data = NULL;
5853 struct trace_array *tr = file->tr;
5803 int ret = 0; 5854 int ret = 0;
5804 5855
5805 if (hist_data->attrs->name) { 5856 if (hist_data->attrs->name) {
@@ -5807,7 +5858,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
5807 if (named_data) { 5858 if (named_data) {
5808 if (!hist_trigger_match(data, named_data, named_data, 5859 if (!hist_trigger_match(data, named_data, named_data,
5809 true)) { 5860 true)) {
5810 hist_err("Named hist trigger doesn't match existing named trigger (includes variables): ", hist_data->attrs->name); 5861 hist_err(tr, HIST_ERR_NAMED_MISMATCH, errpos(hist_data->attrs->name));
5811 ret = -EINVAL; 5862 ret = -EINVAL;
5812 goto out; 5863 goto out;
5813 } 5864 }
@@ -5828,7 +5879,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
5828 else if (hist_data->attrs->clear) 5879 else if (hist_data->attrs->clear)
5829 hist_clear(test); 5880 hist_clear(test);
5830 else { 5881 else {
5831 hist_err("Hist trigger already exists", NULL); 5882 hist_err(tr, HIST_ERR_TRIGGER_EEXIST, 0);
5832 ret = -EEXIST; 5883 ret = -EEXIST;
5833 } 5884 }
5834 goto out; 5885 goto out;
@@ -5836,7 +5887,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
5836 } 5887 }
5837 new: 5888 new:
5838 if (hist_data->attrs->cont || hist_data->attrs->clear) { 5889 if (hist_data->attrs->cont || hist_data->attrs->clear) {
5839 hist_err("Can't clear or continue a nonexistent hist trigger", NULL); 5890 hist_err(tr, HIST_ERR_TRIGGER_ENOENT_CLEAR, 0);
5840 ret = -ENOENT; 5891 ret = -ENOENT;
5841 goto out; 5892 goto out;
5842 } 5893 }
@@ -5861,7 +5912,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
5861 5912
5862 ret = tracing_set_clock(file->tr, hist_data->attrs->clock); 5913 ret = tracing_set_clock(file->tr, hist_data->attrs->clock);
5863 if (ret) { 5914 if (ret) {
5864 hist_err("Couldn't set trace_clock: ", clock); 5915 hist_err(tr, HIST_ERR_SET_CLOCK_FAIL, errpos(clock));
5865 goto out; 5916 goto out;
5866 } 5917 }
5867 5918
@@ -6037,8 +6088,8 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
6037 lockdep_assert_held(&event_mutex); 6088 lockdep_assert_held(&event_mutex);
6038 6089
6039 if (glob && strlen(glob)) { 6090 if (glob && strlen(glob)) {
6040 last_cmd_set(param);
6041 hist_err_clear(); 6091 hist_err_clear();
6092 last_cmd_set(file, param);
6042 } 6093 }
6043 6094
6044 if (!param) 6095 if (!param)
@@ -6079,7 +6130,7 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
6079 trigger = strstrip(trigger); 6130 trigger = strstrip(trigger);
6080 } 6131 }
6081 6132
6082 attrs = parse_hist_trigger_attrs(trigger); 6133 attrs = parse_hist_trigger_attrs(file->tr, trigger);
6083 if (IS_ERR(attrs)) 6134 if (IS_ERR(attrs))
6084 return PTR_ERR(attrs); 6135 return PTR_ERR(attrs);
6085 6136
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index cd12ecb66eb9..2a2912cb4533 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -731,7 +731,8 @@ int set_trigger_filter(char *filter_str,
731 goto out; 731 goto out;
732 732
733 /* The filter is for the 'trigger' event, not the triggered event */ 733 /* The filter is for the 'trigger' event, not the triggered event */
734 ret = create_event_filter(file->event_call, filter_str, false, &filter); 734 ret = create_event_filter(file->tr, file->event_call,
735 filter_str, false, &filter);
735 /* 736 /*
736 * If create_event_filter() fails, filter still needs to be freed. 737 * If create_event_filter() fails, filter still needs to be freed.
737 * Which the calling code will do with data->filter. 738 * Which the calling code will do with data->filter.
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index 1e6db9cbe4dc..fa95139445b2 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -277,7 +277,7 @@ static void move_to_next_cpu(void)
277 * of this thread, than stop migrating for the duration 277 * of this thread, than stop migrating for the duration
278 * of the current test. 278 * of the current test.
279 */ 279 */
280 if (!cpumask_equal(current_mask, &current->cpus_allowed)) 280 if (!cpumask_equal(current_mask, current->cpus_ptr))
281 goto disable; 281 goto disable;
282 282
283 get_online_cpus(); 283 get_online_cpus();
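
The hwlat check moves from the embedded cpus_allowed mask to the task's cpus_ptr, which callers are expected to compare through. The struct below is a deliberately simplified stand-in, only to show the shape of the access, not the real task_struct layout or cpumask API:

#include <stdbool.h>
#include <stdio.h>

typedef unsigned long cpumask_demo_t;           /* one word of CPU bits */

struct task_demo {
        cpumask_demo_t cpus_mask;               /* the storage */
        const cpumask_demo_t *cpus_ptr;         /* what readers dereference */
};

static bool cpumask_equal_demo(const cpumask_demo_t *a, const cpumask_demo_t *b)
{
        return *a == *b;
}

int main(void)
{
        struct task_demo t = { .cpus_mask = 0xf };
        cpumask_demo_t current_mask = 0xf;

        t.cpus_ptr = &t.cpus_mask;              /* normally points at cpus_mask */
        printf("%d\n", cpumask_equal_demo(&current_mask, t.cpus_ptr));
        return 0;
}

The indirection lets the allowed mask be redirected temporarily without rewriting every reader, which is presumably why only the reader side changes here.
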
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index 810d78a8d14c..cca65044c14c 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -17,36 +17,28 @@
17#include "trace.h" 17#include "trace.h"
18#include "trace_output.h" 18#include "trace_output.h"
19 19
20static void ftrace_dump_buf(int skip_lines, long cpu_file) 20static struct trace_iterator iter;
21static struct ring_buffer_iter *buffer_iter[CONFIG_NR_CPUS];
22
23static void ftrace_dump_buf(int skip_entries, long cpu_file)
21{ 24{
22 /* use static because iter can be a bit big for the stack */
23 static struct trace_iterator iter;
24 static struct ring_buffer_iter *buffer_iter[CONFIG_NR_CPUS];
25 struct trace_array *tr; 25 struct trace_array *tr;
26 unsigned int old_userobj; 26 unsigned int old_userobj;
27 int cnt = 0, cpu; 27 int cnt = 0, cpu;
28 28
29 trace_init_global_iter(&iter);
30 iter.buffer_iter = buffer_iter;
31 tr = iter.tr; 29 tr = iter.tr;
32 30
33 for_each_tracing_cpu(cpu) {
34 atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
35 }
36
37 old_userobj = tr->trace_flags; 31 old_userobj = tr->trace_flags;
38 32
39 /* don't look at user memory in panic mode */ 33 /* don't look at user memory in panic mode */
40 tr->trace_flags &= ~TRACE_ITER_SYM_USEROBJ; 34 tr->trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
41 35
42 kdb_printf("Dumping ftrace buffer:\n"); 36 kdb_printf("Dumping ftrace buffer:\n");
37 if (skip_entries)
38 kdb_printf("(skipping %d entries)\n", skip_entries);
43 39
44 /* reset all but tr, trace, and overruns */ 40 trace_iterator_reset(&iter);
45 memset(&iter.seq, 0,
46 sizeof(struct trace_iterator) -
47 offsetof(struct trace_iterator, seq));
48 iter.iter_flags |= TRACE_FILE_LAT_FMT; 41 iter.iter_flags |= TRACE_FILE_LAT_FMT;
49 iter.pos = -1;
50 42
51 if (cpu_file == RING_BUFFER_ALL_CPUS) { 43 if (cpu_file == RING_BUFFER_ALL_CPUS) {
52 for_each_tracing_cpu(cpu) { 44 for_each_tracing_cpu(cpu) {
@@ -70,11 +62,11 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file)
70 kdb_printf("---------------------------------\n"); 62 kdb_printf("---------------------------------\n");
71 cnt++; 63 cnt++;
72 64
73 if (!skip_lines) { 65 if (!skip_entries) {
74 print_trace_line(&iter); 66 print_trace_line(&iter);
75 trace_printk_seq(&iter.seq); 67 trace_printk_seq(&iter.seq);
76 } else { 68 } else {
77 skip_lines--; 69 skip_entries--;
78 } 70 }
79 71
80 if (KDB_FLAG(CMD_INTERRUPT)) 72 if (KDB_FLAG(CMD_INTERRUPT))
@@ -90,10 +82,6 @@ out:
90 tr->trace_flags = old_userobj; 82 tr->trace_flags = old_userobj;
91 83
92 for_each_tracing_cpu(cpu) { 84 for_each_tracing_cpu(cpu) {
93 atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
94 }
95
96 for_each_tracing_cpu(cpu) {
97 if (iter.buffer_iter[cpu]) { 85 if (iter.buffer_iter[cpu]) {
98 ring_buffer_read_finish(iter.buffer_iter[cpu]); 86 ring_buffer_read_finish(iter.buffer_iter[cpu]);
99 iter.buffer_iter[cpu] = NULL; 87 iter.buffer_iter[cpu] = NULL;
@@ -106,17 +94,19 @@ out:
106 */ 94 */
107static int kdb_ftdump(int argc, const char **argv) 95static int kdb_ftdump(int argc, const char **argv)
108{ 96{
109 int skip_lines = 0; 97 int skip_entries = 0;
110 long cpu_file; 98 long cpu_file;
111 char *cp; 99 char *cp;
100 int cnt;
101 int cpu;
112 102
113 if (argc > 2) 103 if (argc > 2)
114 return KDB_ARGCOUNT; 104 return KDB_ARGCOUNT;
115 105
116 if (argc) { 106 if (argc) {
117 skip_lines = simple_strtol(argv[1], &cp, 0); 107 skip_entries = simple_strtol(argv[1], &cp, 0);
118 if (*cp) 108 if (*cp)
119 skip_lines = 0; 109 skip_entries = 0;
120 } 110 }
121 111
122 if (argc == 2) { 112 if (argc == 2) {
@@ -129,7 +119,29 @@ static int kdb_ftdump(int argc, const char **argv)
129 } 119 }
130 120
131 kdb_trap_printk++; 121 kdb_trap_printk++;
132 ftrace_dump_buf(skip_lines, cpu_file); 122
123 trace_init_global_iter(&iter);
124 iter.buffer_iter = buffer_iter;
125
126 for_each_tracing_cpu(cpu) {
127 atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
128 }
129
130 /* A negative skip_entries means skip all but the last entries */
131 if (skip_entries < 0) {
132 if (cpu_file == RING_BUFFER_ALL_CPUS)
133 cnt = trace_total_entries(NULL);
134 else
135 cnt = trace_total_entries_cpu(NULL, cpu_file);
136 skip_entries = max(cnt + skip_entries, 0);
137 }
138
139 ftrace_dump_buf(skip_entries, cpu_file);
140
141 for_each_tracing_cpu(cpu) {
142 atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
143 }
144
133 kdb_trap_printk--; 145 kdb_trap_printk--;
134 146
135 return 0; 147 return 0;
@@ -137,8 +149,9 @@ static int kdb_ftdump(int argc, const char **argv)
137 149
138static __init int kdb_ftrace_register(void) 150static __init int kdb_ftrace_register(void)
139{ 151{
140 kdb_register_flags("ftdump", kdb_ftdump, "[skip_#lines] [cpu]", 152 kdb_register_flags("ftdump", kdb_ftdump, "[skip_#entries] [cpu]",
141 "Dump ftrace log", 0, KDB_ENABLE_ALWAYS_SAFE); 153 "Dump ftrace log; -skip dumps last #entries", 0,
154 KDB_ENABLE_ALWAYS_SAFE);
142 return 0; 155 return 0;
143} 156}
144 157
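
kdb_ftdump() now accepts a negative count meaning skip all but the last N entries: the skip becomes the total entry count plus the (negative) argument, clamped at zero. The arithmetic in isolation:

#include <stdio.h>

static int resolve_skip(int total_entries, int skip_entries)
{
        if (skip_entries < 0) {
                skip_entries += total_entries;
                if (skip_entries < 0)
                        skip_entries = 0;       /* max(cnt + skip_entries, 0) */
        }
        return skip_entries;
}

int main(void)
{
        printf("%d\n", resolve_skip(1000, -50));  /* skip 950, dump the last 50 */
        printf("%d\n", resolve_skip(30, -50));    /* fewer entries than asked: 0 */
        return 0;
}
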
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 5d5129b05df7..7d736248a070 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -441,13 +441,8 @@ static int __register_trace_kprobe(struct trace_kprobe *tk)
441 else 441 else
442 ret = register_kprobe(&tk->rp.kp); 442 ret = register_kprobe(&tk->rp.kp);
443 443
444 if (ret == 0) { 444 if (ret == 0)
445 tk->tp.flags |= TP_FLAG_REGISTERED; 445 tk->tp.flags |= TP_FLAG_REGISTERED;
446 } else if (ret == -EILSEQ) {
447 pr_warn("Probing address(0x%p) is not an instruction boundary.\n",
448 tk->rp.kp.addr);
449 ret = -EINVAL;
450 }
451 return ret; 446 return ret;
452} 447}
453 448
@@ -591,7 +586,7 @@ static int trace_kprobe_create(int argc, const char *argv[])
591 * Type of args: 586 * Type of args:
592 * FETCHARG:TYPE : use TYPE instead of unsigned long. 587 * FETCHARG:TYPE : use TYPE instead of unsigned long.
593 */ 588 */
594 struct trace_kprobe *tk; 589 struct trace_kprobe *tk = NULL;
595 int i, len, ret = 0; 590 int i, len, ret = 0;
596 bool is_return = false; 591 bool is_return = false;
597 char *symbol = NULL, *tmp = NULL; 592 char *symbol = NULL, *tmp = NULL;
@@ -615,44 +610,50 @@ static int trace_kprobe_create(int argc, const char *argv[])
615 if (argc < 2) 610 if (argc < 2)
616 return -ECANCELED; 611 return -ECANCELED;
617 612
613 trace_probe_log_init("trace_kprobe", argc, argv);
614
618 event = strchr(&argv[0][1], ':'); 615 event = strchr(&argv[0][1], ':');
619 if (event) 616 if (event)
620 event++; 617 event++;
621 618
622 if (isdigit(argv[0][1])) { 619 if (isdigit(argv[0][1])) {
623 if (!is_return) { 620 if (!is_return) {
624 pr_info("Maxactive is not for kprobe"); 621 trace_probe_log_err(1, MAXACT_NO_KPROBE);
625 return -EINVAL; 622 goto parse_error;
626 } 623 }
627 if (event) 624 if (event)
628 len = event - &argv[0][1] - 1; 625 len = event - &argv[0][1] - 1;
629 else 626 else
630 len = strlen(&argv[0][1]); 627 len = strlen(&argv[0][1]);
631 if (len > MAX_EVENT_NAME_LEN - 1) 628 if (len > MAX_EVENT_NAME_LEN - 1) {
632 return -E2BIG; 629 trace_probe_log_err(1, BAD_MAXACT);
630 goto parse_error;
631 }
633 memcpy(buf, &argv[0][1], len); 632 memcpy(buf, &argv[0][1], len);
634 buf[len] = '\0'; 633 buf[len] = '\0';
635 ret = kstrtouint(buf, 0, &maxactive); 634 ret = kstrtouint(buf, 0, &maxactive);
636 if (ret || !maxactive) { 635 if (ret || !maxactive) {
637 pr_info("Invalid maxactive number\n"); 636 trace_probe_log_err(1, BAD_MAXACT);
638 return ret; 637 goto parse_error;
639 } 638 }
640 /* kretprobes instances are iterated over via a list. The 639 /* kretprobes instances are iterated over via a list. The
641 * maximum should stay reasonable. 640 * maximum should stay reasonable.
642 */ 641 */
643 if (maxactive > KRETPROBE_MAXACTIVE_MAX) { 642 if (maxactive > KRETPROBE_MAXACTIVE_MAX) {
644 pr_info("Maxactive is too big (%d > %d).\n", 643 trace_probe_log_err(1, MAXACT_TOO_BIG);
645 maxactive, KRETPROBE_MAXACTIVE_MAX); 644 goto parse_error;
646 return -E2BIG;
647 } 645 }
648 } 646 }
649 647
650 /* try to parse an address. if that fails, try to read the 648 /* try to parse an address. if that fails, try to read the
651 * input as a symbol. */ 649 * input as a symbol. */
652 if (kstrtoul(argv[1], 0, (unsigned long *)&addr)) { 650 if (kstrtoul(argv[1], 0, (unsigned long *)&addr)) {
651 trace_probe_log_set_index(1);
653 /* Check whether uprobe event specified */ 652 /* Check whether uprobe event specified */
654 if (strchr(argv[1], '/') && strchr(argv[1], ':')) 653 if (strchr(argv[1], '/') && strchr(argv[1], ':')) {
655 return -ECANCELED; 654 ret = -ECANCELED;
655 goto error;
656 }
656 /* a symbol specified */ 657 /* a symbol specified */
657 symbol = kstrdup(argv[1], GFP_KERNEL); 658 symbol = kstrdup(argv[1], GFP_KERNEL);
658 if (!symbol) 659 if (!symbol)
@@ -660,23 +661,23 @@ static int trace_kprobe_create(int argc, const char *argv[])
660 /* TODO: support .init module functions */ 661 /* TODO: support .init module functions */
661 ret = traceprobe_split_symbol_offset(symbol, &offset); 662 ret = traceprobe_split_symbol_offset(symbol, &offset);
662 if (ret || offset < 0 || offset > UINT_MAX) { 663 if (ret || offset < 0 || offset > UINT_MAX) {
663 pr_info("Failed to parse either an address or a symbol.\n"); 664 trace_probe_log_err(0, BAD_PROBE_ADDR);
664 goto out; 665 goto parse_error;
665 } 666 }
666 if (kprobe_on_func_entry(NULL, symbol, offset)) 667 if (kprobe_on_func_entry(NULL, symbol, offset))
667 flags |= TPARG_FL_FENTRY; 668 flags |= TPARG_FL_FENTRY;
668 if (offset && is_return && !(flags & TPARG_FL_FENTRY)) { 669 if (offset && is_return && !(flags & TPARG_FL_FENTRY)) {
669 pr_info("Given offset is not valid for return probe.\n"); 670 trace_probe_log_err(0, BAD_RETPROBE);
670 ret = -EINVAL; 671 goto parse_error;
671 goto out;
672 } 672 }
673 } 673 }
674 argc -= 2; argv += 2;
675 674
675 trace_probe_log_set_index(0);
676 if (event) { 676 if (event) {
677 ret = traceprobe_parse_event_name(&event, &group, buf); 677 ret = traceprobe_parse_event_name(&event, &group, buf,
678 event - argv[0]);
678 if (ret) 679 if (ret)
679 goto out; 680 goto parse_error;
680 } else { 681 } else {
681 /* Make a new event name */ 682 /* Make a new event name */
682 if (symbol) 683 if (symbol)
@@ -691,13 +692,14 @@ static int trace_kprobe_create(int argc, const char *argv[])
691 692
692 /* setup a probe */ 693 /* setup a probe */
693 tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive, 694 tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive,
694 argc, is_return); 695 argc - 2, is_return);
695 if (IS_ERR(tk)) { 696 if (IS_ERR(tk)) {
696 ret = PTR_ERR(tk); 697 ret = PTR_ERR(tk);
697 /* This must return -ENOMEM otherwise there is a bug */ 698 /* This must return -ENOMEM, else there is a bug */
698 WARN_ON_ONCE(ret != -ENOMEM); 699 WARN_ON_ONCE(ret != -ENOMEM);
699 goto out; 700 goto out; /* We know tk is not allocated */
700 } 701 }
702 argc -= 2; argv += 2;
701 703
702 /* parse arguments */ 704 /* parse arguments */
703 for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { 705 for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
@@ -707,19 +709,32 @@ static int trace_kprobe_create(int argc, const char *argv[])
707 goto error; 709 goto error;
708 } 710 }
709 711
712 trace_probe_log_set_index(i + 2);
710 ret = traceprobe_parse_probe_arg(&tk->tp, i, tmp, flags); 713 ret = traceprobe_parse_probe_arg(&tk->tp, i, tmp, flags);
711 kfree(tmp); 714 kfree(tmp);
712 if (ret) 715 if (ret)
713 goto error; 716 goto error; /* This can be -ENOMEM */
714 } 717 }
715 718
716 ret = register_trace_kprobe(tk); 719 ret = register_trace_kprobe(tk);
717 if (ret) 720 if (ret) {
721 trace_probe_log_set_index(1);
722 if (ret == -EILSEQ)
723 trace_probe_log_err(0, BAD_INSN_BNDRY);
724 else if (ret == -ENOENT)
725 trace_probe_log_err(0, BAD_PROBE_ADDR);
726 else if (ret != -ENOMEM)
727 trace_probe_log_err(0, FAIL_REG_PROBE);
718 goto error; 728 goto error;
729 }
730
719out: 731out:
732 trace_probe_log_clear();
720 kfree(symbol); 733 kfree(symbol);
721 return ret; 734 return ret;
722 735
736parse_error:
737 ret = -EINVAL;
723error: 738error:
724 free_trace_kprobe(tk); 739 free_trace_kprobe(tk);
725 goto out; 740 goto out;
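
register_trace_kprobe() failures are now mapped onto log codes at the call site instead of the old single pr_warn() for -EILSEQ inside __register_trace_kprobe(). A sketch of that mapping, with the code names quoted as strings for the demo:

#include <errno.h>
#include <stdio.h>

static const char *map_register_error(int err)
{
        switch (err) {
        case -EILSEQ:
                return "BAD_INSN_BNDRY";        /* not an instruction boundary */
        case -ENOENT:
                return "BAD_PROBE_ADDR";        /* symbol or address does not exist */
        case -ENOMEM:
                return NULL;                    /* allocation failure is not logged */
        default:
                return "FAIL_REG_PROBE";
        }
}

int main(void)
{
        const char *s = map_register_error(-ENOMEM);

        printf("-EILSEQ -> %s\n", map_register_error(-EILSEQ));
        printf("-ENOENT -> %s\n", map_register_error(-ENOENT));
        printf("-ENOMEM -> %s\n", s ? s : "(no log entry)");
        return 0;
}
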
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 54373d93e251..ba751f993c3b 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -1057,7 +1057,7 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
1057 1057
1058 trace_seq_puts(s, "<stack trace>\n"); 1058 trace_seq_puts(s, "<stack trace>\n");
1059 1059
1060 for (p = field->caller; p && *p != ULONG_MAX && p < end; p++) { 1060 for (p = field->caller; p && p < end && *p != ULONG_MAX; p++) {
1061 1061
1062 if (trace_seq_has_overflowed(s)) 1062 if (trace_seq_has_overflowed(s))
1063 break; 1063 break;
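
The trace_stack_print() change is an ordering fix: test p < end before dereferencing *p, so a stack record that lacks the ULONG_MAX terminator never reads one element past the buffer. The same pattern in isolation:

#include <limits.h>
#include <stdio.h>

int main(void)
{
        /* a record that is not terminated by ULONG_MAX inside the buffer */
        unsigned long caller[3] = { 0x1, 0x2, 0x3 };
        unsigned long *p, *end = caller + 3;

        /* bounds first: with the old order, *p would be read once p == end */
        for (p = caller; p && p < end && *p != ULONG_MAX; p++)
                printf("%#lx\n", *p);
        return 0;
}
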
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 8f8411e7835f..a347faced959 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -13,6 +13,11 @@
13 13
14#include "trace_probe.h" 14#include "trace_probe.h"
15 15
16#undef C
17#define C(a, b) b
18
19static const char *trace_probe_err_text[] = { ERRORS };
20
16static const char *reserved_field_names[] = { 21static const char *reserved_field_names[] = {
17 "common_type", 22 "common_type",
18 "common_flags", 23 "common_flags",
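
trace_probe.c redefines C() to pick the message half of each ERRORS entry, while the header side can redefine it to build a matching enum: the usual x-macro trick, so the list is written once. A self-contained illustration of the pattern (the TP_ERR_ prefix and the two sample entries are assumptions of this demo):

#include <stdio.h>

#define ERRORS \
        C(FILE_NOT_FOUND, "Failed to find the given file"), \
        C(BAD_MAXACT, "Invalid maxactive number"),

#undef C
#define C(a, b) TP_ERR_##a
enum { ERRORS };                /* TP_ERR_FILE_NOT_FOUND, TP_ERR_BAD_MAXACT */

#undef C
#define C(a, b) b
static const char *demo_err_text[] = { ERRORS };

int main(void)
{
        printf("%s\n", demo_err_text[TP_ERR_BAD_MAXACT]);
        return 0;
}

Because both expansions come from the same ERRORS list, the enum indices and the text table can never drift apart.
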
@@ -133,6 +138,60 @@ fail:
133 return NULL; 138 return NULL;
134} 139}
135 140
141static struct trace_probe_log trace_probe_log;
142
143void trace_probe_log_init(const char *subsystem, int argc, const char **argv)
144{
145 trace_probe_log.subsystem = subsystem;
146 trace_probe_log.argc = argc;
147 trace_probe_log.argv = argv;
148 trace_probe_log.index = 0;
149}
150
151void trace_probe_log_clear(void)
152{
153 memset(&trace_probe_log, 0, sizeof(trace_probe_log));
154}
155
156void trace_probe_log_set_index(int index)
157{
158 trace_probe_log.index = index;
159}
160
161void __trace_probe_log_err(int offset, int err_type)
162{
163 char *command, *p;
164 int i, len = 0, pos = 0;
165
166 if (!trace_probe_log.argv)
167 return;
168
 169 /* Recalculate the length and allocate buffer */
170 for (i = 0; i < trace_probe_log.argc; i++) {
171 if (i == trace_probe_log.index)
172 pos = len;
173 len += strlen(trace_probe_log.argv[i]) + 1;
174 }
175 command = kzalloc(len, GFP_KERNEL);
176 if (!command)
177 return;
178
179 /* And make a command string from argv array */
180 p = command;
181 for (i = 0; i < trace_probe_log.argc; i++) {
182 len = strlen(trace_probe_log.argv[i]);
183 strcpy(p, trace_probe_log.argv[i]);
184 p[len] = ' ';
185 p += len + 1;
186 }
187 *(p - 1) = '\0';
188
189 tracing_log_err(NULL, trace_probe_log.subsystem, command,
190 trace_probe_err_text, err_type, pos + offset);
191
192 kfree(command);
193}
194
136/* Split symbol and offset. */ 195/* Split symbol and offset. */
137int traceprobe_split_symbol_offset(char *symbol, long *offset) 196int traceprobe_split_symbol_offset(char *symbol, long *offset)
138{ 197{
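
__trace_probe_log_err() rebuilds the command from the saved argv and reports a single column: the lengths of the arguments before the indexed one, plus one joining space each, plus the offset inside that argument. The position arithmetic in isolation (the argument strings are just examples):

#include <stdio.h>
#include <string.h>

static int log_pos(int argc, const char **argv, int index, int offset)
{
        int i, pos = 0;

        for (i = 0; i < index && i < argc; i++)
                pos += strlen(argv[i]) + 1;     /* +1 for the joining space */
        return pos + offset;
}

int main(void)
{
        const char *argv[] = { "p:myprobe", "do_sys_open", "dfd=%badreg" };

        /* error inside argv[2], 4 characters in ("%badreg") */
        printf("caret column %d\n", log_pos(3, argv, 2, 4));
        return 0;
}
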
@@ -156,7 +215,7 @@ int traceprobe_split_symbol_offset(char *symbol, long *offset)
156 215
 157/* @buf must have MAX_EVENT_NAME_LEN size */ 216/* @buf must have MAX_EVENT_NAME_LEN size */
158int traceprobe_parse_event_name(const char **pevent, const char **pgroup, 217int traceprobe_parse_event_name(const char **pevent, const char **pgroup,
159 char *buf) 218 char *buf, int offset)
160{ 219{
161 const char *slash, *event = *pevent; 220 const char *slash, *event = *pevent;
162 int len; 221 int len;
@@ -164,32 +223,33 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup,
164 slash = strchr(event, '/'); 223 slash = strchr(event, '/');
165 if (slash) { 224 if (slash) {
166 if (slash == event) { 225 if (slash == event) {
167 pr_info("Group name is not specified\n"); 226 trace_probe_log_err(offset, NO_GROUP_NAME);
168 return -EINVAL; 227 return -EINVAL;
169 } 228 }
170 if (slash - event + 1 > MAX_EVENT_NAME_LEN) { 229 if (slash - event + 1 > MAX_EVENT_NAME_LEN) {
171 pr_info("Group name is too long\n"); 230 trace_probe_log_err(offset, GROUP_TOO_LONG);
172 return -E2BIG; 231 return -EINVAL;
173 } 232 }
174 strlcpy(buf, event, slash - event + 1); 233 strlcpy(buf, event, slash - event + 1);
175 if (!is_good_name(buf)) { 234 if (!is_good_name(buf)) {
176 pr_info("Group name must follow the same rules as C identifiers\n"); 235 trace_probe_log_err(offset, BAD_GROUP_NAME);
177 return -EINVAL; 236 return -EINVAL;
178 } 237 }
179 *pgroup = buf; 238 *pgroup = buf;
180 *pevent = slash + 1; 239 *pevent = slash + 1;
240 offset += slash - event + 1;
181 event = *pevent; 241 event = *pevent;
182 } 242 }
183 len = strlen(event); 243 len = strlen(event);
184 if (len == 0) { 244 if (len == 0) {
185 pr_info("Event name is not specified\n"); 245 trace_probe_log_err(offset, NO_EVENT_NAME);
186 return -EINVAL; 246 return -EINVAL;
187 } else if (len > MAX_EVENT_NAME_LEN) { 247 } else if (len > MAX_EVENT_NAME_LEN) {
188 pr_info("Event name is too long\n"); 248 trace_probe_log_err(offset, EVENT_TOO_LONG);
189 return -E2BIG; 249 return -EINVAL;
190 } 250 }
191 if (!is_good_name(event)) { 251 if (!is_good_name(event)) {
192 pr_info("Event name must follow the same rules as C identifiers\n"); 252 trace_probe_log_err(offset, BAD_EVENT_NAME);
193 return -EINVAL; 253 return -EINVAL;
194 } 254 }
195 return 0; 255 return 0;
@@ -198,56 +258,67 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup,
198#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) 258#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long))
199 259
200static int parse_probe_vars(char *arg, const struct fetch_type *t, 260static int parse_probe_vars(char *arg, const struct fetch_type *t,
201 struct fetch_insn *code, unsigned int flags) 261 struct fetch_insn *code, unsigned int flags, int offs)
202{ 262{
203 unsigned long param; 263 unsigned long param;
204 int ret = 0; 264 int ret = 0;
205 int len; 265 int len;
206 266
207 if (strcmp(arg, "retval") == 0) { 267 if (strcmp(arg, "retval") == 0) {
208 if (flags & TPARG_FL_RETURN) 268 if (flags & TPARG_FL_RETURN) {
209 code->op = FETCH_OP_RETVAL; 269 code->op = FETCH_OP_RETVAL;
210 else 270 } else {
271 trace_probe_log_err(offs, RETVAL_ON_PROBE);
211 ret = -EINVAL; 272 ret = -EINVAL;
273 }
212 } else if ((len = str_has_prefix(arg, "stack"))) { 274 } else if ((len = str_has_prefix(arg, "stack"))) {
213 if (arg[len] == '\0') { 275 if (arg[len] == '\0') {
214 code->op = FETCH_OP_STACKP; 276 code->op = FETCH_OP_STACKP;
215 } else if (isdigit(arg[len])) { 277 } else if (isdigit(arg[len])) {
216 ret = kstrtoul(arg + len, 10, &param); 278 ret = kstrtoul(arg + len, 10, &param);
217 if (ret || ((flags & TPARG_FL_KERNEL) && 279 if (ret) {
218 param > PARAM_MAX_STACK)) 280 goto inval_var;
281 } else if ((flags & TPARG_FL_KERNEL) &&
282 param > PARAM_MAX_STACK) {
283 trace_probe_log_err(offs, BAD_STACK_NUM);
219 ret = -EINVAL; 284 ret = -EINVAL;
220 else { 285 } else {
221 code->op = FETCH_OP_STACK; 286 code->op = FETCH_OP_STACK;
222 code->param = (unsigned int)param; 287 code->param = (unsigned int)param;
223 } 288 }
224 } else 289 } else
225 ret = -EINVAL; 290 goto inval_var;
226 } else if (strcmp(arg, "comm") == 0) { 291 } else if (strcmp(arg, "comm") == 0) {
227 code->op = FETCH_OP_COMM; 292 code->op = FETCH_OP_COMM;
228#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API 293#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
229 } else if (((flags & TPARG_FL_MASK) == 294 } else if (((flags & TPARG_FL_MASK) ==
230 (TPARG_FL_KERNEL | TPARG_FL_FENTRY)) && 295 (TPARG_FL_KERNEL | TPARG_FL_FENTRY)) &&
231 (len = str_has_prefix(arg, "arg"))) { 296 (len = str_has_prefix(arg, "arg"))) {
232 if (!isdigit(arg[len]))
233 return -EINVAL;
234 ret = kstrtoul(arg + len, 10, &param); 297 ret = kstrtoul(arg + len, 10, &param);
235 if (ret || !param || param > PARAM_MAX_STACK) 298 if (ret) {
299 goto inval_var;
300 } else if (!param || param > PARAM_MAX_STACK) {
301 trace_probe_log_err(offs, BAD_ARG_NUM);
236 return -EINVAL; 302 return -EINVAL;
303 }
237 code->op = FETCH_OP_ARG; 304 code->op = FETCH_OP_ARG;
238 code->param = (unsigned int)param - 1; 305 code->param = (unsigned int)param - 1;
239#endif 306#endif
240 } else 307 } else
241 ret = -EINVAL; 308 goto inval_var;
242 309
243 return ret; 310 return ret;
311
312inval_var:
313 trace_probe_log_err(offs, BAD_VAR);
314 return -EINVAL;
244} 315}
245 316
246/* Recursive argument parser */ 317/* Recursive argument parser */
247static int 318static int
248parse_probe_arg(char *arg, const struct fetch_type *type, 319parse_probe_arg(char *arg, const struct fetch_type *type,
249 struct fetch_insn **pcode, struct fetch_insn *end, 320 struct fetch_insn **pcode, struct fetch_insn *end,
250 unsigned int flags) 321 unsigned int flags, int offs)
251{ 322{
252 struct fetch_insn *code = *pcode; 323 struct fetch_insn *code = *pcode;
253 unsigned long param; 324 unsigned long param;
@@ -257,7 +328,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
257 328
258 switch (arg[0]) { 329 switch (arg[0]) {
259 case '$': 330 case '$':
260 ret = parse_probe_vars(arg + 1, type, code, flags); 331 ret = parse_probe_vars(arg + 1, type, code, flags, offs);
261 break; 332 break;
262 333
263 case '%': /* named register */ 334 case '%': /* named register */
@@ -266,47 +337,57 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
266 code->op = FETCH_OP_REG; 337 code->op = FETCH_OP_REG;
267 code->param = (unsigned int)ret; 338 code->param = (unsigned int)ret;
268 ret = 0; 339 ret = 0;
269 } 340 } else
341 trace_probe_log_err(offs, BAD_REG_NAME);
270 break; 342 break;
271 343
272 case '@': /* memory, file-offset or symbol */ 344 case '@': /* memory, file-offset or symbol */
273 if (isdigit(arg[1])) { 345 if (isdigit(arg[1])) {
274 ret = kstrtoul(arg + 1, 0, &param); 346 ret = kstrtoul(arg + 1, 0, &param);
275 if (ret) 347 if (ret) {
348 trace_probe_log_err(offs, BAD_MEM_ADDR);
276 break; 349 break;
350 }
277 /* load address */ 351 /* load address */
278 code->op = FETCH_OP_IMM; 352 code->op = FETCH_OP_IMM;
279 code->immediate = param; 353 code->immediate = param;
280 } else if (arg[1] == '+') { 354 } else if (arg[1] == '+') {
281 /* kprobes don't support file offsets */ 355 /* kprobes don't support file offsets */
282 if (flags & TPARG_FL_KERNEL) 356 if (flags & TPARG_FL_KERNEL) {
357 trace_probe_log_err(offs, FILE_ON_KPROBE);
283 return -EINVAL; 358 return -EINVAL;
284 359 }
285 ret = kstrtol(arg + 2, 0, &offset); 360 ret = kstrtol(arg + 2, 0, &offset);
286 if (ret) 361 if (ret) {
362 trace_probe_log_err(offs, BAD_FILE_OFFS);
287 break; 363 break;
364 }
288 365
289 code->op = FETCH_OP_FOFFS; 366 code->op = FETCH_OP_FOFFS;
290 code->immediate = (unsigned long)offset; // imm64? 367 code->immediate = (unsigned long)offset; // imm64?
291 } else { 368 } else {
292 /* uprobes don't support symbols */ 369 /* uprobes don't support symbols */
293 if (!(flags & TPARG_FL_KERNEL)) 370 if (!(flags & TPARG_FL_KERNEL)) {
371 trace_probe_log_err(offs, SYM_ON_UPROBE);
294 return -EINVAL; 372 return -EINVAL;
295 373 }
296 /* Preserve symbol for updating */ 374 /* Preserve symbol for updating */
297 code->op = FETCH_NOP_SYMBOL; 375 code->op = FETCH_NOP_SYMBOL;
298 code->data = kstrdup(arg + 1, GFP_KERNEL); 376 code->data = kstrdup(arg + 1, GFP_KERNEL);
299 if (!code->data) 377 if (!code->data)
300 return -ENOMEM; 378 return -ENOMEM;
301 if (++code == end) 379 if (++code == end) {
302 return -E2BIG; 380 trace_probe_log_err(offs, TOO_MANY_OPS);
303 381 return -EINVAL;
382 }
304 code->op = FETCH_OP_IMM; 383 code->op = FETCH_OP_IMM;
305 code->immediate = 0; 384 code->immediate = 0;
306 } 385 }
307 /* These are fetching from memory */ 386 /* These are fetching from memory */
308 if (++code == end) 387 if (++code == end) {
309 return -E2BIG; 388 trace_probe_log_err(offs, TOO_MANY_OPS);
389 return -EINVAL;
390 }
310 *pcode = code; 391 *pcode = code;
311 code->op = FETCH_OP_DEREF; 392 code->op = FETCH_OP_DEREF;
312 code->offset = offset; 393 code->offset = offset;
@@ -317,28 +398,38 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
317 /* fall through */ 398 /* fall through */
318 case '-': 399 case '-':
319 tmp = strchr(arg, '('); 400 tmp = strchr(arg, '(');
320 if (!tmp) 401 if (!tmp) {
402 trace_probe_log_err(offs, DEREF_NEED_BRACE);
321 return -EINVAL; 403 return -EINVAL;
322 404 }
323 *tmp = '\0'; 405 *tmp = '\0';
324 ret = kstrtol(arg, 0, &offset); 406 ret = kstrtol(arg, 0, &offset);
325 if (ret) 407 if (ret) {
408 trace_probe_log_err(offs, BAD_DEREF_OFFS);
326 break; 409 break;
327 410 }
411 offs += (tmp + 1 - arg) + (arg[0] != '-' ? 1 : 0);
328 arg = tmp + 1; 412 arg = tmp + 1;
329 tmp = strrchr(arg, ')'); 413 tmp = strrchr(arg, ')');
330 414 if (!tmp) {
331 if (tmp) { 415 trace_probe_log_err(offs + strlen(arg),
416 DEREF_OPEN_BRACE);
417 return -EINVAL;
418 } else {
332 const struct fetch_type *t2 = find_fetch_type(NULL); 419 const struct fetch_type *t2 = find_fetch_type(NULL);
333 420
334 *tmp = '\0'; 421 *tmp = '\0';
335 ret = parse_probe_arg(arg, t2, &code, end, flags); 422 ret = parse_probe_arg(arg, t2, &code, end, flags, offs);
336 if (ret) 423 if (ret)
337 break; 424 break;
338 if (code->op == FETCH_OP_COMM) 425 if (code->op == FETCH_OP_COMM) {
426 trace_probe_log_err(offs, COMM_CANT_DEREF);
339 return -EINVAL; 427 return -EINVAL;
340 if (++code == end) 428 }
341 return -E2BIG; 429 if (++code == end) {
430 trace_probe_log_err(offs, TOO_MANY_OPS);
431 return -EINVAL;
432 }
342 *pcode = code; 433 *pcode = code;
343 434
344 code->op = FETCH_OP_DEREF; 435 code->op = FETCH_OP_DEREF;
@@ -348,6 +439,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
348 } 439 }
349 if (!ret && code->op == FETCH_OP_NOP) { 440 if (!ret && code->op == FETCH_OP_NOP) {
350 /* Parsed, but do not find fetch method */ 441 /* Parsed, but do not find fetch method */
442 trace_probe_log_err(offs, BAD_FETCH_ARG);
351 ret = -EINVAL; 443 ret = -EINVAL;
352 } 444 }
353 return ret; 445 return ret;
@@ -379,7 +471,7 @@ static int __parse_bitfield_probe_arg(const char *bf,
379 return -EINVAL; 471 return -EINVAL;
380 code++; 472 code++;
381 if (code->op != FETCH_OP_NOP) 473 if (code->op != FETCH_OP_NOP)
382 return -E2BIG; 474 return -EINVAL;
383 *pcode = code; 475 *pcode = code;
384 476
385 code->op = FETCH_OP_MOD_BF; 477 code->op = FETCH_OP_MOD_BF;
@@ -392,44 +484,66 @@ static int __parse_bitfield_probe_arg(const char *bf,
392 484
393/* String length checking wrapper */ 485/* String length checking wrapper */
394static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, 486static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
395 struct probe_arg *parg, unsigned int flags) 487 struct probe_arg *parg, unsigned int flags, int offset)
396{ 488{
397 struct fetch_insn *code, *scode, *tmp = NULL; 489 struct fetch_insn *code, *scode, *tmp = NULL;
398 char *t, *t2; 490 char *t, *t2, *t3;
399 int ret, len; 491 int ret, len;
400 492
401 if (strlen(arg) > MAX_ARGSTR_LEN) { 493 len = strlen(arg);
402 pr_info("Argument is too long.: %s\n", arg); 494 if (len > MAX_ARGSTR_LEN) {
403 return -ENOSPC; 495 trace_probe_log_err(offset, ARG_TOO_LONG);
496 return -EINVAL;
497 } else if (len == 0) {
498 trace_probe_log_err(offset, NO_ARG_BODY);
499 return -EINVAL;
404 } 500 }
501
405 parg->comm = kstrdup(arg, GFP_KERNEL); 502 parg->comm = kstrdup(arg, GFP_KERNEL);
406 if (!parg->comm) { 503 if (!parg->comm)
407 pr_info("Failed to allocate memory for command '%s'.\n", arg);
408 return -ENOMEM; 504 return -ENOMEM;
409 } 505
410 t = strchr(arg, ':'); 506 t = strchr(arg, ':');
411 if (t) { 507 if (t) {
412 *t = '\0'; 508 *t = '\0';
413 t2 = strchr(++t, '['); 509 t2 = strchr(++t, '[');
414 if (t2) { 510 if (t2) {
415 *t2 = '\0'; 511 *t2++ = '\0';
416 parg->count = simple_strtoul(t2 + 1, &t2, 0); 512 t3 = strchr(t2, ']');
417 if (strcmp(t2, "]") || parg->count == 0) 513 if (!t3) {
514 offset += t2 + strlen(t2) - arg;
515 trace_probe_log_err(offset,
516 ARRAY_NO_CLOSE);
517 return -EINVAL;
518 } else if (t3[1] != '\0') {
519 trace_probe_log_err(offset + t3 + 1 - arg,
520 BAD_ARRAY_SUFFIX);
418 return -EINVAL; 521 return -EINVAL;
419 if (parg->count > MAX_ARRAY_LEN) 522 }
420 return -E2BIG; 523 *t3 = '\0';
524 if (kstrtouint(t2, 0, &parg->count) || !parg->count) {
525 trace_probe_log_err(offset + t2 - arg,
526 BAD_ARRAY_NUM);
527 return -EINVAL;
528 }
529 if (parg->count > MAX_ARRAY_LEN) {
530 trace_probe_log_err(offset + t2 - arg,
531 ARRAY_TOO_BIG);
532 return -EINVAL;
533 }
421 } 534 }
422 } 535 }
423 /* 536
 424 * The default type of $comm should be "string", and it can't be 537 /* Since $comm cannot be dereferenced, we can find $comm by strcmp */
425 * dereferenced. 538 if (strcmp(arg, "$comm") == 0) {
426 */ 539 /* The type of $comm must be "string", and not an array. */
427 if (!t && strcmp(arg, "$comm") == 0) 540 if (parg->count || (t && strcmp(t, "string")))
541 return -EINVAL;
428 parg->type = find_fetch_type("string"); 542 parg->type = find_fetch_type("string");
429 else 543 } else
430 parg->type = find_fetch_type(t); 544 parg->type = find_fetch_type(t);
431 if (!parg->type) { 545 if (!parg->type) {
432 pr_info("Unsupported type: %s\n", t); 546 trace_probe_log_err(offset + (t ? (t - arg) : 0), BAD_TYPE);
433 return -EINVAL; 547 return -EINVAL;
434 } 548 }
435 parg->offset = *size; 549 parg->offset = *size;
@@ -444,13 +558,13 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
444 parg->count); 558 parg->count);
445 } 559 }
446 560
447 code = tmp = kzalloc(sizeof(*code) * FETCH_INSN_MAX, GFP_KERNEL); 561 code = tmp = kcalloc(FETCH_INSN_MAX, sizeof(*code), GFP_KERNEL);
448 if (!code) 562 if (!code)
449 return -ENOMEM; 563 return -ENOMEM;
450 code[FETCH_INSN_MAX - 1].op = FETCH_OP_END; 564 code[FETCH_INSN_MAX - 1].op = FETCH_OP_END;
451 565
452 ret = parse_probe_arg(arg, parg->type, &code, &code[FETCH_INSN_MAX - 1], 566 ret = parse_probe_arg(arg, parg->type, &code, &code[FETCH_INSN_MAX - 1],
453 flags); 567 flags, offset);
454 if (ret) 568 if (ret)
455 goto fail; 569 goto fail;
456 570
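
The fetch_insn array allocation switches from kzalloc(sizeof(*code) * FETCH_INSN_MAX, ...) to kcalloc(FETCH_INSN_MAX, sizeof(*code), ...), the array-allocation form that checks the count times size multiplication for overflow. The userspace counterpart of the same idiom (the struct is a stand-in, not the kernel's fetch_insn; 16 mirrors the FETCH_INSN_MAX defined in trace_probe.h):

#include <stdlib.h>

struct fetch_insn_demo {
        int op;
        long val;
};

int main(void)
{
        /* calloc(), like kcalloc(), fails cleanly if count * size would wrap */
        struct fetch_insn_demo *code = calloc(16, sizeof(*code));

        if (!code)
                return 1;
        /* ... build the program, then shrink-copy it as the kernel code does ... */
        free(code);
        return 0;
}
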
@@ -458,7 +572,8 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
458 if (!strcmp(parg->type->name, "string")) { 572 if (!strcmp(parg->type->name, "string")) {
459 if (code->op != FETCH_OP_DEREF && code->op != FETCH_OP_IMM && 573 if (code->op != FETCH_OP_DEREF && code->op != FETCH_OP_IMM &&
460 code->op != FETCH_OP_COMM) { 574 code->op != FETCH_OP_COMM) {
461 pr_info("string only accepts memory or address.\n"); 575 trace_probe_log_err(offset + (t ? (t - arg) : 0),
576 BAD_STRING);
462 ret = -EINVAL; 577 ret = -EINVAL;
463 goto fail; 578 goto fail;
464 } 579 }
@@ -470,7 +585,8 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
470 */ 585 */
471 code++; 586 code++;
472 if (code->op != FETCH_OP_NOP) { 587 if (code->op != FETCH_OP_NOP) {
473 ret = -E2BIG; 588 trace_probe_log_err(offset, TOO_MANY_OPS);
589 ret = -EINVAL;
474 goto fail; 590 goto fail;
475 } 591 }
476 } 592 }
@@ -483,7 +599,8 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
483 } else { 599 } else {
484 code++; 600 code++;
485 if (code->op != FETCH_OP_NOP) { 601 if (code->op != FETCH_OP_NOP) {
486 ret = -E2BIG; 602 trace_probe_log_err(offset, TOO_MANY_OPS);
603 ret = -EINVAL;
487 goto fail; 604 goto fail;
488 } 605 }
489 code->op = FETCH_OP_ST_RAW; 606 code->op = FETCH_OP_ST_RAW;
@@ -493,20 +610,24 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
493 /* Modify operation */ 610 /* Modify operation */
494 if (t != NULL) { 611 if (t != NULL) {
495 ret = __parse_bitfield_probe_arg(t, parg->type, &code); 612 ret = __parse_bitfield_probe_arg(t, parg->type, &code);
496 if (ret) 613 if (ret) {
614 trace_probe_log_err(offset + t - arg, BAD_BITFIELD);
497 goto fail; 615 goto fail;
616 }
498 } 617 }
499 /* Loop(Array) operation */ 618 /* Loop(Array) operation */
500 if (parg->count) { 619 if (parg->count) {
501 if (scode->op != FETCH_OP_ST_MEM && 620 if (scode->op != FETCH_OP_ST_MEM &&
502 scode->op != FETCH_OP_ST_STRING) { 621 scode->op != FETCH_OP_ST_STRING) {
503 pr_info("array only accepts memory or address\n"); 622 trace_probe_log_err(offset + (t ? (t - arg) : 0),
623 BAD_STRING);
504 ret = -EINVAL; 624 ret = -EINVAL;
505 goto fail; 625 goto fail;
506 } 626 }
507 code++; 627 code++;
508 if (code->op != FETCH_OP_NOP) { 628 if (code->op != FETCH_OP_NOP) {
509 ret = -E2BIG; 629 trace_probe_log_err(offset, TOO_MANY_OPS);
630 ret = -EINVAL;
510 goto fail; 631 goto fail;
511 } 632 }
512 code->op = FETCH_OP_LP_ARRAY; 633 code->op = FETCH_OP_LP_ARRAY;
@@ -516,7 +637,7 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
516 code->op = FETCH_OP_END; 637 code->op = FETCH_OP_END;
517 638
518 /* Shrink down the code buffer */ 639 /* Shrink down the code buffer */
519 parg->code = kzalloc(sizeof(*code) * (code - tmp + 1), GFP_KERNEL); 640 parg->code = kcalloc(code - tmp + 1, sizeof(*code), GFP_KERNEL);
520 if (!parg->code) 641 if (!parg->code)
521 ret = -ENOMEM; 642 ret = -ENOMEM;
522 else 643 else
@@ -555,15 +676,19 @@ int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, char *arg,
555{ 676{
556 struct probe_arg *parg = &tp->args[i]; 677 struct probe_arg *parg = &tp->args[i];
557 char *body; 678 char *body;
558 int ret;
559 679
560 /* Increment count for freeing args in error case */ 680 /* Increment count for freeing args in error case */
561 tp->nr_args++; 681 tp->nr_args++;
562 682
563 body = strchr(arg, '='); 683 body = strchr(arg, '=');
564 if (body) { 684 if (body) {
565 if (body - arg > MAX_ARG_NAME_LEN || body == arg) 685 if (body - arg > MAX_ARG_NAME_LEN) {
686 trace_probe_log_err(0, ARG_NAME_TOO_LONG);
687 return -EINVAL;
688 } else if (body == arg) {
689 trace_probe_log_err(0, NO_ARG_NAME);
566 return -EINVAL; 690 return -EINVAL;
691 }
567 parg->name = kmemdup_nul(arg, body - arg, GFP_KERNEL); 692 parg->name = kmemdup_nul(arg, body - arg, GFP_KERNEL);
568 body++; 693 body++;
569 } else { 694 } else {
@@ -575,22 +700,16 @@ int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, char *arg,
575 return -ENOMEM; 700 return -ENOMEM;
576 701
577 if (!is_good_name(parg->name)) { 702 if (!is_good_name(parg->name)) {
578 pr_info("Invalid argument[%d] name: %s\n", 703 trace_probe_log_err(0, BAD_ARG_NAME);
579 i, parg->name);
580 return -EINVAL; 704 return -EINVAL;
581 } 705 }
582
583 if (traceprobe_conflict_field_name(parg->name, tp->args, i)) { 706 if (traceprobe_conflict_field_name(parg->name, tp->args, i)) {
584 pr_info("Argument[%d]: '%s' conflicts with another field.\n", 707 trace_probe_log_err(0, USED_ARG_NAME);
585 i, parg->name);
586 return -EINVAL; 708 return -EINVAL;
587 } 709 }
588
589 /* Parse fetch argument */ 710 /* Parse fetch argument */
590 ret = traceprobe_parse_probe_arg_body(body, &tp->size, parg, flags); 711 return traceprobe_parse_probe_arg_body(body, &tp->size, parg, flags,
591 if (ret) 712 body - arg);
592 pr_info("Parse error at argument[%d]. (%d)\n", i, ret);
593 return ret;
594} 713}
595 714
596void traceprobe_free_probe_arg(struct probe_arg *arg) 715void traceprobe_free_probe_arg(struct probe_arg *arg)
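traceprobe_parse_probe_arg() now hands traceprobe_parse_probe_arg_body() the offset of the fetch expression within the full NAME=EXPR string (body - arg), so errors raised while parsing the expression can be positioned against the whole argument. A small userspace sketch of that offset arithmetic (the caret printing stands in for the kernel's error_log formatting and is purely illustrative):

    #include <stdio.h>
    #include <string.h>

    /* Print the argument and a caret under the failing position. */
    static void report_at(const char *arg, int offset, const char *msg)
    {
            printf("error: %s\n  %s\n  %*s^\n", msg, arg, offset, "");
    }

    int main(void)
    {
            const char *arg = "myname=+8(%di):u64";
            const char *body = strchr(arg, '=');

            /* the expression starts after '='; shift expression-relative errors by that much */
            int expr_off = body ? (int)(body - arg) + 1 : 0;

            report_at(arg, expr_off, "bad fetch expression (example)");
            return 0;
    }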
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 2177c206de15..f9a8c632188b 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -124,6 +124,7 @@ struct fetch_insn {
124 124
125/* fetch + deref*N + store + mod + end <= 16, this allows N=12, enough */ 125/* fetch + deref*N + store + mod + end <= 16, this allows N=12, enough */
126#define FETCH_INSN_MAX 16 126#define FETCH_INSN_MAX 16
127#define FETCH_TOKEN_COMM (-ECOMM)
127 128
128/* Fetch type information table */ 129/* Fetch type information table */
129struct fetch_type { 130struct fetch_type {
@@ -280,8 +281,8 @@ extern int traceprobe_update_arg(struct probe_arg *arg);
280extern void traceprobe_free_probe_arg(struct probe_arg *arg); 281extern void traceprobe_free_probe_arg(struct probe_arg *arg);
281 282
282extern int traceprobe_split_symbol_offset(char *symbol, long *offset); 283extern int traceprobe_split_symbol_offset(char *symbol, long *offset);
283extern int traceprobe_parse_event_name(const char **pevent, 284int traceprobe_parse_event_name(const char **pevent, const char **pgroup,
284 const char **pgroup, char *buf); 285 char *buf, int offset);
285 286
286extern int traceprobe_set_print_fmt(struct trace_probe *tp, bool is_return); 287extern int traceprobe_set_print_fmt(struct trace_probe *tp, bool is_return);
287 288
@@ -298,3 +299,76 @@ extern void destroy_local_trace_uprobe(struct trace_event_call *event_call);
298#endif 299#endif
299extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, 300extern int traceprobe_define_arg_fields(struct trace_event_call *event_call,
300 size_t offset, struct trace_probe *tp); 301 size_t offset, struct trace_probe *tp);
302
303#undef ERRORS
304#define ERRORS \
305 C(FILE_NOT_FOUND, "Failed to find the given file"), \
306 C(NO_REGULAR_FILE, "Not a regular file"), \
307 C(BAD_REFCNT, "Invalid reference counter offset"), \
308 C(REFCNT_OPEN_BRACE, "Reference counter brace is not closed"), \
309 C(BAD_REFCNT_SUFFIX, "Reference counter has wrong suffix"), \
310 C(BAD_UPROBE_OFFS, "Invalid uprobe offset"), \
311 C(MAXACT_NO_KPROBE, "Maxactive is not for kprobe"), \
312 C(BAD_MAXACT, "Invalid maxactive number"), \
313 C(MAXACT_TOO_BIG, "Maxactive is too big"), \
314 C(BAD_PROBE_ADDR, "Invalid probed address or symbol"), \
 315 C(BAD_RETPROBE, "Retprobe address must be a function entry"), \
316 C(NO_GROUP_NAME, "Group name is not specified"), \
317 C(GROUP_TOO_LONG, "Group name is too long"), \
318 C(BAD_GROUP_NAME, "Group name must follow the same rules as C identifiers"), \
319 C(NO_EVENT_NAME, "Event name is not specified"), \
320 C(EVENT_TOO_LONG, "Event name is too long"), \
321 C(BAD_EVENT_NAME, "Event name must follow the same rules as C identifiers"), \
322 C(RETVAL_ON_PROBE, "$retval is not available on probe"), \
323 C(BAD_STACK_NUM, "Invalid stack number"), \
324 C(BAD_ARG_NUM, "Invalid argument number"), \
 325 C(BAD_VAR, "Invalid $-variable specified"), \
326 C(BAD_REG_NAME, "Invalid register name"), \
327 C(BAD_MEM_ADDR, "Invalid memory address"), \
328 C(FILE_ON_KPROBE, "File offset is not available with kprobe"), \
329 C(BAD_FILE_OFFS, "Invalid file offset value"), \
330 C(SYM_ON_UPROBE, "Symbol is not available with uprobe"), \
 331 C(TOO_MANY_OPS, "Dereference is nested too deeply"), \
332 C(DEREF_NEED_BRACE, "Dereference needs a brace"), \
333 C(BAD_DEREF_OFFS, "Invalid dereference offset"), \
334 C(DEREF_OPEN_BRACE, "Dereference brace is not closed"), \
335 C(COMM_CANT_DEREF, "$comm can not be dereferenced"), \
336 C(BAD_FETCH_ARG, "Invalid fetch argument"), \
337 C(ARRAY_NO_CLOSE, "Array is not closed"), \
338 C(BAD_ARRAY_SUFFIX, "Array has wrong suffix"), \
339 C(BAD_ARRAY_NUM, "Invalid array size"), \
340 C(ARRAY_TOO_BIG, "Array number is too big"), \
341 C(BAD_TYPE, "Unknown type is specified"), \
342 C(BAD_STRING, "String accepts only memory argument"), \
343 C(BAD_BITFIELD, "Invalid bitfield"), \
344 C(ARG_NAME_TOO_LONG, "Argument name is too long"), \
345 C(NO_ARG_NAME, "Argument name is not specified"), \
346 C(BAD_ARG_NAME, "Argument name must follow the same rules as C identifiers"), \
347 C(USED_ARG_NAME, "This argument name is already used"), \
348 C(ARG_TOO_LONG, "Argument expression is too long"), \
349 C(NO_ARG_BODY, "No argument expression"), \
350 C(BAD_INSN_BNDRY, "Probe point is not an instruction boundary"),\
351 C(FAIL_REG_PROBE, "Failed to register probe event"),
352
353#undef C
354#define C(a, b) TP_ERR_##a
355
356/* Define TP_ERR_ */
357enum { ERRORS };
358
359/* Error text is defined in trace_probe.c */
360
361struct trace_probe_log {
362 const char *subsystem;
363 const char **argv;
364 int argc;
365 int index;
366};
367
368void trace_probe_log_init(const char *subsystem, int argc, const char **argv);
369void trace_probe_log_set_index(int index);
370void trace_probe_log_clear(void);
371void __trace_probe_log_err(int offset, int err);
372
373#define trace_probe_log_err(offs, err) \
374 __trace_probe_log_err(offs, TP_ERR_##err)
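The ERRORS list above is an X-macro: the header expands each C(a, b) entry as TP_ERR_##a to build the enum, and trace_probe.c re-expands the same list with C(a, b) defined as b to build the matching message table (hence the "Error text is defined in trace_probe.c" note). A stand-alone sketch of the pattern using a short subset of the entries:

    #include <stdio.h>

    #define ERRORS                                                  \
            C(FILE_NOT_FOUND, "Failed to find the given file"),     \
            C(BAD_REFCNT,     "Invalid reference counter offset"),  \
            C(NO_GROUP_NAME,  "Group name is not specified"),

    /* First expansion: the error codes. */
    #undef C
    #define C(a, b) TP_ERR_##a
    enum { ERRORS };

    /* Second expansion: a parallel table of error strings, indexed by the enum. */
    #undef C
    #define C(a, b) b
    static const char *err_text[] = { ERRORS };

    int main(void)
    {
            printf("%d: %s\n", TP_ERR_BAD_REFCNT, err_text[TP_ERR_BAD_REFCNT]);
            return 0;
    }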
diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h
index 4737bb8c07a3..c30c61f12ddd 100644
--- a/kernel/trace/trace_probe_tmpl.h
+++ b/kernel/trace/trace_probe_tmpl.h
@@ -88,7 +88,7 @@ stage3:
88 /* 3rd stage: store value to buffer */ 88 /* 3rd stage: store value to buffer */
89 if (unlikely(!dest)) { 89 if (unlikely(!dest)) {
90 if (code->op == FETCH_OP_ST_STRING) { 90 if (code->op == FETCH_OP_ST_STRING) {
91 ret += fetch_store_strlen(val + code->offset); 91 ret = fetch_store_strlen(val + code->offset);
92 code++; 92 code++;
93 goto array; 93 goto array;
94 } else 94 } else
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 9d402e7fc949..69ee8ef12cee 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -792,7 +792,10 @@ trace_selftest_startup_function_graph(struct tracer *trace,
792 /* check the trace buffer */ 792 /* check the trace buffer */
793 ret = trace_test_buffer(&tr->trace_buffer, &count); 793 ret = trace_test_buffer(&tr->trace_buffer, &count);
794 794
795 trace->reset(tr); 795 /* Need to also simulate the tr->reset to remove this fgraph_ops */
796 tracing_stop_cmdline_record();
797 unregister_ftrace_graph(&fgraph_ops);
798
796 tracing_start(); 799 tracing_start();
797 800
798 if (!ret && !count) { 801 if (!ret && !count) {
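Because the function_graph selftest registers its own fgraph_ops (earlier in this function, outside the quoted hunk), dropping the trace->reset(tr) call means the test has to undo that registration by hand, which is what the two added calls do. A hedged kernel-style sketch of the pairing (graph_entry_cb/graph_return_cb are illustrative callbacks, not real symbols):

    /* Teardown mirrors the test's own setup when trace->reset() is not called. */
    static struct fgraph_ops fgraph_ops = {
            .entryfunc      = graph_entry_cb,       /* illustrative */
            .retfunc        = graph_return_cb,      /* illustrative */
    };

    static int graph_selftest_body(struct trace_array *tr)
    {
            int ret = register_ftrace_graph(&fgraph_ops);

            if (ret)
                    return ret;
            /* ... run the traced workload and check the ring buffer ... */
            tracing_stop_cmdline_record();          /* what trace->reset() would have done */
            unregister_ftrace_graph(&fgraph_ops);   /* undo the registration above */
            return 0;
    }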
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index be78d99ee6bc..7860e3f59fad 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -156,7 +156,10 @@ fetch_store_string(unsigned long addr, void *dest, void *base)
156 if (unlikely(!maxlen)) 156 if (unlikely(!maxlen))
157 return -ENOMEM; 157 return -ENOMEM;
158 158
159 ret = strncpy_from_user(dst, src, maxlen); 159 if (addr == FETCH_TOKEN_COMM)
160 ret = strlcpy(dst, current->comm, maxlen);
161 else
162 ret = strncpy_from_user(dst, src, maxlen);
160 if (ret >= 0) { 163 if (ret >= 0) {
161 if (ret == maxlen) 164 if (ret == maxlen)
162 dst[ret - 1] = '\0'; 165 dst[ret - 1] = '\0';
@@ -180,7 +183,10 @@ fetch_store_strlen(unsigned long addr)
180 int len; 183 int len;
181 void __user *vaddr = (void __force __user *) addr; 184 void __user *vaddr = (void __force __user *) addr;
182 185
183 len = strnlen_user(vaddr, MAX_STRING_SIZE); 186 if (addr == FETCH_TOKEN_COMM)
187 len = strlen(current->comm) + 1;
188 else
189 len = strnlen_user(vaddr, MAX_STRING_SIZE);
184 190
185 return (len > MAX_STRING_SIZE) ? 0 : len; 191 return (len > MAX_STRING_SIZE) ? 0 : len;
186} 192}
@@ -220,6 +226,9 @@ process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest,
220 case FETCH_OP_IMM: 226 case FETCH_OP_IMM:
221 val = code->immediate; 227 val = code->immediate;
222 break; 228 break;
229 case FETCH_OP_COMM:
230 val = FETCH_TOKEN_COMM;
231 break;
223 case FETCH_OP_FOFFS: 232 case FETCH_OP_FOFFS:
224 val = translate_user_vaddr(code->immediate); 233 val = translate_user_vaddr(code->immediate);
225 break; 234 break;
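For uprobes, $comm cannot be read by dereferencing a user address, so process_fetch_insn() emits the sentinel FETCH_TOKEN_COMM (-ECOMM used as an impossible address) and the string fetchers above branch on it to copy current->comm instead of touching user memory. A stand-alone userspace sketch of the sentinel-address idea (the names and the fake "user read" are illustrative):

    #include <errno.h>
    #include <stdio.h>
    #include <string.h>

    #define TOKEN_COMM ((unsigned long)-ECOMM)      /* sentinel, never a valid address */

    static const char *task_comm = "bash";          /* stand-in for current->comm */

    /* Length that would be stored for "addr", mimicking fetch_store_strlen(). */
    static size_t probe_strlen(unsigned long addr)
    {
            if (addr == TOKEN_COMM)
                    return strlen(task_comm) + 1;   /* taken from the task, not memory */
            return strlen((const char *)addr) + 1;  /* stands in for a user-space read */
    }

    int main(void)
    {
            const char *path = "/usr/bin/example";

            printf("comm len: %zu\n", probe_strlen(TOKEN_COMM));
            printf("addr len: %zu\n", probe_strlen((unsigned long)path));
            return 0;
    }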
@@ -417,8 +426,6 @@ end:
417/* 426/*
418 * Argument syntax: 427 * Argument syntax:
419 * - Add uprobe: p|r[:[GRP/]EVENT] PATH:OFFSET [FETCHARGS] 428 * - Add uprobe: p|r[:[GRP/]EVENT] PATH:OFFSET [FETCHARGS]
420 *
421 * - Remove uprobe: -:[GRP/]EVENT
422 */ 429 */
423static int trace_uprobe_create(int argc, const char **argv) 430static int trace_uprobe_create(int argc, const char **argv)
424{ 431{
@@ -434,10 +441,17 @@ static int trace_uprobe_create(int argc, const char **argv)
434 ret = 0; 441 ret = 0;
435 ref_ctr_offset = 0; 442 ref_ctr_offset = 0;
436 443
437 /* argc must be >= 1 */ 444 switch (argv[0][0]) {
438 if (argv[0][0] == 'r') 445 case 'r':
439 is_return = true; 446 is_return = true;
440 else if (argv[0][0] != 'p' || argc < 2) 447 break;
448 case 'p':
449 break;
450 default:
451 return -ECANCELED;
452 }
453
454 if (argc < 2)
441 return -ECANCELED; 455 return -ECANCELED;
442 456
443 if (argv[0][1] == ':') 457 if (argv[0][1] == ':')
@@ -457,13 +471,19 @@ static int trace_uprobe_create(int argc, const char **argv)
457 return -ECANCELED; 471 return -ECANCELED;
458 } 472 }
459 473
474 trace_probe_log_init("trace_uprobe", argc, argv);
475 trace_probe_log_set_index(1); /* filename is the 2nd argument */
476
460 *arg++ = '\0'; 477 *arg++ = '\0';
461 ret = kern_path(filename, LOOKUP_FOLLOW, &path); 478 ret = kern_path(filename, LOOKUP_FOLLOW, &path);
462 if (ret) { 479 if (ret) {
480 trace_probe_log_err(0, FILE_NOT_FOUND);
463 kfree(filename); 481 kfree(filename);
482 trace_probe_log_clear();
464 return ret; 483 return ret;
465 } 484 }
466 if (!d_is_reg(path.dentry)) { 485 if (!d_is_reg(path.dentry)) {
486 trace_probe_log_err(0, NO_REGULAR_FILE);
467 ret = -EINVAL; 487 ret = -EINVAL;
468 goto fail_address_parse; 488 goto fail_address_parse;
469 } 489 }
@@ -472,9 +492,16 @@ static int trace_uprobe_create(int argc, const char **argv)
472 rctr = strchr(arg, '('); 492 rctr = strchr(arg, '(');
473 if (rctr) { 493 if (rctr) {
474 rctr_end = strchr(rctr, ')'); 494 rctr_end = strchr(rctr, ')');
475 if (rctr > rctr_end || *(rctr_end + 1) != 0) { 495 if (!rctr_end) {
476 ret = -EINVAL; 496 ret = -EINVAL;
477 pr_info("Invalid reference counter offset.\n"); 497 rctr_end = rctr + strlen(rctr);
498 trace_probe_log_err(rctr_end - filename,
499 REFCNT_OPEN_BRACE);
500 goto fail_address_parse;
501 } else if (rctr_end[1] != '\0') {
502 ret = -EINVAL;
503 trace_probe_log_err(rctr_end + 1 - filename,
504 BAD_REFCNT_SUFFIX);
478 goto fail_address_parse; 505 goto fail_address_parse;
479 } 506 }
480 507
@@ -482,22 +509,23 @@ static int trace_uprobe_create(int argc, const char **argv)
482 *rctr_end = '\0'; 509 *rctr_end = '\0';
483 ret = kstrtoul(rctr, 0, &ref_ctr_offset); 510 ret = kstrtoul(rctr, 0, &ref_ctr_offset);
484 if (ret) { 511 if (ret) {
485 pr_info("Invalid reference counter offset.\n"); 512 trace_probe_log_err(rctr - filename, BAD_REFCNT);
486 goto fail_address_parse; 513 goto fail_address_parse;
487 } 514 }
488 } 515 }
489 516
490 /* Parse uprobe offset. */ 517 /* Parse uprobe offset. */
491 ret = kstrtoul(arg, 0, &offset); 518 ret = kstrtoul(arg, 0, &offset);
492 if (ret) 519 if (ret) {
520 trace_probe_log_err(arg - filename, BAD_UPROBE_OFFS);
493 goto fail_address_parse; 521 goto fail_address_parse;
494 522 }
495 argc -= 2;
496 argv += 2;
497 523
498 /* setup a probe */ 524 /* setup a probe */
525 trace_probe_log_set_index(0);
499 if (event) { 526 if (event) {
500 ret = traceprobe_parse_event_name(&event, &group, buf); 527 ret = traceprobe_parse_event_name(&event, &group, buf,
528 event - argv[0]);
501 if (ret) 529 if (ret)
502 goto fail_address_parse; 530 goto fail_address_parse;
503 } else { 531 } else {
@@ -519,6 +547,9 @@ static int trace_uprobe_create(int argc, const char **argv)
519 kfree(tail); 547 kfree(tail);
520 } 548 }
521 549
550 argc -= 2;
551 argv += 2;
552
522 tu = alloc_trace_uprobe(group, event, argc, is_return); 553 tu = alloc_trace_uprobe(group, event, argc, is_return);
523 if (IS_ERR(tu)) { 554 if (IS_ERR(tu)) {
524 ret = PTR_ERR(tu); 555 ret = PTR_ERR(tu);
@@ -539,6 +570,7 @@ static int trace_uprobe_create(int argc, const char **argv)
539 goto error; 570 goto error;
540 } 571 }
541 572
573 trace_probe_log_set_index(i + 2);
542 ret = traceprobe_parse_probe_arg(&tu->tp, i, tmp, 574 ret = traceprobe_parse_probe_arg(&tu->tp, i, tmp,
543 is_return ? TPARG_FL_RETURN : 0); 575 is_return ? TPARG_FL_RETURN : 0);
544 kfree(tmp); 576 kfree(tmp);
@@ -547,20 +579,20 @@ static int trace_uprobe_create(int argc, const char **argv)
547 } 579 }
548 580
549 ret = register_trace_uprobe(tu); 581 ret = register_trace_uprobe(tu);
550 if (ret) 582 if (!ret)
551 goto error; 583 goto out;
552 return 0;
553 584
554error: 585error:
555 free_trace_uprobe(tu); 586 free_trace_uprobe(tu);
587out:
588 trace_probe_log_clear();
556 return ret; 589 return ret;
557 590
558fail_address_parse: 591fail_address_parse:
592 trace_probe_log_clear();
559 path_put(&path); 593 path_put(&path);
560 kfree(filename); 594 kfree(filename);
561 595
562 pr_info("Failed to parse address or file.\n");
563
564 return ret; 596 return ret;
565} 597}
566 598
@@ -1304,7 +1336,7 @@ static inline void init_trace_event_call(struct trace_uprobe *tu,
1304 call->event.funcs = &uprobe_funcs; 1336 call->event.funcs = &uprobe_funcs;
1305 call->class->define_fields = uprobe_event_define_fields; 1337 call->class->define_fields = uprobe_event_define_fields;
1306 1338
1307 call->flags = TRACE_EVENT_FL_UPROBE; 1339 call->flags = TRACE_EVENT_FL_UPROBE | TRACE_EVENT_FL_CAP_ANY;
1308 call->class->reg = trace_uprobe_register; 1340 call->class->reg = trace_uprobe_register;
1309 call->data = tu; 1341 call->data = tu;
1310} 1342}
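Taken together, the trace_uprobe_create() changes follow the calling convention declared in trace_probe.h: trace_probe_log_init() records the argv being parsed, trace_probe_log_set_index() points subsequent errors at the argument currently being handled, trace_probe_log_err() reports a position plus an ERRORS entry, and trace_probe_log_clear() runs on every exit path. A condensed kernel-style sketch of that lifecycle (parse_target()/parse_arg() are hypothetical helpers, not real functions):

    static int example_probe_create(int argc, const char **argv)
    {
            int i, ret;

            trace_probe_log_init("trace_uprobe", argc, argv);

            trace_probe_log_set_index(1);           /* errors now point into argv[1] */
            ret = parse_target(argv[1]);            /* hypothetical */
            if (ret) {
                    trace_probe_log_err(0, FILE_NOT_FOUND);
                    goto out;
            }

            for (i = 2; i < argc; i++) {
                    trace_probe_log_set_index(i);   /* per-argument error position */
                    ret = parse_arg(argv[i]);       /* hypothetical */
                    if (ret)
                            goto out;
            }
    out:
            trace_probe_log_clear();                /* always paired with _init() */
            return ret;
    }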
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 46f2ab1e08a9..df3ade14ccbd 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -1,19 +1,6 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
1/* 2/*
2 * Copyright (C) 2008-2014 Mathieu Desnoyers 3 * Copyright (C) 2008-2014 Mathieu Desnoyers
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 */ 4 */
18#include <linux/module.h> 5#include <linux/module.h>
19#include <linux/mutex.h> 6#include <linux/mutex.h>
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 370724b45391..7be3e7530841 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -1,19 +1,8 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
1/* 2/*
2 * tsacct.c - System accounting over taskstats interface 3 * tsacct.c - System accounting over taskstats interface
3 * 4 *
4 * Copyright (C) Jay Lan, <jlan@sgi.com> 5 * Copyright (C) Jay Lan, <jlan@sgi.com>
5 *
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 */ 6 */
18 7
19#include <linux/kernel.h> 8#include <linux/kernel.h>
diff --git a/kernel/ucount.c b/kernel/ucount.c
index f48d1b6376a4..feb128c7b5d9 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -1,9 +1,4 @@
1/* 1// SPDX-License-Identifier: GPL-2.0-only
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License as
4 * published by the Free Software Foundation, version 2 of the
5 * License.
6 */
7 2
8#include <linux/stat.h> 3#include <linux/stat.h>
9#include <linux/sysctl.h> 4#include <linux/sysctl.h>
diff --git a/kernel/umh.c b/kernel/umh.c
index d937cbad903a..7f255b5a8845 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * umh - the kernel usermode helper 3 * umh - the kernel usermode helper
3 */ 4 */
diff --git a/kernel/up.c b/kernel/up.c
index ff536f9cc8a2..862b460ab97a 100644
--- a/kernel/up.c
+++ b/kernel/up.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * Uniprocessor-only support functions. The counterpart to kernel/smp.c 3 * Uniprocessor-only support functions. The counterpart to kernel/smp.c
3 */ 4 */
@@ -34,14 +35,13 @@ int smp_call_function_single_async(int cpu, call_single_data_t *csd)
34} 35}
35EXPORT_SYMBOL(smp_call_function_single_async); 36EXPORT_SYMBOL(smp_call_function_single_async);
36 37
37int on_each_cpu(smp_call_func_t func, void *info, int wait) 38void on_each_cpu(smp_call_func_t func, void *info, int wait)
38{ 39{
39 unsigned long flags; 40 unsigned long flags;
40 41
41 local_irq_save(flags); 42 local_irq_save(flags);
42 func(info); 43 func(info);
43 local_irq_restore(flags); 44 local_irq_restore(flags);
44 return 0;
45} 45}
46EXPORT_SYMBOL(on_each_cpu); 46EXPORT_SYMBOL(on_each_cpu);
47 47
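on_each_cpu() now returns void here, since the UP implementation could only ever return 0; any caller that still checks the result is dead code and needs updating. A hedged kernel-style sketch of an adjusted caller (flush_local_cache() is illustrative, not a real symbol):

    static void flush_local_cache(void *info)
    {
            /* per-CPU work; on_each_cpu() handles IRQ save/restore on UP */
    }

    static void flush_all_caches(void)
    {
            /*
             * Before: ret = on_each_cpu(...) plus an error check that could
             * never fire.  After the change, simply call it:
             */
            on_each_cpu(flush_local_cache, NULL, 1);        /* wait for completion */
    }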
diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c
index 9586b670a5b2..870ecd7c63ed 100644
--- a/kernel/user-return-notifier.c
+++ b/kernel/user-return-notifier.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1 2
2#include <linux/user-return-notifier.h> 3#include <linux/user-return-notifier.h>
3#include <linux/percpu.h> 4#include <linux/percpu.h>
diff --git a/kernel/user.c b/kernel/user.c
index 0df9b1640b2a..5235d7f49982 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * The "user cache". 3 * The "user cache".
3 * 4 *
@@ -62,9 +63,9 @@ struct user_namespace init_user_ns = {
62 .ns.ops = &userns_operations, 63 .ns.ops = &userns_operations,
63#endif 64#endif
64 .flags = USERNS_INIT_FLAGS, 65 .flags = USERNS_INIT_FLAGS,
65#ifdef CONFIG_PERSISTENT_KEYRINGS 66#ifdef CONFIG_KEYS
66 .persistent_keyring_register_sem = 67 .keyring_name_list = LIST_HEAD_INIT(init_user_ns.keyring_name_list),
67 __RWSEM_INITIALIZER(init_user_ns.persistent_keyring_register_sem), 68 .keyring_sem = __RWSEM_INITIALIZER(init_user_ns.keyring_sem),
68#endif 69#endif
69}; 70};
70EXPORT_SYMBOL_GPL(init_user_ns); 71EXPORT_SYMBOL_GPL(init_user_ns);
@@ -140,8 +141,6 @@ static void free_user(struct user_struct *up, unsigned long flags)
140{ 141{
141 uid_hash_remove(up); 142 uid_hash_remove(up);
142 spin_unlock_irqrestore(&uidhash_lock, flags); 143 spin_unlock_irqrestore(&uidhash_lock, flags);
143 key_put(up->uid_keyring);
144 key_put(up->session_keyring);
145 kmem_cache_free(uid_cachep, up); 144 kmem_cache_free(uid_cachep, up);
146} 145}
147 146
@@ -185,7 +184,7 @@ struct user_struct *alloc_uid(kuid_t uid)
185 if (!up) { 184 if (!up) {
186 new = kmem_cache_zalloc(uid_cachep, GFP_KERNEL); 185 new = kmem_cache_zalloc(uid_cachep, GFP_KERNEL);
187 if (!new) 186 if (!new)
188 goto out_unlock; 187 return NULL;
189 188
190 new->uid = uid; 189 new->uid = uid;
191 refcount_set(&new->__count, 1); 190 refcount_set(&new->__count, 1);
@@ -199,8 +198,6 @@ struct user_struct *alloc_uid(kuid_t uid)
199 spin_lock_irq(&uidhash_lock); 198 spin_lock_irq(&uidhash_lock);
200 up = uid_hash_find(uid, hashent); 199 up = uid_hash_find(uid, hashent);
201 if (up) { 200 if (up) {
202 key_put(new->uid_keyring);
203 key_put(new->session_keyring);
204 kmem_cache_free(uid_cachep, new); 201 kmem_cache_free(uid_cachep, new);
205 } else { 202 } else {
206 uid_hash_insert(new, hashent); 203 uid_hash_insert(new, hashent);
@@ -210,9 +207,6 @@ struct user_struct *alloc_uid(kuid_t uid)
210 } 207 }
211 208
212 return up; 209 return up;
213
214out_unlock:
215 return NULL;
216} 210}
217 211
218static int __init uid_cache_init(void) 212static int __init uid_cache_init(void)
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 923414a246e9..8eadadc478f9 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -1,9 +1,4 @@
1/* 1// SPDX-License-Identifier: GPL-2.0-only
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License as
4 * published by the Free Software Foundation, version 2 of the
5 * License.
6 */
7 2
8#include <linux/export.h> 3#include <linux/export.h>
9#include <linux/nsproxy.h> 4#include <linux/nsproxy.h>
@@ -133,8 +128,9 @@ int create_user_ns(struct cred *new)
133 ns->flags = parent_ns->flags; 128 ns->flags = parent_ns->flags;
134 mutex_unlock(&userns_state_mutex); 129 mutex_unlock(&userns_state_mutex);
135 130
136#ifdef CONFIG_PERSISTENT_KEYRINGS 131#ifdef CONFIG_KEYS
137 init_rwsem(&ns->persistent_keyring_register_sem); 132 INIT_LIST_HEAD(&ns->keyring_name_list);
133 init_rwsem(&ns->keyring_sem);
138#endif 134#endif
139 ret = -ENOMEM; 135 ret = -ENOMEM;
140 if (!setup_userns_sysctls(ns)) 136 if (!setup_userns_sysctls(ns))
@@ -196,9 +192,7 @@ static void free_user_ns(struct work_struct *work)
196 kfree(ns->projid_map.reverse); 192 kfree(ns->projid_map.reverse);
197 } 193 }
198 retire_userns_sysctls(ns); 194 retire_userns_sysctls(ns);
199#ifdef CONFIG_PERSISTENT_KEYRINGS 195 key_free_user_ns(ns);
200 key_put(ns->persistent_keyring_register);
201#endif
202 ns_free_inum(&ns->ns); 196 ns_free_inum(&ns->ns);
203 kmem_cache_free(user_ns_cachep, ns); 197 kmem_cache_free(user_ns_cachep, ns);
204 dec_user_namespaces(ucounts); 198 dec_user_namespaces(ucounts);
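Both user.c and user_namespace.c replace the CONFIG_PERSISTENT_KEYRINGS-only fields with a CONFIG_KEYS-guarded keyring name list plus rwsem: init_user_ns initializes them statically, create_user_ns() does the same at runtime, and teardown is delegated to key_free_user_ns(). A hedged sketch of that static/runtime initializer pairing (struct my_ns is illustrative; the macros are the ones used in the hunks above):

    #include <linux/list.h>
    #include <linux/rwsem.h>

    struct my_ns {
            struct list_head        keyring_name_list;
            struct rw_semaphore     keyring_sem;
    };

    /* Build-time initialization, as done for init_user_ns. */
    static struct my_ns init_my_ns = {
            .keyring_name_list = LIST_HEAD_INIT(init_my_ns.keyring_name_list),
            .keyring_sem       = __RWSEM_INITIALIZER(init_my_ns.keyring_sem),
    };

    /* Run-time initialization, as done in create_user_ns(). */
    static void my_ns_init_keyrings(struct my_ns *ns)
    {
            INIT_LIST_HEAD(&ns->keyring_name_list);
            init_rwsem(&ns->keyring_sem);
    }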
diff --git a/kernel/utsname.c b/kernel/utsname.c
index dcd6be1996fe..f0e491193009 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -1,12 +1,8 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * Copyright (C) 2004 IBM Corporation 3 * Copyright (C) 2004 IBM Corporation
3 * 4 *
4 * Author: Serge Hallyn <serue@us.ibm.com> 5 * Author: Serge Hallyn <serue@us.ibm.com>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation, version 2 of the
9 * License.
10 */ 6 */
11 7
12#include <linux/export.h> 8#include <linux/export.h>
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index 258033d62cb3..3732c888a949 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -1,12 +1,8 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * Copyright (C) 2007 3 * Copyright (C) 2007
3 * 4 *
4 * Author: Eric Biederman <ebiederm@xmision.com> 5 * Author: Eric Biederman <ebiederm@xmision.com>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation, version 2 of the
9 * License.
10 */ 6 */
11 7
12#include <linux/export.h> 8#include <linux/export.h>
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 9657315405de..601d61150b65 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-only
1/* 2/*
2 * kernel/workqueue.c - generic async execution with shared worker pool 3 * kernel/workqueue.c - generic async execution with shared worker pool
3 * 4 *
@@ -3328,7 +3329,7 @@ EXPORT_SYMBOL_GPL(execute_in_process_context);
3328 * 3329 *
3329 * Undo alloc_workqueue_attrs(). 3330 * Undo alloc_workqueue_attrs().
3330 */ 3331 */
3331void free_workqueue_attrs(struct workqueue_attrs *attrs) 3332static void free_workqueue_attrs(struct workqueue_attrs *attrs)
3332{ 3333{
3333 if (attrs) { 3334 if (attrs) {
3334 free_cpumask_var(attrs->cpumask); 3335 free_cpumask_var(attrs->cpumask);
@@ -3338,21 +3339,20 @@ void free_workqueue_attrs(struct workqueue_attrs *attrs)
3338 3339
3339/** 3340/**
3340 * alloc_workqueue_attrs - allocate a workqueue_attrs 3341 * alloc_workqueue_attrs - allocate a workqueue_attrs
3341 * @gfp_mask: allocation mask to use
3342 * 3342 *
3343 * Allocate a new workqueue_attrs, initialize with default settings and 3343 * Allocate a new workqueue_attrs, initialize with default settings and
3344 * return it. 3344 * return it.
3345 * 3345 *
3346 * Return: The allocated new workqueue_attr on success. %NULL on failure. 3346 * Return: The allocated new workqueue_attr on success. %NULL on failure.
3347 */ 3347 */
3348struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask) 3348static struct workqueue_attrs *alloc_workqueue_attrs(void)
3349{ 3349{
3350 struct workqueue_attrs *attrs; 3350 struct workqueue_attrs *attrs;
3351 3351
3352 attrs = kzalloc(sizeof(*attrs), gfp_mask); 3352 attrs = kzalloc(sizeof(*attrs), GFP_KERNEL);
3353 if (!attrs) 3353 if (!attrs)
3354 goto fail; 3354 goto fail;
3355 if (!alloc_cpumask_var(&attrs->cpumask, gfp_mask)) 3355 if (!alloc_cpumask_var(&attrs->cpumask, GFP_KERNEL))
3356 goto fail; 3356 goto fail;
3357 3357
3358 cpumask_copy(attrs->cpumask, cpu_possible_mask); 3358 cpumask_copy(attrs->cpumask, cpu_possible_mask);
@@ -3430,7 +3430,7 @@ static int init_worker_pool(struct worker_pool *pool)
3430 pool->refcnt = 1; 3430 pool->refcnt = 1;
3431 3431
3432 /* shouldn't fail above this point */ 3432 /* shouldn't fail above this point */
3433 pool->attrs = alloc_workqueue_attrs(GFP_KERNEL); 3433 pool->attrs = alloc_workqueue_attrs();
3434 if (!pool->attrs) 3434 if (!pool->attrs)
3435 return -ENOMEM; 3435 return -ENOMEM;
3436 return 0; 3436 return 0;
@@ -3895,8 +3895,8 @@ apply_wqattrs_prepare(struct workqueue_struct *wq,
3895 3895
3896 ctx = kzalloc(struct_size(ctx, pwq_tbl, nr_node_ids), GFP_KERNEL); 3896 ctx = kzalloc(struct_size(ctx, pwq_tbl, nr_node_ids), GFP_KERNEL);
3897 3897
3898 new_attrs = alloc_workqueue_attrs(GFP_KERNEL); 3898 new_attrs = alloc_workqueue_attrs();
3899 tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL); 3899 tmp_attrs = alloc_workqueue_attrs();
3900 if (!ctx || !new_attrs || !tmp_attrs) 3900 if (!ctx || !new_attrs || !tmp_attrs)
3901 goto out_free; 3901 goto out_free;
3902 3902
@@ -4032,7 +4032,7 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
4032 * 4032 *
4033 * Return: 0 on success and -errno on failure. 4033 * Return: 0 on success and -errno on failure.
4034 */ 4034 */
4035int apply_workqueue_attrs(struct workqueue_struct *wq, 4035static int apply_workqueue_attrs(struct workqueue_struct *wq,
4036 const struct workqueue_attrs *attrs) 4036 const struct workqueue_attrs *attrs)
4037{ 4037{
4038 int ret; 4038 int ret;
@@ -4043,7 +4043,6 @@ int apply_workqueue_attrs(struct workqueue_struct *wq,
4043 4043
4044 return ret; 4044 return ret;
4045} 4045}
4046EXPORT_SYMBOL_GPL(apply_workqueue_attrs);
4047 4046
4048/** 4047/**
4049 * wq_update_unbound_numa - update NUMA affinity of a wq for CPU hot[un]plug 4048 * wq_update_unbound_numa - update NUMA affinity of a wq for CPU hot[un]plug
@@ -4241,7 +4240,7 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
4241 return NULL; 4240 return NULL;
4242 4241
4243 if (flags & WQ_UNBOUND) { 4242 if (flags & WQ_UNBOUND) {
4244 wq->unbound_attrs = alloc_workqueue_attrs(GFP_KERNEL); 4243 wq->unbound_attrs = alloc_workqueue_attrs();
4245 if (!wq->unbound_attrs) 4244 if (!wq->unbound_attrs)
4246 goto err_free_wq; 4245 goto err_free_wq;
4247 } 4246 }
@@ -5394,7 +5393,7 @@ static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq)
5394 5393
5395 lockdep_assert_held(&wq_pool_mutex); 5394 lockdep_assert_held(&wq_pool_mutex);
5396 5395
5397 attrs = alloc_workqueue_attrs(GFP_KERNEL); 5396 attrs = alloc_workqueue_attrs();
5398 if (!attrs) 5397 if (!attrs)
5399 return NULL; 5398 return NULL;
5400 5399
@@ -5816,7 +5815,7 @@ static void __init wq_numa_init(void)
5816 return; 5815 return;
5817 } 5816 }
5818 5817
5819 wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs(GFP_KERNEL); 5818 wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs();
5820 BUG_ON(!wq_update_unbound_numa_attrs_buf); 5819 BUG_ON(!wq_update_unbound_numa_attrs_buf);
5821 5820
5822 /* 5821 /*
@@ -5891,7 +5890,7 @@ int __init workqueue_init_early(void)
5891 for (i = 0; i < NR_STD_WORKER_POOLS; i++) { 5890 for (i = 0; i < NR_STD_WORKER_POOLS; i++) {
5892 struct workqueue_attrs *attrs; 5891 struct workqueue_attrs *attrs;
5893 5892
5894 BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL))); 5893 BUG_ON(!(attrs = alloc_workqueue_attrs()));
5895 attrs->nice = std_nice[i]; 5894 attrs->nice = std_nice[i];
5896 unbound_std_wq_attrs[i] = attrs; 5895 unbound_std_wq_attrs[i] = attrs;
5897 5896
@@ -5900,7 +5899,7 @@ int __init workqueue_init_early(void)
5900 * guaranteed by max_active which is enforced by pwqs. 5899 * guaranteed by max_active which is enforced by pwqs.
5901 * Turn off NUMA so that dfl_pwq is used for all nodes. 5900 * Turn off NUMA so that dfl_pwq is used for all nodes.
5902 */ 5901 */
5903 BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL))); 5902 BUG_ON(!(attrs = alloc_workqueue_attrs()));
5904 attrs->nice = std_nice[i]; 5903 attrs->nice = std_nice[i];
5905 attrs->no_numa = true; 5904 attrs->no_numa = true;
5906 ordered_wq_attrs[i] = attrs; 5905 ordered_wq_attrs[i] = attrs;
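alloc_workqueue_attrs() and free_workqueue_attrs() lose their gfp_mask parameter (every caller passed GFP_KERNEL), and together with apply_workqueue_attrs() they become static, so the attrs machinery is now internal to kernel/workqueue.c. A hedged sketch of the internal call pattern after this change (the wrapper function is illustrative):

    /* Only valid inside kernel/workqueue.c now that the helpers are static. */
    static int example_apply_unbound_nice(struct workqueue_struct *wq, int nice)
    {
            struct workqueue_attrs *attrs;
            int ret;

            attrs = alloc_workqueue_attrs();        /* GFP_KERNEL is implied now */
            if (!attrs)
                    return -ENOMEM;

            attrs->nice = nice;
            ret = apply_workqueue_attrs(wq, attrs); /* no longer exported to modules */

            free_workqueue_attrs(attrs);
            return ret;
    }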