aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/.gitignore1
-rw-r--r--kernel/Kconfig.locks9
-rw-r--r--kernel/Makefile11
-rw-r--r--kernel/acct.c4
-rw-r--r--kernel/async.c4
-rw-r--r--kernel/backtracetest.c11
-rw-r--r--kernel/bpf/core.c1
-rw-r--r--kernel/bpf/cpumap.c13
-rw-r--r--kernel/bpf/inode.c26
-rw-r--r--kernel/bpf/syscall.c22
-rw-r--r--kernel/bpf/verifier.c235
-rw-r--r--kernel/cgroup/cpuset.c11
-rw-r--r--kernel/cpu.c111
-rw-r--r--kernel/dma/debug.c14
-rw-r--r--kernel/dma/swiotlb.c6
-rw-r--r--kernel/events/core.c103
-rw-r--r--kernel/events/ring_buffer.c40
-rw-r--r--kernel/events/uprobes.c10
-rw-r--r--kernel/fail_function.c2
-rw-r--r--kernel/fork.c132
-rw-r--r--kernel/futex.c192
-rwxr-xr-xkernel/gen_ikh_data.sh89
-rw-r--r--kernel/iomem.c4
-rw-r--r--kernel/irq/chip.c4
-rw-r--r--kernel/irq/debugfs.c2
-rw-r--r--kernel/irq/devres.c5
-rw-r--r--kernel/irq/handle.c2
-rw-r--r--kernel/irq/irqdesc.c4
-rw-r--r--kernel/irq/manage.c7
-rw-r--r--kernel/irq/spurious.c4
-rw-r--r--kernel/irq/timings.c522
-rw-r--r--kernel/irq_work.c75
-rw-r--r--kernel/jump_label.c63
-rw-r--r--kernel/kexec_core.c4
-rw-r--r--kernel/kexec_file.c1
-rw-r--r--kernel/kheaders.c74
-rw-r--r--kernel/kprobes.c6
-rw-r--r--kernel/latencytop.c29
-rw-r--r--kernel/livepatch/core.c94
-rw-r--r--kernel/livepatch/transition.c22
-rw-r--r--kernel/locking/Makefile5
-rw-r--r--kernel/locking/lock_events.c179
-rw-r--r--kernel/locking/lock_events.h59
-rw-r--r--kernel/locking/lock_events_list.h67
-rw-r--r--kernel/locking/lockdep.c377
-rw-r--r--kernel/locking/lockdep_internals.h34
-rw-r--r--kernel/locking/locktorture.c2
-rw-r--r--kernel/locking/percpu-rwsem.c2
-rw-r--r--kernel/locking/qspinlock.c8
-rw-r--r--kernel/locking/qspinlock_paravirt.h19
-rw-r--r--kernel/locking/qspinlock_stat.h242
-rw-r--r--kernel/locking/rwsem-spinlock.c339
-rw-r--r--kernel/locking/rwsem-xadd.c204
-rw-r--r--kernel/locking/rwsem.c25
-rw-r--r--kernel/locking/rwsem.h174
-rw-r--r--kernel/locking/spinlock.c7
-rw-r--r--kernel/locking/spinlock_debug.c6
-rw-r--r--kernel/module.c82
-rw-r--r--kernel/padata.c3
-rw-r--r--kernel/panic.c7
-rw-r--r--kernel/power/Kconfig9
-rw-r--r--kernel/power/hibernate.c17
-rw-r--r--kernel/power/main.c14
-rw-r--r--kernel/power/snapshot.c5
-rw-r--r--kernel/power/suspend.c17
-rw-r--r--kernel/power/user.c5
-rw-r--r--kernel/ptrace.c15
-rw-r--r--kernel/rcu/rcu.h1
-rw-r--r--kernel/rcu/rcuperf.c5
-rw-r--r--kernel/rcu/rcutorture.c21
-rw-r--r--kernel/rcu/srcutiny.c9
-rw-r--r--kernel/rcu/srcutree.c32
-rw-r--r--kernel/rcu/tiny.c2
-rw-r--r--kernel/rcu/tree.c510
-rw-r--r--kernel/rcu/tree.h14
-rw-r--r--kernel/rcu/tree_exp.h36
-rw-r--r--kernel/rcu/tree_plugin.h257
-rw-r--r--kernel/rcu/tree_stall.h709
-rw-r--r--kernel/rcu/update.c59
-rw-r--r--kernel/resource.c11
-rw-r--r--kernel/rseq.c9
-rw-r--r--kernel/sched/core.c130
-rw-r--r--kernel/sched/cpufreq.c2
-rw-r--r--kernel/sched/cpufreq_schedutil.c72
-rw-r--r--kernel/sched/deadline.c3
-rw-r--r--kernel/sched/debug.c2
-rw-r--r--kernel/sched/fair.c144
-rw-r--r--kernel/sched/isolation.c18
-rw-r--r--kernel/sched/rt.c5
-rw-r--r--kernel/sched/sched.h18
-rw-r--r--kernel/sched/topology.c31
-rw-r--r--kernel/seccomp.c21
-rw-r--r--kernel/signal.c27
-rw-r--r--kernel/softirq.c51
-rw-r--r--kernel/stacktrace.c333
-rw-r--r--kernel/stop_machine.c2
-rw-r--r--kernel/sys_ni.c3
-rw-r--r--kernel/sysctl.c3
-rw-r--r--kernel/time/alarmtimer.c2
-rw-r--r--kernel/time/clockevents.c18
-rw-r--r--kernel/time/jiffies.c4
-rw-r--r--kernel/time/sched_clock.c10
-rw-r--r--kernel/time/tick-broadcast.c48
-rw-r--r--kernel/time/tick-common.c54
-rw-r--r--kernel/time/tick-internal.h10
-rw-r--r--kernel/time/tick-sched.c49
-rw-r--r--kernel/time/tick-sched.h13
-rw-r--r--kernel/time/time.c2
-rw-r--r--kernel/time/timekeeping.c24
-rw-r--r--kernel/time/timekeeping.h7
-rw-r--r--kernel/time/timer.c32
-rw-r--r--kernel/torture.c2
-rw-r--r--kernel/trace/bpf_trace.c8
-rw-r--r--kernel/trace/ftrace.c18
-rw-r--r--kernel/trace/ring_buffer.c2
-rw-r--r--kernel/trace/trace.c144
-rw-r--r--kernel/trace/trace.h8
-rw-r--r--kernel/trace/trace_branch.c4
-rw-r--r--kernel/trace/trace_dynevent.c2
-rw-r--r--kernel/trace/trace_events_hist.c15
-rw-r--r--kernel/trace/trace_stack.c85
-rw-r--r--kernel/trace/trace_syscalls.c9
-rw-r--r--kernel/watchdog.c12
-rw-r--r--kernel/watchdog_hld.c3
-rw-r--r--kernel/workqueue.c71
-rw-r--r--kernel/workqueue_internal.h5
126 files changed, 4138 insertions, 2955 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore
index 6e699100872f..34d1e77ee9df 100644
--- a/kernel/.gitignore
+++ b/kernel/.gitignore
@@ -1,5 +1,6 @@
1# 1#
2# Generated files 2# Generated files
3# 3#
4kheaders.md5
4timeconst.h 5timeconst.h
5hz.bc 6hz.bc
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index fbba478ae522..bf770d7556f7 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -229,7 +229,7 @@ config MUTEX_SPIN_ON_OWNER
229 229
230config RWSEM_SPIN_ON_OWNER 230config RWSEM_SPIN_ON_OWNER
231 def_bool y 231 def_bool y
232 depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW 232 depends on SMP && ARCH_SUPPORTS_ATOMIC_RMW
233 233
234config LOCK_SPIN_ON_OWNER 234config LOCK_SPIN_ON_OWNER
235 def_bool y 235 def_bool y
@@ -251,3 +251,10 @@ config ARCH_USE_QUEUED_RWLOCKS
251config QUEUED_RWLOCKS 251config QUEUED_RWLOCKS
252 def_bool y if ARCH_USE_QUEUED_RWLOCKS 252 def_bool y if ARCH_USE_QUEUED_RWLOCKS
253 depends on SMP 253 depends on SMP
254
255config ARCH_HAS_MMIOWB
256 bool
257
258config MMIOWB
259 def_bool y if ARCH_HAS_MMIOWB
260 depends on SMP
diff --git a/kernel/Makefile b/kernel/Makefile
index 6c57e78817da..298437bb2c6a 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -30,6 +30,7 @@ KCOV_INSTRUMENT_extable.o := n
30# Don't self-instrument. 30# Don't self-instrument.
31KCOV_INSTRUMENT_kcov.o := n 31KCOV_INSTRUMENT_kcov.o := n
32KASAN_SANITIZE_kcov.o := n 32KASAN_SANITIZE_kcov.o := n
33CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
33 34
34# cond_syscall is currently not LTO compatible 35# cond_syscall is currently not LTO compatible
35CFLAGS_sys_ni.o = $(DISABLE_LTO) 36CFLAGS_sys_ni.o = $(DISABLE_LTO)
@@ -70,6 +71,7 @@ obj-$(CONFIG_UTS_NS) += utsname.o
70obj-$(CONFIG_USER_NS) += user_namespace.o 71obj-$(CONFIG_USER_NS) += user_namespace.o
71obj-$(CONFIG_PID_NS) += pid_namespace.o 72obj-$(CONFIG_PID_NS) += pid_namespace.o
72obj-$(CONFIG_IKCONFIG) += configs.o 73obj-$(CONFIG_IKCONFIG) += configs.o
74obj-$(CONFIG_IKHEADERS_PROC) += kheaders.o
73obj-$(CONFIG_SMP) += stop_machine.o 75obj-$(CONFIG_SMP) += stop_machine.o
74obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o 76obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
75obj-$(CONFIG_AUDIT) += audit.o auditfilter.o 77obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
@@ -121,3 +123,12 @@ $(obj)/configs.o: $(obj)/config_data.gz
121targets += config_data.gz 123targets += config_data.gz
122$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE 124$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
123 $(call if_changed,gzip) 125 $(call if_changed,gzip)
126
127$(obj)/kheaders.o: $(obj)/kheaders_data.tar.xz
128
129quiet_cmd_genikh = CHK $(obj)/kheaders_data.tar.xz
130cmd_genikh = $(srctree)/kernel/gen_ikh_data.sh $@
131$(obj)/kheaders_data.tar.xz: FORCE
132 $(call cmd,genikh)
133
134clean-files := kheaders_data.tar.xz kheaders.md5
diff --git a/kernel/acct.c b/kernel/acct.c
index addf7732fb56..81f9831a7859 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -227,7 +227,7 @@ static int acct_on(struct filename *pathname)
227 filp_close(file, NULL); 227 filp_close(file, NULL);
228 return PTR_ERR(internal); 228 return PTR_ERR(internal);
229 } 229 }
230 err = mnt_want_write(internal); 230 err = __mnt_want_write(internal);
231 if (err) { 231 if (err) {
232 mntput(internal); 232 mntput(internal);
233 kfree(acct); 233 kfree(acct);
@@ -252,7 +252,7 @@ static int acct_on(struct filename *pathname)
252 old = xchg(&ns->bacct, &acct->pin); 252 old = xchg(&ns->bacct, &acct->pin);
253 mutex_unlock(&acct->lock); 253 mutex_unlock(&acct->lock);
254 pin_kill(old); 254 pin_kill(old);
255 mnt_drop_write(mnt); 255 __mnt_drop_write(mnt);
256 mntput(mnt); 256 mntput(mnt);
257 return 0; 257 return 0;
258} 258}
diff --git a/kernel/async.c b/kernel/async.c
index f6bd0d9885e1..12c332e4e13e 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -119,7 +119,7 @@ static void async_run_entry_fn(struct work_struct *work)
119 119
120 /* 1) run (and print duration) */ 120 /* 1) run (and print duration) */
121 if (initcall_debug && system_state < SYSTEM_RUNNING) { 121 if (initcall_debug && system_state < SYSTEM_RUNNING) {
122 pr_debug("calling %lli_%pF @ %i\n", 122 pr_debug("calling %lli_%pS @ %i\n",
123 (long long)entry->cookie, 123 (long long)entry->cookie,
124 entry->func, task_pid_nr(current)); 124 entry->func, task_pid_nr(current));
125 calltime = ktime_get(); 125 calltime = ktime_get();
@@ -128,7 +128,7 @@ static void async_run_entry_fn(struct work_struct *work)
128 if (initcall_debug && system_state < SYSTEM_RUNNING) { 128 if (initcall_debug && system_state < SYSTEM_RUNNING) {
129 rettime = ktime_get(); 129 rettime = ktime_get();
130 delta = ktime_sub(rettime, calltime); 130 delta = ktime_sub(rettime, calltime);
131 pr_debug("initcall %lli_%pF returned 0 after %lld usecs\n", 131 pr_debug("initcall %lli_%pS returned 0 after %lld usecs\n",
132 (long long)entry->cookie, 132 (long long)entry->cookie,
133 entry->func, 133 entry->func,
134 (long long)ktime_to_ns(delta) >> 10); 134 (long long)ktime_to_ns(delta) >> 10);
diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c
index 1323360d90e3..a563c8fdad0d 100644
--- a/kernel/backtracetest.c
+++ b/kernel/backtracetest.c
@@ -48,19 +48,14 @@ static void backtrace_test_irq(void)
48#ifdef CONFIG_STACKTRACE 48#ifdef CONFIG_STACKTRACE
49static void backtrace_test_saved(void) 49static void backtrace_test_saved(void)
50{ 50{
51 struct stack_trace trace;
52 unsigned long entries[8]; 51 unsigned long entries[8];
52 unsigned int nr_entries;
53 53
54 pr_info("Testing a saved backtrace.\n"); 54 pr_info("Testing a saved backtrace.\n");
55 pr_info("The following trace is a kernel self test and not a bug!\n"); 55 pr_info("The following trace is a kernel self test and not a bug!\n");
56 56
57 trace.nr_entries = 0; 57 nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
58 trace.max_entries = ARRAY_SIZE(entries); 58 stack_trace_print(entries, nr_entries, 0);
59 trace.entries = entries;
60 trace.skip = 0;
61
62 save_stack_trace(&trace);
63 print_stack_trace(&trace, 0);
64} 59}
65#else 60#else
66static void backtrace_test_saved(void) 61static void backtrace_test_saved(void)
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index ff09d32a8a1b..c605397c79f0 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -848,7 +848,6 @@ void __weak bpf_jit_free(struct bpf_prog *fp)
848 if (fp->jited) { 848 if (fp->jited) {
849 struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp); 849 struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp);
850 850
851 bpf_jit_binary_unlock_ro(hdr);
852 bpf_jit_binary_free(hdr); 851 bpf_jit_binary_free(hdr);
853 852
854 WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp)); 853 WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp));
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 8974b3755670..3c18260403dd 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -162,10 +162,14 @@ static void cpu_map_kthread_stop(struct work_struct *work)
162static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, 162static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
163 struct xdp_frame *xdpf) 163 struct xdp_frame *xdpf)
164{ 164{
165 unsigned int hard_start_headroom;
165 unsigned int frame_size; 166 unsigned int frame_size;
166 void *pkt_data_start; 167 void *pkt_data_start;
167 struct sk_buff *skb; 168 struct sk_buff *skb;
168 169
170 /* Part of headroom was reserved to xdpf */
171 hard_start_headroom = sizeof(struct xdp_frame) + xdpf->headroom;
172
169 /* build_skb need to place skb_shared_info after SKB end, and 173 /* build_skb need to place skb_shared_info after SKB end, and
170 * also want to know the memory "truesize". Thus, need to 174 * also want to know the memory "truesize". Thus, need to
171 * know the memory frame size backing xdp_buff. 175 * know the memory frame size backing xdp_buff.
@@ -183,15 +187,15 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
183 * is not at a fixed memory location, with mixed length 187 * is not at a fixed memory location, with mixed length
184 * packets, which is bad for cache-line hotness. 188 * packets, which is bad for cache-line hotness.
185 */ 189 */
186 frame_size = SKB_DATA_ALIGN(xdpf->len + xdpf->headroom) + 190 frame_size = SKB_DATA_ALIGN(xdpf->len + hard_start_headroom) +
187 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 191 SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
188 192
189 pkt_data_start = xdpf->data - xdpf->headroom; 193 pkt_data_start = xdpf->data - hard_start_headroom;
190 skb = build_skb(pkt_data_start, frame_size); 194 skb = build_skb(pkt_data_start, frame_size);
191 if (!skb) 195 if (!skb)
192 return NULL; 196 return NULL;
193 197
194 skb_reserve(skb, xdpf->headroom); 198 skb_reserve(skb, hard_start_headroom);
195 __skb_put(skb, xdpf->len); 199 __skb_put(skb, xdpf->len);
196 if (xdpf->metasize) 200 if (xdpf->metasize)
197 skb_metadata_set(skb, xdpf->metasize); 201 skb_metadata_set(skb, xdpf->metasize);
@@ -205,6 +209,9 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
205 * - RX ring dev queue index (skb_record_rx_queue) 209 * - RX ring dev queue index (skb_record_rx_queue)
206 */ 210 */
207 211
212 /* Allow SKB to reuse area used by xdp_frame */
213 xdp_scrub_frame(xdpf);
214
208 return skb; 215 return skb;
209} 216}
210 217
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 2ada5e21dfa6..bc53e5b20ddc 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -554,19 +554,6 @@ struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type typ
554} 554}
555EXPORT_SYMBOL(bpf_prog_get_type_path); 555EXPORT_SYMBOL(bpf_prog_get_type_path);
556 556
557static void bpf_evict_inode(struct inode *inode)
558{
559 enum bpf_type type;
560
561 truncate_inode_pages_final(&inode->i_data);
562 clear_inode(inode);
563
564 if (S_ISLNK(inode->i_mode))
565 kfree(inode->i_link);
566 if (!bpf_inode_type(inode, &type))
567 bpf_any_put(inode->i_private, type);
568}
569
570/* 557/*
571 * Display the mount options in /proc/mounts. 558 * Display the mount options in /proc/mounts.
572 */ 559 */
@@ -579,11 +566,22 @@ static int bpf_show_options(struct seq_file *m, struct dentry *root)
579 return 0; 566 return 0;
580} 567}
581 568
569static void bpf_free_inode(struct inode *inode)
570{
571 enum bpf_type type;
572
573 if (S_ISLNK(inode->i_mode))
574 kfree(inode->i_link);
575 if (!bpf_inode_type(inode, &type))
576 bpf_any_put(inode->i_private, type);
577 free_inode_nonrcu(inode);
578}
579
582static const struct super_operations bpf_super_ops = { 580static const struct super_operations bpf_super_ops = {
583 .statfs = simple_statfs, 581 .statfs = simple_statfs,
584 .drop_inode = generic_delete_inode, 582 .drop_inode = generic_delete_inode,
585 .show_options = bpf_show_options, 583 .show_options = bpf_show_options,
586 .evict_inode = bpf_evict_inode, 584 .free_inode = bpf_free_inode,
587}; 585};
588 586
589enum { 587enum {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 62f6bced3a3c..afca36f53c49 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -136,21 +136,29 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
136 136
137void *bpf_map_area_alloc(size_t size, int numa_node) 137void *bpf_map_area_alloc(size_t size, int numa_node)
138{ 138{
139 /* We definitely need __GFP_NORETRY, so OOM killer doesn't 139 /* We really just want to fail instead of triggering OOM killer
140 * trigger under memory pressure as we really just want to 140 * under memory pressure, therefore we set __GFP_NORETRY to kmalloc,
141 * fail instead. 141 * which is used for lower order allocation requests.
142 *
143 * It has been observed that higher order allocation requests done by
144 * vmalloc with __GFP_NORETRY being set might fail due to not trying
145 * to reclaim memory from the page cache, thus we set
146 * __GFP_RETRY_MAYFAIL to avoid such situations.
142 */ 147 */
143 const gfp_t flags = __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO; 148
149 const gfp_t flags = __GFP_NOWARN | __GFP_ZERO;
144 void *area; 150 void *area;
145 151
146 if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { 152 if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
147 area = kmalloc_node(size, GFP_USER | flags, numa_node); 153 area = kmalloc_node(size, GFP_USER | __GFP_NORETRY | flags,
154 numa_node);
148 if (area != NULL) 155 if (area != NULL)
149 return area; 156 return area;
150 } 157 }
151 158
152 return __vmalloc_node_flags_caller(size, numa_node, GFP_KERNEL | flags, 159 return __vmalloc_node_flags_caller(size, numa_node,
153 __builtin_return_address(0)); 160 GFP_KERNEL | __GFP_RETRY_MAYFAIL |
161 flags, __builtin_return_address(0));
154} 162}
155 163
156void bpf_map_area_free(void *area) 164void bpf_map_area_free(void *area)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index ce166a002d16..09d5d972c9ff 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -212,7 +212,7 @@ struct bpf_call_arg_meta {
212 int access_size; 212 int access_size;
213 s64 msize_smax_value; 213 s64 msize_smax_value;
214 u64 msize_umax_value; 214 u64 msize_umax_value;
215 int ptr_id; 215 int ref_obj_id;
216 int func_id; 216 int func_id;
217}; 217};
218 218
@@ -346,35 +346,23 @@ static bool reg_type_may_be_null(enum bpf_reg_type type)
346 type == PTR_TO_TCP_SOCK_OR_NULL; 346 type == PTR_TO_TCP_SOCK_OR_NULL;
347} 347}
348 348
349static bool type_is_refcounted(enum bpf_reg_type type)
350{
351 return type == PTR_TO_SOCKET;
352}
353
354static bool type_is_refcounted_or_null(enum bpf_reg_type type)
355{
356 return type == PTR_TO_SOCKET || type == PTR_TO_SOCKET_OR_NULL;
357}
358
359static bool reg_is_refcounted(const struct bpf_reg_state *reg)
360{
361 return type_is_refcounted(reg->type);
362}
363
364static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) 349static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
365{ 350{
366 return reg->type == PTR_TO_MAP_VALUE && 351 return reg->type == PTR_TO_MAP_VALUE &&
367 map_value_has_spin_lock(reg->map_ptr); 352 map_value_has_spin_lock(reg->map_ptr);
368} 353}
369 354
370static bool reg_is_refcounted_or_null(const struct bpf_reg_state *reg) 355static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type)
371{ 356{
372 return type_is_refcounted_or_null(reg->type); 357 return type == PTR_TO_SOCKET ||
358 type == PTR_TO_SOCKET_OR_NULL ||
359 type == PTR_TO_TCP_SOCK ||
360 type == PTR_TO_TCP_SOCK_OR_NULL;
373} 361}
374 362
375static bool arg_type_is_refcounted(enum bpf_arg_type type) 363static bool arg_type_may_be_refcounted(enum bpf_arg_type type)
376{ 364{
377 return type == ARG_PTR_TO_SOCKET; 365 return type == ARG_PTR_TO_SOCK_COMMON;
378} 366}
379 367
380/* Determine whether the function releases some resources allocated by another 368/* Determine whether the function releases some resources allocated by another
@@ -392,6 +380,12 @@ static bool is_acquire_function(enum bpf_func_id func_id)
392 func_id == BPF_FUNC_sk_lookup_udp; 380 func_id == BPF_FUNC_sk_lookup_udp;
393} 381}
394 382
383static bool is_ptr_cast_function(enum bpf_func_id func_id)
384{
385 return func_id == BPF_FUNC_tcp_sock ||
386 func_id == BPF_FUNC_sk_fullsock;
387}
388
395/* string representation of 'enum bpf_reg_type' */ 389/* string representation of 'enum bpf_reg_type' */
396static const char * const reg_type_str[] = { 390static const char * const reg_type_str[] = {
397 [NOT_INIT] = "?", 391 [NOT_INIT] = "?",
@@ -466,6 +460,8 @@ static void print_verifier_state(struct bpf_verifier_env *env,
466 verbose(env, ",call_%d", func(env, reg)->callsite); 460 verbose(env, ",call_%d", func(env, reg)->callsite);
467 } else { 461 } else {
468 verbose(env, "(id=%d", reg->id); 462 verbose(env, "(id=%d", reg->id);
463 if (reg_type_may_be_refcounted_or_null(t))
464 verbose(env, ",ref_obj_id=%d", reg->ref_obj_id);
469 if (t != SCALAR_VALUE) 465 if (t != SCALAR_VALUE)
470 verbose(env, ",off=%d", reg->off); 466 verbose(env, ",off=%d", reg->off);
471 if (type_is_pkt_pointer(t)) 467 if (type_is_pkt_pointer(t))
@@ -1901,8 +1897,9 @@ continue_func:
1901 } 1897 }
1902 frame++; 1898 frame++;
1903 if (frame >= MAX_CALL_FRAMES) { 1899 if (frame >= MAX_CALL_FRAMES) {
1904 WARN_ONCE(1, "verifier bug. Call stack is too deep\n"); 1900 verbose(env, "the call stack of %d frames is too deep !\n",
1905 return -EFAULT; 1901 frame);
1902 return -E2BIG;
1906 } 1903 }
1907 goto process_func; 1904 goto process_func;
1908 } 1905 }
@@ -2414,16 +2411,15 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
2414 /* Any sk pointer can be ARG_PTR_TO_SOCK_COMMON */ 2411 /* Any sk pointer can be ARG_PTR_TO_SOCK_COMMON */
2415 if (!type_is_sk_pointer(type)) 2412 if (!type_is_sk_pointer(type))
2416 goto err_type; 2413 goto err_type;
2417 } else if (arg_type == ARG_PTR_TO_SOCKET) { 2414 if (reg->ref_obj_id) {
2418 expected_type = PTR_TO_SOCKET; 2415 if (meta->ref_obj_id) {
2419 if (type != expected_type) 2416 verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
2420 goto err_type; 2417 regno, reg->ref_obj_id,
2421 if (meta->ptr_id || !reg->id) { 2418 meta->ref_obj_id);
2422 verbose(env, "verifier internal error: mismatched references meta=%d, reg=%d\n", 2419 return -EFAULT;
2423 meta->ptr_id, reg->id); 2420 }
2424 return -EFAULT; 2421 meta->ref_obj_id = reg->ref_obj_id;
2425 } 2422 }
2426 meta->ptr_id = reg->id;
2427 } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) { 2423 } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) {
2428 if (meta->func_id == BPF_FUNC_spin_lock) { 2424 if (meta->func_id == BPF_FUNC_spin_lock) {
2429 if (process_spin_lock(env, regno, true)) 2425 if (process_spin_lock(env, regno, true))
@@ -2740,32 +2736,38 @@ static bool check_arg_pair_ok(const struct bpf_func_proto *fn)
2740 return true; 2736 return true;
2741} 2737}
2742 2738
2743static bool check_refcount_ok(const struct bpf_func_proto *fn) 2739static bool check_refcount_ok(const struct bpf_func_proto *fn, int func_id)
2744{ 2740{
2745 int count = 0; 2741 int count = 0;
2746 2742
2747 if (arg_type_is_refcounted(fn->arg1_type)) 2743 if (arg_type_may_be_refcounted(fn->arg1_type))
2748 count++; 2744 count++;
2749 if (arg_type_is_refcounted(fn->arg2_type)) 2745 if (arg_type_may_be_refcounted(fn->arg2_type))
2750 count++; 2746 count++;
2751 if (arg_type_is_refcounted(fn->arg3_type)) 2747 if (arg_type_may_be_refcounted(fn->arg3_type))
2752 count++; 2748 count++;
2753 if (arg_type_is_refcounted(fn->arg4_type)) 2749 if (arg_type_may_be_refcounted(fn->arg4_type))
2754 count++; 2750 count++;
2755 if (arg_type_is_refcounted(fn->arg5_type)) 2751 if (arg_type_may_be_refcounted(fn->arg5_type))
2756 count++; 2752 count++;
2757 2753
2754 /* A reference acquiring function cannot acquire
2755 * another refcounted ptr.
2756 */
2757 if (is_acquire_function(func_id) && count)
2758 return false;
2759
2758 /* We only support one arg being unreferenced at the moment, 2760 /* We only support one arg being unreferenced at the moment,
2759 * which is sufficient for the helper functions we have right now. 2761 * which is sufficient for the helper functions we have right now.
2760 */ 2762 */
2761 return count <= 1; 2763 return count <= 1;
2762} 2764}
2763 2765
2764static int check_func_proto(const struct bpf_func_proto *fn) 2766static int check_func_proto(const struct bpf_func_proto *fn, int func_id)
2765{ 2767{
2766 return check_raw_mode_ok(fn) && 2768 return check_raw_mode_ok(fn) &&
2767 check_arg_pair_ok(fn) && 2769 check_arg_pair_ok(fn) &&
2768 check_refcount_ok(fn) ? 0 : -EINVAL; 2770 check_refcount_ok(fn, func_id) ? 0 : -EINVAL;
2769} 2771}
2770 2772
2771/* Packet data might have moved, any old PTR_TO_PACKET[_META,_END] 2773/* Packet data might have moved, any old PTR_TO_PACKET[_META,_END]
@@ -2799,19 +2801,20 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
2799} 2801}
2800 2802
2801static void release_reg_references(struct bpf_verifier_env *env, 2803static void release_reg_references(struct bpf_verifier_env *env,
2802 struct bpf_func_state *state, int id) 2804 struct bpf_func_state *state,
2805 int ref_obj_id)
2803{ 2806{
2804 struct bpf_reg_state *regs = state->regs, *reg; 2807 struct bpf_reg_state *regs = state->regs, *reg;
2805 int i; 2808 int i;
2806 2809
2807 for (i = 0; i < MAX_BPF_REG; i++) 2810 for (i = 0; i < MAX_BPF_REG; i++)
2808 if (regs[i].id == id) 2811 if (regs[i].ref_obj_id == ref_obj_id)
2809 mark_reg_unknown(env, regs, i); 2812 mark_reg_unknown(env, regs, i);
2810 2813
2811 bpf_for_each_spilled_reg(i, state, reg) { 2814 bpf_for_each_spilled_reg(i, state, reg) {
2812 if (!reg) 2815 if (!reg)
2813 continue; 2816 continue;
2814 if (reg_is_refcounted(reg) && reg->id == id) 2817 if (reg->ref_obj_id == ref_obj_id)
2815 __mark_reg_unknown(reg); 2818 __mark_reg_unknown(reg);
2816 } 2819 }
2817} 2820}
@@ -2820,15 +2823,20 @@ static void release_reg_references(struct bpf_verifier_env *env,
2820 * resources. Identify all copies of the same pointer and clear the reference. 2823 * resources. Identify all copies of the same pointer and clear the reference.
2821 */ 2824 */
2822static int release_reference(struct bpf_verifier_env *env, 2825static int release_reference(struct bpf_verifier_env *env,
2823 struct bpf_call_arg_meta *meta) 2826 int ref_obj_id)
2824{ 2827{
2825 struct bpf_verifier_state *vstate = env->cur_state; 2828 struct bpf_verifier_state *vstate = env->cur_state;
2829 int err;
2826 int i; 2830 int i;
2827 2831
2832 err = release_reference_state(cur_func(env), ref_obj_id);
2833 if (err)
2834 return err;
2835
2828 for (i = 0; i <= vstate->curframe; i++) 2836 for (i = 0; i <= vstate->curframe; i++)
2829 release_reg_references(env, vstate->frame[i], meta->ptr_id); 2837 release_reg_references(env, vstate->frame[i], ref_obj_id);
2830 2838
2831 return release_reference_state(cur_func(env), meta->ptr_id); 2839 return 0;
2832} 2840}
2833 2841
2834static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, 2842static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
@@ -3047,7 +3055,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
3047 memset(&meta, 0, sizeof(meta)); 3055 memset(&meta, 0, sizeof(meta));
3048 meta.pkt_access = fn->pkt_access; 3056 meta.pkt_access = fn->pkt_access;
3049 3057
3050 err = check_func_proto(fn); 3058 err = check_func_proto(fn, func_id);
3051 if (err) { 3059 if (err) {
3052 verbose(env, "kernel subsystem misconfigured func %s#%d\n", 3060 verbose(env, "kernel subsystem misconfigured func %s#%d\n",
3053 func_id_name(func_id), func_id); 3061 func_id_name(func_id), func_id);
@@ -3093,7 +3101,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
3093 return err; 3101 return err;
3094 } 3102 }
3095 } else if (is_release_function(func_id)) { 3103 } else if (is_release_function(func_id)) {
3096 err = release_reference(env, &meta); 3104 err = release_reference(env, meta.ref_obj_id);
3097 if (err) { 3105 if (err) {
3098 verbose(env, "func %s#%d reference has not been acquired before\n", 3106 verbose(env, "func %s#%d reference has not been acquired before\n",
3099 func_id_name(func_id), func_id); 3107 func_id_name(func_id), func_id);
@@ -3154,8 +3162,10 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
3154 3162
3155 if (id < 0) 3163 if (id < 0)
3156 return id; 3164 return id;
3157 /* For release_reference() */ 3165 /* For mark_ptr_or_null_reg() */
3158 regs[BPF_REG_0].id = id; 3166 regs[BPF_REG_0].id = id;
3167 /* For release_reference() */
3168 regs[BPF_REG_0].ref_obj_id = id;
3159 } else { 3169 } else {
3160 /* For mark_ptr_or_null_reg() */ 3170 /* For mark_ptr_or_null_reg() */
3161 regs[BPF_REG_0].id = ++env->id_gen; 3171 regs[BPF_REG_0].id = ++env->id_gen;
@@ -3170,6 +3180,10 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
3170 return -EINVAL; 3180 return -EINVAL;
3171 } 3181 }
3172 3182
3183 if (is_ptr_cast_function(func_id))
3184 /* For release_reference() */
3185 regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id;
3186
3173 do_refine_retval_range(regs, fn->ret_type, func_id, &meta); 3187 do_refine_retval_range(regs, fn->ret_type, func_id, &meta);
3174 3188
3175 err = check_map_func_compatibility(env, meta.map_ptr, func_id); 3189 err = check_map_func_compatibility(env, meta.map_ptr, func_id);
@@ -3368,7 +3382,7 @@ do_sim:
3368 *dst_reg = *ptr_reg; 3382 *dst_reg = *ptr_reg;
3369 } 3383 }
3370 ret = push_stack(env, env->insn_idx + 1, env->insn_idx, true); 3384 ret = push_stack(env, env->insn_idx + 1, env->insn_idx, true);
3371 if (!ptr_is_dst_reg) 3385 if (!ptr_is_dst_reg && ret)
3372 *dst_reg = tmp; 3386 *dst_reg = tmp;
3373 return !ret ? -EFAULT : 0; 3387 return !ret ? -EFAULT : 0;
3374} 3388}
@@ -4124,15 +4138,35 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
4124 return 0; 4138 return 0;
4125} 4139}
4126 4140
4141static void __find_good_pkt_pointers(struct bpf_func_state *state,
4142 struct bpf_reg_state *dst_reg,
4143 enum bpf_reg_type type, u16 new_range)
4144{
4145 struct bpf_reg_state *reg;
4146 int i;
4147
4148 for (i = 0; i < MAX_BPF_REG; i++) {
4149 reg = &state->regs[i];
4150 if (reg->type == type && reg->id == dst_reg->id)
4151 /* keep the maximum range already checked */
4152 reg->range = max(reg->range, new_range);
4153 }
4154
4155 bpf_for_each_spilled_reg(i, state, reg) {
4156 if (!reg)
4157 continue;
4158 if (reg->type == type && reg->id == dst_reg->id)
4159 reg->range = max(reg->range, new_range);
4160 }
4161}
4162
4127static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, 4163static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
4128 struct bpf_reg_state *dst_reg, 4164 struct bpf_reg_state *dst_reg,
4129 enum bpf_reg_type type, 4165 enum bpf_reg_type type,
4130 bool range_right_open) 4166 bool range_right_open)
4131{ 4167{
4132 struct bpf_func_state *state = vstate->frame[vstate->curframe];
4133 struct bpf_reg_state *regs = state->regs, *reg;
4134 u16 new_range; 4168 u16 new_range;
4135 int i, j; 4169 int i;
4136 4170
4137 if (dst_reg->off < 0 || 4171 if (dst_reg->off < 0 ||
4138 (dst_reg->off == 0 && range_right_open)) 4172 (dst_reg->off == 0 && range_right_open))
@@ -4197,20 +4231,9 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
4197 * the range won't allow anything. 4231 * the range won't allow anything.
4198 * dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16. 4232 * dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16.
4199 */ 4233 */
4200 for (i = 0; i < MAX_BPF_REG; i++) 4234 for (i = 0; i <= vstate->curframe; i++)
4201 if (regs[i].type == type && regs[i].id == dst_reg->id) 4235 __find_good_pkt_pointers(vstate->frame[i], dst_reg, type,
4202 /* keep the maximum range already checked */ 4236 new_range);
4203 regs[i].range = max(regs[i].range, new_range);
4204
4205 for (j = 0; j <= vstate->curframe; j++) {
4206 state = vstate->frame[j];
4207 bpf_for_each_spilled_reg(i, state, reg) {
4208 if (!reg)
4209 continue;
4210 if (reg->type == type && reg->id == dst_reg->id)
4211 reg->range = max(reg->range, new_range);
4212 }
4213 }
4214} 4237}
4215 4238
4216/* compute branch direction of the expression "if (reg opcode val) goto target;" 4239/* compute branch direction of the expression "if (reg opcode val) goto target;"
@@ -4665,17 +4688,41 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
4665 } else if (reg->type == PTR_TO_TCP_SOCK_OR_NULL) { 4688 } else if (reg->type == PTR_TO_TCP_SOCK_OR_NULL) {
4666 reg->type = PTR_TO_TCP_SOCK; 4689 reg->type = PTR_TO_TCP_SOCK;
4667 } 4690 }
4668 if (is_null || !(reg_is_refcounted(reg) || 4691 if (is_null) {
4669 reg_may_point_to_spin_lock(reg))) { 4692 /* We don't need id and ref_obj_id from this point
4670 /* We don't need id from this point onwards anymore, 4693 * onwards anymore, thus we should better reset it,
4671 * thus we should better reset it, so that state 4694 * so that state pruning has chances to take effect.
4672 * pruning has chances to take effect. 4695 */
4696 reg->id = 0;
4697 reg->ref_obj_id = 0;
4698 } else if (!reg_may_point_to_spin_lock(reg)) {
4699 /* For not-NULL ptr, reg->ref_obj_id will be reset
4700 * in release_reg_references().
4701 *
4702 * reg->id is still used by spin_lock ptr. Other
4703 * than spin_lock ptr type, reg->id can be reset.
4673 */ 4704 */
4674 reg->id = 0; 4705 reg->id = 0;
4675 } 4706 }
4676 } 4707 }
4677} 4708}
4678 4709
4710static void __mark_ptr_or_null_regs(struct bpf_func_state *state, u32 id,
4711 bool is_null)
4712{
4713 struct bpf_reg_state *reg;
4714 int i;
4715
4716 for (i = 0; i < MAX_BPF_REG; i++)
4717 mark_ptr_or_null_reg(state, &state->regs[i], id, is_null);
4718
4719 bpf_for_each_spilled_reg(i, state, reg) {
4720 if (!reg)
4721 continue;
4722 mark_ptr_or_null_reg(state, reg, id, is_null);
4723 }
4724}
4725
4679/* The logic is similar to find_good_pkt_pointers(), both could eventually 4726/* The logic is similar to find_good_pkt_pointers(), both could eventually
4680 * be folded together at some point. 4727 * be folded together at some point.
4681 */ 4728 */
@@ -4683,24 +4730,20 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno,
4683 bool is_null) 4730 bool is_null)
4684{ 4731{
4685 struct bpf_func_state *state = vstate->frame[vstate->curframe]; 4732 struct bpf_func_state *state = vstate->frame[vstate->curframe];
4686 struct bpf_reg_state *reg, *regs = state->regs; 4733 struct bpf_reg_state *regs = state->regs;
4734 u32 ref_obj_id = regs[regno].ref_obj_id;
4687 u32 id = regs[regno].id; 4735 u32 id = regs[regno].id;
4688 int i, j; 4736 int i;
4689
4690 if (reg_is_refcounted_or_null(&regs[regno]) && is_null)
4691 release_reference_state(state, id);
4692 4737
4693 for (i = 0; i < MAX_BPF_REG; i++) 4738 if (ref_obj_id && ref_obj_id == id && is_null)
4694 mark_ptr_or_null_reg(state, &regs[i], id, is_null); 4739 /* regs[regno] is in the " == NULL" branch.
4740 * No one could have freed the reference state before
4741 * doing the NULL check.
4742 */
4743 WARN_ON_ONCE(release_reference_state(state, id));
4695 4744
4696 for (j = 0; j <= vstate->curframe; j++) { 4745 for (i = 0; i <= vstate->curframe; i++)
4697 state = vstate->frame[j]; 4746 __mark_ptr_or_null_regs(vstate->frame[i], id, is_null);
4698 bpf_for_each_spilled_reg(i, state, reg) {
4699 if (!reg)
4700 continue;
4701 mark_ptr_or_null_reg(state, reg, id, is_null);
4702 }
4703 }
4704} 4747}
4705 4748
4706static bool try_match_pkt_pointers(const struct bpf_insn *insn, 4749static bool try_match_pkt_pointers(const struct bpf_insn *insn,
@@ -6052,15 +6095,17 @@ static int propagate_liveness(struct bpf_verifier_env *env,
6052 } 6095 }
6053 /* Propagate read liveness of registers... */ 6096 /* Propagate read liveness of registers... */
6054 BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG); 6097 BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG);
6055 /* We don't need to worry about FP liveness because it's read-only */ 6098 for (frame = 0; frame <= vstate->curframe; frame++) {
6056 for (i = 0; i < BPF_REG_FP; i++) { 6099 /* We don't need to worry about FP liveness, it's read-only */
6057 if (vparent->frame[vparent->curframe]->regs[i].live & REG_LIVE_READ) 6100 for (i = frame < vstate->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) {
6058 continue; 6101 if (vparent->frame[frame]->regs[i].live & REG_LIVE_READ)
6059 if (vstate->frame[vstate->curframe]->regs[i].live & REG_LIVE_READ) { 6102 continue;
6060 err = mark_reg_read(env, &vstate->frame[vstate->curframe]->regs[i], 6103 if (vstate->frame[frame]->regs[i].live & REG_LIVE_READ) {
6061 &vparent->frame[vstate->curframe]->regs[i]); 6104 err = mark_reg_read(env, &vstate->frame[frame]->regs[i],
6062 if (err) 6105 &vparent->frame[frame]->regs[i]);
6063 return err; 6106 if (err)
6107 return err;
6108 }
6064 } 6109 }
6065 } 6110 }
6066 6111
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 4834c4214e9c..6a1942ed781c 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -740,11 +740,10 @@ static inline int nr_cpusets(void)
740 * Must be called with cpuset_mutex held. 740 * Must be called with cpuset_mutex held.
741 * 741 *
742 * The three key local variables below are: 742 * The three key local variables below are:
743 * q - a linked-list queue of cpuset pointers, used to implement a 743 * cp - cpuset pointer, used (together with pos_css) to perform a
744 * top-down scan of all cpusets. This scan loads a pointer 744 * top-down scan of all cpusets. For our purposes, rebuilding
745 * to each cpuset marked is_sched_load_balance into the 745 * the schedulers sched domains, we can ignore !is_sched_load_
746 * array 'csa'. For our purposes, rebuilding the schedulers 746 * balance cpusets.
747 * sched domains, we can ignore !is_sched_load_balance cpusets.
748 * csa - (for CpuSet Array) Array of pointers to all the cpusets 747 * csa - (for CpuSet Array) Array of pointers to all the cpusets
749 * that need to be load balanced, for convenient iterative 748 * that need to be load balanced, for convenient iterative
750 * access by the subsequent code that finds the best partition, 749 * access by the subsequent code that finds the best partition,
@@ -775,7 +774,7 @@ static inline int nr_cpusets(void)
775static int generate_sched_domains(cpumask_var_t **domains, 774static int generate_sched_domains(cpumask_var_t **domains,
776 struct sched_domain_attr **attributes) 775 struct sched_domain_attr **attributes)
777{ 776{
778 struct cpuset *cp; /* scans q */ 777 struct cpuset *cp; /* top-down scan of cpusets */
779 struct cpuset **csa; /* array of all cpuset ptrs */ 778 struct cpuset **csa; /* array of all cpuset ptrs */
780 int csn; /* how many cpuset ptrs in csa so far */ 779 int csn; /* how many cpuset ptrs in csa so far */
781 int i, j, k; /* indices for partition finding loops */ 780 int i, j, k; /* indices for partition finding loops */
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 025f419d16f6..f2ef10460698 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -9,6 +9,7 @@
9#include <linux/notifier.h> 9#include <linux/notifier.h>
10#include <linux/sched/signal.h> 10#include <linux/sched/signal.h>
11#include <linux/sched/hotplug.h> 11#include <linux/sched/hotplug.h>
12#include <linux/sched/isolation.h>
12#include <linux/sched/task.h> 13#include <linux/sched/task.h>
13#include <linux/sched/smt.h> 14#include <linux/sched/smt.h>
14#include <linux/unistd.h> 15#include <linux/unistd.h>
@@ -564,6 +565,20 @@ static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st)
564 cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL); 565 cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);
565} 566}
566 567
568static inline bool can_rollback_cpu(struct cpuhp_cpu_state *st)
569{
570 if (IS_ENABLED(CONFIG_HOTPLUG_CPU))
571 return true;
572 /*
573 * When CPU hotplug is disabled, then taking the CPU down is not
574 * possible because takedown_cpu() and the architecture and
575 * subsystem specific mechanisms are not available. So the CPU
576 * which would be completely unplugged again needs to stay around
577 * in the current state.
578 */
579 return st->state <= CPUHP_BRINGUP_CPU;
580}
581
567static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, 582static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
568 enum cpuhp_state target) 583 enum cpuhp_state target)
569{ 584{
@@ -574,8 +589,10 @@ static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
574 st->state++; 589 st->state++;
575 ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL); 590 ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
576 if (ret) { 591 if (ret) {
577 st->target = prev_state; 592 if (can_rollback_cpu(st)) {
578 undo_cpu_up(cpu, st); 593 st->target = prev_state;
594 undo_cpu_up(cpu, st);
595 }
579 break; 596 break;
580 } 597 }
581 } 598 }
@@ -844,6 +861,8 @@ static int take_cpu_down(void *_param)
844 861
845 /* Give up timekeeping duties */ 862 /* Give up timekeeping duties */
846 tick_handover_do_timer(); 863 tick_handover_do_timer();
864 /* Remove CPU from timer broadcasting */
865 tick_offline_cpu(cpu);
847 /* Park the stopper thread */ 866 /* Park the stopper thread */
848 stop_machine_park(cpu); 867 stop_machine_park(cpu);
849 return 0; 868 return 0;
@@ -1183,8 +1202,15 @@ int freeze_secondary_cpus(int primary)
1183 int cpu, error = 0; 1202 int cpu, error = 0;
1184 1203
1185 cpu_maps_update_begin(); 1204 cpu_maps_update_begin();
1186 if (!cpu_online(primary)) 1205 if (primary == -1) {
1187 primary = cpumask_first(cpu_online_mask); 1206 primary = cpumask_first(cpu_online_mask);
1207 if (!housekeeping_cpu(primary, HK_FLAG_TIMER))
1208 primary = housekeeping_any_cpu(HK_FLAG_TIMER);
1209 } else {
1210 if (!cpu_online(primary))
1211 primary = cpumask_first(cpu_online_mask);
1212 }
1213
1188 /* 1214 /*
1189 * We take down all of the non-boot CPUs in one shot to avoid races 1215 * We take down all of the non-boot CPUs in one shot to avoid races
1190 * with the userspace trying to use the CPU hotplug at the same time 1216 * with the userspace trying to use the CPU hotplug at the same time
@@ -2017,19 +2043,6 @@ static const struct attribute_group cpuhp_cpu_root_attr_group = {
2017 2043
2018#ifdef CONFIG_HOTPLUG_SMT 2044#ifdef CONFIG_HOTPLUG_SMT
2019 2045
2020static const char *smt_states[] = {
2021 [CPU_SMT_ENABLED] = "on",
2022 [CPU_SMT_DISABLED] = "off",
2023 [CPU_SMT_FORCE_DISABLED] = "forceoff",
2024 [CPU_SMT_NOT_SUPPORTED] = "notsupported",
2025};
2026
2027static ssize_t
2028show_smt_control(struct device *dev, struct device_attribute *attr, char *buf)
2029{
2030 return snprintf(buf, PAGE_SIZE - 2, "%s\n", smt_states[cpu_smt_control]);
2031}
2032
2033static void cpuhp_offline_cpu_device(unsigned int cpu) 2046static void cpuhp_offline_cpu_device(unsigned int cpu)
2034{ 2047{
2035 struct device *dev = get_cpu_device(cpu); 2048 struct device *dev = get_cpu_device(cpu);
@@ -2100,9 +2113,10 @@ static int cpuhp_smt_enable(void)
2100 return ret; 2113 return ret;
2101} 2114}
2102 2115
2116
2103static ssize_t 2117static ssize_t
2104store_smt_control(struct device *dev, struct device_attribute *attr, 2118__store_smt_control(struct device *dev, struct device_attribute *attr,
2105 const char *buf, size_t count) 2119 const char *buf, size_t count)
2106{ 2120{
2107 int ctrlval, ret; 2121 int ctrlval, ret;
2108 2122
@@ -2140,14 +2154,44 @@ store_smt_control(struct device *dev, struct device_attribute *attr,
2140 unlock_device_hotplug(); 2154 unlock_device_hotplug();
2141 return ret ? ret : count; 2155 return ret ? ret : count;
2142} 2156}
2157
2158#else /* !CONFIG_HOTPLUG_SMT */
2159static ssize_t
2160__store_smt_control(struct device *dev, struct device_attribute *attr,
2161 const char *buf, size_t count)
2162{
2163 return -ENODEV;
2164}
2165#endif /* CONFIG_HOTPLUG_SMT */
2166
2167static const char *smt_states[] = {
2168 [CPU_SMT_ENABLED] = "on",
2169 [CPU_SMT_DISABLED] = "off",
2170 [CPU_SMT_FORCE_DISABLED] = "forceoff",
2171 [CPU_SMT_NOT_SUPPORTED] = "notsupported",
2172 [CPU_SMT_NOT_IMPLEMENTED] = "notimplemented",
2173};
2174
2175static ssize_t
2176show_smt_control(struct device *dev, struct device_attribute *attr, char *buf)
2177{
2178 const char *state = smt_states[cpu_smt_control];
2179
2180 return snprintf(buf, PAGE_SIZE - 2, "%s\n", state);
2181}
2182
2183static ssize_t
2184store_smt_control(struct device *dev, struct device_attribute *attr,
2185 const char *buf, size_t count)
2186{
2187 return __store_smt_control(dev, attr, buf, count);
2188}
2143static DEVICE_ATTR(control, 0644, show_smt_control, store_smt_control); 2189static DEVICE_ATTR(control, 0644, show_smt_control, store_smt_control);
2144 2190
2145static ssize_t 2191static ssize_t
2146show_smt_active(struct device *dev, struct device_attribute *attr, char *buf) 2192show_smt_active(struct device *dev, struct device_attribute *attr, char *buf)
2147{ 2193{
2148 bool active = topology_max_smt_threads() > 1; 2194 return snprintf(buf, PAGE_SIZE - 2, "%d\n", sched_smt_active());
2149
2150 return snprintf(buf, PAGE_SIZE - 2, "%d\n", active);
2151} 2195}
2152static DEVICE_ATTR(active, 0444, show_smt_active, NULL); 2196static DEVICE_ATTR(active, 0444, show_smt_active, NULL);
2153 2197
@@ -2163,21 +2207,17 @@ static const struct attribute_group cpuhp_smt_attr_group = {
2163 NULL 2207 NULL
2164}; 2208};
2165 2209
2166static int __init cpu_smt_state_init(void) 2210static int __init cpu_smt_sysfs_init(void)
2167{ 2211{
2168 return sysfs_create_group(&cpu_subsys.dev_root->kobj, 2212 return sysfs_create_group(&cpu_subsys.dev_root->kobj,
2169 &cpuhp_smt_attr_group); 2213 &cpuhp_smt_attr_group);
2170} 2214}
2171 2215
2172#else
2173static inline int cpu_smt_state_init(void) { return 0; }
2174#endif
2175
2176static int __init cpuhp_sysfs_init(void) 2216static int __init cpuhp_sysfs_init(void)
2177{ 2217{
2178 int cpu, ret; 2218 int cpu, ret;
2179 2219
2180 ret = cpu_smt_state_init(); 2220 ret = cpu_smt_sysfs_init();
2181 if (ret) 2221 if (ret)
2182 return ret; 2222 return ret;
2183 2223
@@ -2198,7 +2238,7 @@ static int __init cpuhp_sysfs_init(void)
2198 return 0; 2238 return 0;
2199} 2239}
2200device_initcall(cpuhp_sysfs_init); 2240device_initcall(cpuhp_sysfs_init);
2201#endif 2241#endif /* CONFIG_SYSFS && CONFIG_HOTPLUG_CPU */
2202 2242
2203/* 2243/*
2204 * cpu_bit_bitmap[] is a special, "compressed" data structure that 2244 * cpu_bit_bitmap[] is a special, "compressed" data structure that
@@ -2288,3 +2328,18 @@ void __init boot_cpu_hotplug_init(void)
2288#endif 2328#endif
2289 this_cpu_write(cpuhp_state.state, CPUHP_ONLINE); 2329 this_cpu_write(cpuhp_state.state, CPUHP_ONLINE);
2290} 2330}
2331
2332enum cpu_mitigations cpu_mitigations __ro_after_init = CPU_MITIGATIONS_AUTO;
2333
2334static int __init mitigations_parse_cmdline(char *arg)
2335{
2336 if (!strcmp(arg, "off"))
2337 cpu_mitigations = CPU_MITIGATIONS_OFF;
2338 else if (!strcmp(arg, "auto"))
2339 cpu_mitigations = CPU_MITIGATIONS_AUTO;
2340 else if (!strcmp(arg, "auto,nosmt"))
2341 cpu_mitigations = CPU_MITIGATIONS_AUTO_NOSMT;
2342
2343 return 0;
2344}
2345early_param("mitigations", mitigations_parse_cmdline);
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index 45d51e8e26f6..badd77670d00 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -89,8 +89,8 @@ struct dma_debug_entry {
89 int sg_mapped_ents; 89 int sg_mapped_ents;
90 enum map_err_types map_err_type; 90 enum map_err_types map_err_type;
91#ifdef CONFIG_STACKTRACE 91#ifdef CONFIG_STACKTRACE
92 struct stack_trace stacktrace; 92 unsigned int stack_len;
93 unsigned long st_entries[DMA_DEBUG_STACKTRACE_ENTRIES]; 93 unsigned long stack_entries[DMA_DEBUG_STACKTRACE_ENTRIES];
94#endif 94#endif
95}; 95};
96 96
@@ -174,7 +174,7 @@ static inline void dump_entry_trace(struct dma_debug_entry *entry)
174#ifdef CONFIG_STACKTRACE 174#ifdef CONFIG_STACKTRACE
175 if (entry) { 175 if (entry) {
176 pr_warning("Mapped at:\n"); 176 pr_warning("Mapped at:\n");
177 print_stack_trace(&entry->stacktrace, 0); 177 stack_trace_print(entry->stack_entries, entry->stack_len, 0);
178 } 178 }
179#endif 179#endif
180} 180}
@@ -704,12 +704,10 @@ static struct dma_debug_entry *dma_entry_alloc(void)
704 spin_unlock_irqrestore(&free_entries_lock, flags); 704 spin_unlock_irqrestore(&free_entries_lock, flags);
705 705
706#ifdef CONFIG_STACKTRACE 706#ifdef CONFIG_STACKTRACE
707 entry->stacktrace.max_entries = DMA_DEBUG_STACKTRACE_ENTRIES; 707 entry->stack_len = stack_trace_save(entry->stack_entries,
708 entry->stacktrace.entries = entry->st_entries; 708 ARRAY_SIZE(entry->stack_entries),
709 entry->stacktrace.skip = 2; 709 1);
710 save_stack_trace(&entry->stacktrace);
711#endif 710#endif
712
713 return entry; 711 return entry;
714} 712}
715 713
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 53012db1e53c..6f7619c1f877 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -452,6 +452,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,
452 unsigned long mask; 452 unsigned long mask;
453 unsigned long offset_slots; 453 unsigned long offset_slots;
454 unsigned long max_slots; 454 unsigned long max_slots;
455 unsigned long tmp_io_tlb_used;
455 456
456 if (no_iotlb_memory) 457 if (no_iotlb_memory)
457 panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer"); 458 panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer");
@@ -538,9 +539,12 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,
538 } while (index != wrap); 539 } while (index != wrap);
539 540
540not_found: 541not_found:
542 tmp_io_tlb_used = io_tlb_used;
543
541 spin_unlock_irqrestore(&io_tlb_lock, flags); 544 spin_unlock_irqrestore(&io_tlb_lock, flags);
542 if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit()) 545 if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit())
543 dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes)\n", size); 546 dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n",
547 size, io_tlb_nslabs, tmp_io_tlb_used);
544 return DMA_MAPPING_ERROR; 548 return DMA_MAPPING_ERROR;
545found: 549found:
546 io_tlb_used += nslots; 550 io_tlb_used += nslots;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 1032a16bd186..abbd4b3b96c2 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2009,8 +2009,8 @@ event_sched_out(struct perf_event *event,
2009 event->pmu->del(event, 0); 2009 event->pmu->del(event, 0);
2010 event->oncpu = -1; 2010 event->oncpu = -1;
2011 2011
2012 if (event->pending_disable) { 2012 if (READ_ONCE(event->pending_disable) >= 0) {
2013 event->pending_disable = 0; 2013 WRITE_ONCE(event->pending_disable, -1);
2014 state = PERF_EVENT_STATE_OFF; 2014 state = PERF_EVENT_STATE_OFF;
2015 } 2015 }
2016 perf_event_set_state(event, state); 2016 perf_event_set_state(event, state);
@@ -2198,7 +2198,8 @@ EXPORT_SYMBOL_GPL(perf_event_disable);
2198 2198
2199void perf_event_disable_inatomic(struct perf_event *event) 2199void perf_event_disable_inatomic(struct perf_event *event)
2200{ 2200{
2201 event->pending_disable = 1; 2201 WRITE_ONCE(event->pending_disable, smp_processor_id());
2202 /* can fail, see perf_pending_event_disable() */
2202 irq_work_queue(&event->pending); 2203 irq_work_queue(&event->pending);
2203} 2204}
2204 2205
@@ -2477,6 +2478,16 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
2477 perf_pmu_enable(cpuctx->ctx.pmu); 2478 perf_pmu_enable(cpuctx->ctx.pmu);
2478} 2479}
2479 2480
2481void perf_pmu_resched(struct pmu *pmu)
2482{
2483 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
2484 struct perf_event_context *task_ctx = cpuctx->task_ctx;
2485
2486 perf_ctx_lock(cpuctx, task_ctx);
2487 ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU);
2488 perf_ctx_unlock(cpuctx, task_ctx);
2489}
2490
2480/* 2491/*
2481 * Cross CPU call to install and enable a performance event 2492 * Cross CPU call to install and enable a performance event
2482 * 2493 *
@@ -5810,10 +5821,45 @@ void perf_event_wakeup(struct perf_event *event)
5810 } 5821 }
5811} 5822}
5812 5823
5824static void perf_pending_event_disable(struct perf_event *event)
5825{
5826 int cpu = READ_ONCE(event->pending_disable);
5827
5828 if (cpu < 0)
5829 return;
5830
5831 if (cpu == smp_processor_id()) {
5832 WRITE_ONCE(event->pending_disable, -1);
5833 perf_event_disable_local(event);
5834 return;
5835 }
5836
5837 /*
5838 * CPU-A CPU-B
5839 *
5840 * perf_event_disable_inatomic()
5841 * @pending_disable = CPU-A;
5842 * irq_work_queue();
5843 *
5844 * sched-out
5845 * @pending_disable = -1;
5846 *
5847 * sched-in
5848 * perf_event_disable_inatomic()
5849 * @pending_disable = CPU-B;
5850 * irq_work_queue(); // FAILS
5851 *
5852 * irq_work_run()
5853 * perf_pending_event()
5854 *
5855 * But the event runs on CPU-B and wants disabling there.
5856 */
5857 irq_work_queue_on(&event->pending, cpu);
5858}
5859
5813static void perf_pending_event(struct irq_work *entry) 5860static void perf_pending_event(struct irq_work *entry)
5814{ 5861{
5815 struct perf_event *event = container_of(entry, 5862 struct perf_event *event = container_of(entry, struct perf_event, pending);
5816 struct perf_event, pending);
5817 int rctx; 5863 int rctx;
5818 5864
5819 rctx = perf_swevent_get_recursion_context(); 5865 rctx = perf_swevent_get_recursion_context();
@@ -5822,10 +5868,7 @@ static void perf_pending_event(struct irq_work *entry)
5822 * and we won't recurse 'further'. 5868 * and we won't recurse 'further'.
5823 */ 5869 */
5824 5870
5825 if (event->pending_disable) { 5871 perf_pending_event_disable(event);
5826 event->pending_disable = 0;
5827 perf_event_disable_local(event);
5828 }
5829 5872
5830 if (event->pending_wakeup) { 5873 if (event->pending_wakeup) {
5831 event->pending_wakeup = 0; 5874 event->pending_wakeup = 0;
@@ -7189,6 +7232,7 @@ static void perf_event_mmap_output(struct perf_event *event,
7189 struct perf_output_handle handle; 7232 struct perf_output_handle handle;
7190 struct perf_sample_data sample; 7233 struct perf_sample_data sample;
7191 int size = mmap_event->event_id.header.size; 7234 int size = mmap_event->event_id.header.size;
7235 u32 type = mmap_event->event_id.header.type;
7192 int ret; 7236 int ret;
7193 7237
7194 if (!perf_event_mmap_match(event, data)) 7238 if (!perf_event_mmap_match(event, data))
@@ -7232,6 +7276,7 @@ static void perf_event_mmap_output(struct perf_event *event,
7232 perf_output_end(&handle); 7276 perf_output_end(&handle);
7233out: 7277out:
7234 mmap_event->event_id.header.size = size; 7278 mmap_event->event_id.header.size = size;
7279 mmap_event->event_id.header.type = type;
7235} 7280}
7236 7281
7237static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) 7282static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
@@ -9042,26 +9087,29 @@ static void perf_event_addr_filters_apply(struct perf_event *event)
9042 if (task == TASK_TOMBSTONE) 9087 if (task == TASK_TOMBSTONE)
9043 return; 9088 return;
9044 9089
9045 if (!ifh->nr_file_filters) 9090 if (ifh->nr_file_filters) {
9046 return; 9091 mm = get_task_mm(event->ctx->task);
9047 9092 if (!mm)
9048 mm = get_task_mm(event->ctx->task); 9093 goto restart;
9049 if (!mm)
9050 goto restart;
9051 9094
9052 down_read(&mm->mmap_sem); 9095 down_read(&mm->mmap_sem);
9096 }
9053 9097
9054 raw_spin_lock_irqsave(&ifh->lock, flags); 9098 raw_spin_lock_irqsave(&ifh->lock, flags);
9055 list_for_each_entry(filter, &ifh->list, entry) { 9099 list_for_each_entry(filter, &ifh->list, entry) {
9056 event->addr_filter_ranges[count].start = 0; 9100 if (filter->path.dentry) {
9057 event->addr_filter_ranges[count].size = 0; 9101 /*
9102 * Adjust base offset if the filter is associated to a
9103 * binary that needs to be mapped:
9104 */
9105 event->addr_filter_ranges[count].start = 0;
9106 event->addr_filter_ranges[count].size = 0;
9058 9107
9059 /*
9060 * Adjust base offset if the filter is associated to a binary
9061 * that needs to be mapped:
9062 */
9063 if (filter->path.dentry)
9064 perf_addr_filter_apply(filter, mm, &event->addr_filter_ranges[count]); 9108 perf_addr_filter_apply(filter, mm, &event->addr_filter_ranges[count]);
9109 } else {
9110 event->addr_filter_ranges[count].start = filter->offset;
9111 event->addr_filter_ranges[count].size = filter->size;
9112 }
9065 9113
9066 count++; 9114 count++;
9067 } 9115 }
@@ -9069,9 +9117,11 @@ static void perf_event_addr_filters_apply(struct perf_event *event)
9069 event->addr_filters_gen++; 9117 event->addr_filters_gen++;
9070 raw_spin_unlock_irqrestore(&ifh->lock, flags); 9118 raw_spin_unlock_irqrestore(&ifh->lock, flags);
9071 9119
9072 up_read(&mm->mmap_sem); 9120 if (ifh->nr_file_filters) {
9121 up_read(&mm->mmap_sem);
9073 9122
9074 mmput(mm); 9123 mmput(mm);
9124 }
9075 9125
9076restart: 9126restart:
9077 perf_event_stop(event, 1); 9127 perf_event_stop(event, 1);
@@ -10234,6 +10284,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
10234 10284
10235 10285
10236 init_waitqueue_head(&event->waitq); 10286 init_waitqueue_head(&event->waitq);
10287 event->pending_disable = -1;
10237 init_irq_work(&event->pending, perf_pending_event); 10288 init_irq_work(&event->pending, perf_pending_event);
10238 10289
10239 mutex_init(&event->mmap_mutex); 10290 mutex_init(&event->mmap_mutex);
@@ -11876,7 +11927,7 @@ static void __init perf_event_init_all_cpus(void)
11876 } 11927 }
11877} 11928}
11878 11929
11879void perf_swevent_init_cpu(unsigned int cpu) 11930static void perf_swevent_init_cpu(unsigned int cpu)
11880{ 11931{
11881 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); 11932 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
11882 11933
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index a4047321d7d8..674b35383491 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -392,7 +392,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
392 * store that will be enabled on successful return 392 * store that will be enabled on successful return
393 */ 393 */
394 if (!handle->size) { /* A, matches D */ 394 if (!handle->size) { /* A, matches D */
395 event->pending_disable = 1; 395 event->pending_disable = smp_processor_id();
396 perf_output_wakeup(handle); 396 perf_output_wakeup(handle);
397 local_set(&rb->aux_nest, 0); 397 local_set(&rb->aux_nest, 0);
398 goto err_put; 398 goto err_put;
@@ -455,24 +455,21 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
455 rb->aux_head += size; 455 rb->aux_head += size;
456 } 456 }
457 457
458 if (size || handle->aux_flags) { 458 /*
459 /* 459 * Only send RECORD_AUX if we have something useful to communicate
460 * Only send RECORD_AUX if we have something useful to communicate 460 *
461 * 461 * Note: the OVERWRITE records by themselves are not considered
462 * Note: the OVERWRITE records by themselves are not considered 462 * useful, as they don't communicate any *new* information,
463 * useful, as they don't communicate any *new* information, 463 * aside from the short-lived offset, that becomes history at
464 * aside from the short-lived offset, that becomes history at 464 * the next event sched-in and therefore isn't useful.
465 * the next event sched-in and therefore isn't useful. 465 * The userspace that needs to copy out AUX data in overwrite
466 * The userspace that needs to copy out AUX data in overwrite 466 * mode should know to use user_page::aux_head for the actual
467 * mode should know to use user_page::aux_head for the actual 467 * offset. So, from now on we don't output AUX records that
468 * offset. So, from now on we don't output AUX records that 468 * have *only* OVERWRITE flag set.
469 * have *only* OVERWRITE flag set. 469 */
470 */ 470 if (size || (handle->aux_flags & ~(u64)PERF_AUX_FLAG_OVERWRITE))
471 471 perf_event_aux_event(handle->event, aux_head, size,
472 if (handle->aux_flags & ~(u64)PERF_AUX_FLAG_OVERWRITE) 472 handle->aux_flags);
473 perf_event_aux_event(handle->event, aux_head, size,
474 handle->aux_flags);
475 }
476 473
477 rb->user_page->aux_head = rb->aux_head; 474 rb->user_page->aux_head = rb->aux_head;
478 if (rb_need_aux_wakeup(rb)) 475 if (rb_need_aux_wakeup(rb))
@@ -480,7 +477,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
480 477
481 if (wakeup) { 478 if (wakeup) {
482 if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED) 479 if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED)
483 handle->event->pending_disable = 1; 480 handle->event->pending_disable = smp_processor_id();
484 perf_output_wakeup(handle); 481 perf_output_wakeup(handle);
485 } 482 }
486 483
@@ -613,8 +610,7 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
613 * PMU requests more than one contiguous chunks of memory 610 * PMU requests more than one contiguous chunks of memory
614 * for SW double buffering 611 * for SW double buffering
615 */ 612 */
616 if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_SW_DOUBLEBUF) && 613 if (!overwrite) {
617 !overwrite) {
618 if (!max_order) 614 if (!max_order)
619 return -EINVAL; 615 return -EINVAL;
620 616
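Taken together with the perf_event_alloc() hunk above that initializes event->pending_disable to -1, these ring_buffer.c changes turn pending_disable from a boolean into the id of the CPU that requested the disable. A minimal kernel-context sketch of that encoding, for illustration only (the example_* helpers are hypothetical names, not functions from this patch):

	/* -1: nothing pending; >= 0: CPU that requested the disable. */
	static void example_request_disable(struct perf_event *event)
	{
		event->pending_disable = smp_processor_id();
		irq_work_queue(&event->pending);
	}

	static void example_handle_pending(struct perf_event *event)
	{
		int cpu = READ_ONCE(event->pending_disable);

		if (cpu < 0)
			return;

		event->pending_disable = -1;
		/* Only disable directly when the irq_work runs on that CPU. */
		if (cpu == smp_processor_id())
			perf_event_disable_local(event);
	}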
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index c5cde87329c7..4ca7364c956d 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -2028,7 +2028,7 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
2028 if (uc->handler) { 2028 if (uc->handler) {
2029 rc = uc->handler(uc, regs); 2029 rc = uc->handler(uc, regs);
2030 WARN(rc & ~UPROBE_HANDLER_MASK, 2030 WARN(rc & ~UPROBE_HANDLER_MASK,
2031 "bad rc=0x%x from %pf()\n", rc, uc->handler); 2031 "bad rc=0x%x from %ps()\n", rc, uc->handler);
2032 } 2032 }
2033 2033
2034 if (uc->ret_handler) 2034 if (uc->ret_handler)
@@ -2294,16 +2294,14 @@ static struct notifier_block uprobe_exception_nb = {
2294 .priority = INT_MAX-1, /* notified after kprobes, kgdb */ 2294 .priority = INT_MAX-1, /* notified after kprobes, kgdb */
2295}; 2295};
2296 2296
2297static int __init init_uprobes(void) 2297void __init uprobes_init(void)
2298{ 2298{
2299 int i; 2299 int i;
2300 2300
2301 for (i = 0; i < UPROBES_HASH_SZ; i++) 2301 for (i = 0; i < UPROBES_HASH_SZ; i++)
2302 mutex_init(&uprobes_mmap_mutex[i]); 2302 mutex_init(&uprobes_mmap_mutex[i]);
2303 2303
2304 if (percpu_init_rwsem(&dup_mmap_sem)) 2304 BUG_ON(percpu_init_rwsem(&dup_mmap_sem));
2305 return -ENOMEM;
2306 2305
2307 return register_die_notifier(&uprobe_exception_nb); 2306 BUG_ON(register_die_notifier(&uprobe_exception_nb));
2308} 2307}
2309__initcall(init_uprobes);
diff --git a/kernel/fail_function.c b/kernel/fail_function.c
index 17f75b545f66..feb80712b913 100644
--- a/kernel/fail_function.c
+++ b/kernel/fail_function.c
@@ -210,7 +210,7 @@ static int fei_seq_show(struct seq_file *m, void *v)
210{ 210{
211 struct fei_attr *attr = list_entry(v, struct fei_attr, list); 211 struct fei_attr *attr = list_entry(v, struct fei_attr, list);
212 212
213 seq_printf(m, "%pf\n", attr->kp.addr); 213 seq_printf(m, "%ps\n", attr->kp.addr);
214 return 0; 214 return 0;
215} 215}
216 216
diff --git a/kernel/fork.c b/kernel/fork.c
index 9dcd18aa210b..8b03d93ba068 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -11,6 +11,7 @@
11 * management can be a bitch. See 'mm/memory.c': 'copy_page_range()' 11 * management can be a bitch. See 'mm/memory.c': 'copy_page_range()'
12 */ 12 */
13 13
14#include <linux/anon_inodes.h>
14#include <linux/slab.h> 15#include <linux/slab.h>
15#include <linux/sched/autogroup.h> 16#include <linux/sched/autogroup.h>
16#include <linux/sched/mm.h> 17#include <linux/sched/mm.h>
@@ -21,6 +22,7 @@
21#include <linux/sched/task.h> 22#include <linux/sched/task.h>
22#include <linux/sched/task_stack.h> 23#include <linux/sched/task_stack.h>
23#include <linux/sched/cputime.h> 24#include <linux/sched/cputime.h>
25#include <linux/seq_file.h>
24#include <linux/rtmutex.h> 26#include <linux/rtmutex.h>
25#include <linux/init.h> 27#include <linux/init.h>
26#include <linux/unistd.h> 28#include <linux/unistd.h>
@@ -815,6 +817,7 @@ void __init fork_init(void)
815#endif 817#endif
816 818
817 lockdep_init_task(&init_task); 819 lockdep_init_task(&init_task);
820 uprobes_init();
818} 821}
819 822
820int __weak arch_dup_task_struct(struct task_struct *dst, 823int __weak arch_dup_task_struct(struct task_struct *dst,
@@ -1298,13 +1301,20 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
1298 complete_vfork_done(tsk); 1301 complete_vfork_done(tsk);
1299} 1302}
1300 1303
1301/* 1304/**
1302 * Allocate a new mm structure and copy contents from the 1305 * dup_mm() - duplicates an existing mm structure
1303 * mm structure of the passed in task structure. 1306 * @tsk: the task_struct with which the new mm will be associated.
1307 * @oldmm: the mm to duplicate.
1308 *
1309 * Allocates a new mm structure and duplicates the provided @oldmm structure
1310 * content into it.
1311 *
1312 * Return: the duplicated mm or NULL on failure.
1304 */ 1313 */
1305static struct mm_struct *dup_mm(struct task_struct *tsk) 1314static struct mm_struct *dup_mm(struct task_struct *tsk,
1315 struct mm_struct *oldmm)
1306{ 1316{
1307 struct mm_struct *mm, *oldmm = current->mm; 1317 struct mm_struct *mm;
1308 int err; 1318 int err;
1309 1319
1310 mm = allocate_mm(); 1320 mm = allocate_mm();
@@ -1371,7 +1381,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
1371 } 1381 }
1372 1382
1373 retval = -ENOMEM; 1383 retval = -ENOMEM;
1374 mm = dup_mm(tsk); 1384 mm = dup_mm(tsk, current->mm);
1375 if (!mm) 1385 if (!mm)
1376 goto fail_nomem; 1386 goto fail_nomem;
1377 1387
@@ -1662,6 +1672,58 @@ static inline void rcu_copy_process(struct task_struct *p)
1662#endif /* #ifdef CONFIG_TASKS_RCU */ 1672#endif /* #ifdef CONFIG_TASKS_RCU */
1663} 1673}
1664 1674
1675static int pidfd_release(struct inode *inode, struct file *file)
1676{
1677 struct pid *pid = file->private_data;
1678
1679 file->private_data = NULL;
1680 put_pid(pid);
1681 return 0;
1682}
1683
1684#ifdef CONFIG_PROC_FS
1685static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
1686{
1687 struct pid_namespace *ns = proc_pid_ns(file_inode(m->file));
1688 struct pid *pid = f->private_data;
1689
1690 seq_put_decimal_ull(m, "Pid:\t", pid_nr_ns(pid, ns));
1691 seq_putc(m, '\n');
1692}
1693#endif
1694
1695const struct file_operations pidfd_fops = {
1696 .release = pidfd_release,
1697#ifdef CONFIG_PROC_FS
1698 .show_fdinfo = pidfd_show_fdinfo,
1699#endif
1700};
1701
1702/**
1703 * pidfd_create() - Create a new pid file descriptor.
1704 *
1705 * @pid: struct pid that the pidfd will reference
1706 *
1707 * This creates a new pid file descriptor with the O_CLOEXEC flag set.
1708 *
 1709 * Note that this function can only be called after the fd table has
1710 * been unshared to avoid leaking the pidfd to the new process.
1711 *
1712 * Return: On success, a cloexec pidfd is returned.
1713 * On error, a negative errno number will be returned.
1714 */
1715static int pidfd_create(struct pid *pid)
1716{
1717 int fd;
1718
1719 fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid),
1720 O_RDWR | O_CLOEXEC);
1721 if (fd < 0)
1722 put_pid(pid);
1723
1724 return fd;
1725}
1726
1665/* 1727/*
1666 * This creates a new process as a copy of the old one, 1728 * This creates a new process as a copy of the old one,
1667 * but does not actually start it yet. 1729 * but does not actually start it yet.
@@ -1674,13 +1736,14 @@ static __latent_entropy struct task_struct *copy_process(
1674 unsigned long clone_flags, 1736 unsigned long clone_flags,
1675 unsigned long stack_start, 1737 unsigned long stack_start,
1676 unsigned long stack_size, 1738 unsigned long stack_size,
1739 int __user *parent_tidptr,
1677 int __user *child_tidptr, 1740 int __user *child_tidptr,
1678 struct pid *pid, 1741 struct pid *pid,
1679 int trace, 1742 int trace,
1680 unsigned long tls, 1743 unsigned long tls,
1681 int node) 1744 int node)
1682{ 1745{
1683 int retval; 1746 int pidfd = -1, retval;
1684 struct task_struct *p; 1747 struct task_struct *p;
1685 struct multiprocess_signals delayed; 1748 struct multiprocess_signals delayed;
1686 1749
@@ -1730,6 +1793,31 @@ static __latent_entropy struct task_struct *copy_process(
1730 return ERR_PTR(-EINVAL); 1793 return ERR_PTR(-EINVAL);
1731 } 1794 }
1732 1795
1796 if (clone_flags & CLONE_PIDFD) {
1797 int reserved;
1798
1799 /*
1800 * - CLONE_PARENT_SETTID is useless for pidfds and also
1801 * parent_tidptr is used to return pidfds.
1802 * - CLONE_DETACHED is blocked so that we can potentially
1803 * reuse it later for CLONE_PIDFD.
1804 * - CLONE_THREAD is blocked until someone really needs it.
1805 */
1806 if (clone_flags &
1807 (CLONE_DETACHED | CLONE_PARENT_SETTID | CLONE_THREAD))
1808 return ERR_PTR(-EINVAL);
1809
1810 /*
1811 * Verify that parent_tidptr is sane so we can potentially
1812 * reuse it later.
1813 */
1814 if (get_user(reserved, parent_tidptr))
1815 return ERR_PTR(-EFAULT);
1816
1817 if (reserved != 0)
1818 return ERR_PTR(-EINVAL);
1819 }
1820
1733 /* 1821 /*
1734 * Force any signals received before this point to be delivered 1822 * Force any signals received before this point to be delivered
1735 * before the fork happens. Collect up signals sent to multiple 1823 * before the fork happens. Collect up signals sent to multiple
@@ -1936,6 +2024,22 @@ static __latent_entropy struct task_struct *copy_process(
1936 } 2024 }
1937 } 2025 }
1938 2026
2027 /*
2028 * This has to happen after we've potentially unshared the file
2029 * descriptor table (so that the pidfd doesn't leak into the child
2030 * if the fd table isn't shared).
2031 */
2032 if (clone_flags & CLONE_PIDFD) {
2033 retval = pidfd_create(pid);
2034 if (retval < 0)
2035 goto bad_fork_free_pid;
2036
2037 pidfd = retval;
2038 retval = put_user(pidfd, parent_tidptr);
2039 if (retval)
2040 goto bad_fork_put_pidfd;
2041 }
2042
1939#ifdef CONFIG_BLOCK 2043#ifdef CONFIG_BLOCK
1940 p->plug = NULL; 2044 p->plug = NULL;
1941#endif 2045#endif
@@ -1996,7 +2100,7 @@ static __latent_entropy struct task_struct *copy_process(
1996 */ 2100 */
1997 retval = cgroup_can_fork(p); 2101 retval = cgroup_can_fork(p);
1998 if (retval) 2102 if (retval)
1999 goto bad_fork_free_pid; 2103 goto bad_fork_put_pidfd;
2000 2104
2001 /* 2105 /*
2002 * From this point on we must avoid any synchronous user-space 2106 * From this point on we must avoid any synchronous user-space
@@ -2111,6 +2215,9 @@ bad_fork_cancel_cgroup:
2111 spin_unlock(&current->sighand->siglock); 2215 spin_unlock(&current->sighand->siglock);
2112 write_unlock_irq(&tasklist_lock); 2216 write_unlock_irq(&tasklist_lock);
2113 cgroup_cancel_fork(p); 2217 cgroup_cancel_fork(p);
2218bad_fork_put_pidfd:
2219 if (clone_flags & CLONE_PIDFD)
2220 ksys_close(pidfd);
2114bad_fork_free_pid: 2221bad_fork_free_pid:
2115 cgroup_threadgroup_change_end(current); 2222 cgroup_threadgroup_change_end(current);
2116 if (pid != &init_struct_pid) 2223 if (pid != &init_struct_pid)
@@ -2176,7 +2283,7 @@ static inline void init_idle_pids(struct task_struct *idle)
2176struct task_struct *fork_idle(int cpu) 2283struct task_struct *fork_idle(int cpu)
2177{ 2284{
2178 struct task_struct *task; 2285 struct task_struct *task;
2179 task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0, 2286 task = copy_process(CLONE_VM, 0, 0, NULL, NULL, &init_struct_pid, 0, 0,
2180 cpu_to_node(cpu)); 2287 cpu_to_node(cpu));
2181 if (!IS_ERR(task)) { 2288 if (!IS_ERR(task)) {
2182 init_idle_pids(task); 2289 init_idle_pids(task);
@@ -2186,6 +2293,11 @@ struct task_struct *fork_idle(int cpu)
2186 return task; 2293 return task;
2187} 2294}
2188 2295
2296struct mm_struct *copy_init_mm(void)
2297{
2298 return dup_mm(NULL, &init_mm);
2299}
2300
2189/* 2301/*
2190 * Ok, this is the main fork-routine. 2302 * Ok, this is the main fork-routine.
2191 * 2303 *
@@ -2223,7 +2335,7 @@ long _do_fork(unsigned long clone_flags,
2223 trace = 0; 2335 trace = 0;
2224 } 2336 }
2225 2337
2226 p = copy_process(clone_flags, stack_start, stack_size, 2338 p = copy_process(clone_flags, stack_start, stack_size, parent_tidptr,
2227 child_tidptr, NULL, trace, tls, NUMA_NO_NODE); 2339 child_tidptr, NULL, trace, tls, NUMA_NO_NODE);
2228 add_latent_entropy(); 2340 add_latent_entropy();
2229 2341
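The CLONE_PIDFD path added above returns the new descriptor through parent_tidptr, which must point to a zero value beforehand and must not be combined with CLONE_PARENT_SETTID, CLONE_DETACHED or CLONE_THREAD. A minimal userspace sketch of how a caller would use it; the x86_64 raw clone() argument order and the CLONE_PIDFD flag value are assumptions, not taken from this hunk:

	#define _GNU_SOURCE
	#include <sched.h>
	#include <signal.h>
	#include <stdio.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	#ifndef CLONE_PIDFD
	#define CLONE_PIDFD 0x00001000	/* assumed flag value for this series */
	#endif

	int main(void)
	{
		int pidfd = 0;	/* must point to 0; the kernel verifies this */
		long pid;

		/* x86_64 raw clone argument order: flags, stack, ptid, ctid, tls */
		pid = syscall(SYS_clone, CLONE_PIDFD | SIGCHLD, NULL, &pidfd, NULL, 0);
		if (pid < 0) {
			perror("clone");
			return 1;
		}
		if (pid == 0)		/* child: continues here, fork() style */
			_exit(0);

		/* parent: pidfd is a cloexec fd referring to the child */
		printf("child pid %ld, pidfd %d\n", pid, pidfd);
		close(pidfd);
		return 0;
	}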
diff --git a/kernel/futex.c b/kernel/futex.c
index c3b73b0311bc..6262f1534ac9 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1311,13 +1311,15 @@ static int lookup_pi_state(u32 __user *uaddr, u32 uval,
1311 1311
1312static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) 1312static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
1313{ 1313{
1314 int err;
1314 u32 uninitialized_var(curval); 1315 u32 uninitialized_var(curval);
1315 1316
1316 if (unlikely(should_fail_futex(true))) 1317 if (unlikely(should_fail_futex(true)))
1317 return -EFAULT; 1318 return -EFAULT;
1318 1319
1319 if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) 1320 err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
1320 return -EFAULT; 1321 if (unlikely(err))
1322 return err;
1321 1323
1322 /* If user space value changed, let the caller retry */ 1324 /* If user space value changed, let the caller retry */
1323 return curval != uval ? -EAGAIN : 0; 1325 return curval != uval ? -EAGAIN : 0;
@@ -1502,10 +1504,8 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_
1502 if (unlikely(should_fail_futex(true))) 1504 if (unlikely(should_fail_futex(true)))
1503 ret = -EFAULT; 1505 ret = -EFAULT;
1504 1506
1505 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) { 1507 ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
1506 ret = -EFAULT; 1508 if (!ret && (curval != uval)) {
1507
1508 } else if (curval != uval) {
1509 /* 1509 /*
1510 * If a unconditional UNLOCK_PI operation (user space did not 1510 * If a unconditional UNLOCK_PI operation (user space did not
1511 * try the TID->0 transition) raced with a waiter setting the 1511 * try the TID->0 transition) raced with a waiter setting the
@@ -1700,32 +1700,32 @@ retry_private:
1700 double_lock_hb(hb1, hb2); 1700 double_lock_hb(hb1, hb2);
1701 op_ret = futex_atomic_op_inuser(op, uaddr2); 1701 op_ret = futex_atomic_op_inuser(op, uaddr2);
1702 if (unlikely(op_ret < 0)) { 1702 if (unlikely(op_ret < 0)) {
1703
1704 double_unlock_hb(hb1, hb2); 1703 double_unlock_hb(hb1, hb2);
1705 1704
1706#ifndef CONFIG_MMU 1705 if (!IS_ENABLED(CONFIG_MMU) ||
1707 /* 1706 unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) {
1708 * we don't get EFAULT from MMU faults if we don't have an MMU, 1707 /*
1709 * but we might get them from range checking 1708 * we don't get EFAULT from MMU faults if we don't have
1710 */ 1709 * an MMU, but we might get them from range checking
1711 ret = op_ret; 1710 */
1712 goto out_put_keys;
1713#endif
1714
1715 if (unlikely(op_ret != -EFAULT)) {
1716 ret = op_ret; 1711 ret = op_ret;
1717 goto out_put_keys; 1712 goto out_put_keys;
1718 } 1713 }
1719 1714
1720 ret = fault_in_user_writeable(uaddr2); 1715 if (op_ret == -EFAULT) {
1721 if (ret) 1716 ret = fault_in_user_writeable(uaddr2);
1722 goto out_put_keys; 1717 if (ret)
1718 goto out_put_keys;
1719 }
1723 1720
1724 if (!(flags & FLAGS_SHARED)) 1721 if (!(flags & FLAGS_SHARED)) {
1722 cond_resched();
1725 goto retry_private; 1723 goto retry_private;
1724 }
1726 1725
1727 put_futex_key(&key2); 1726 put_futex_key(&key2);
1728 put_futex_key(&key1); 1727 put_futex_key(&key1);
1728 cond_resched();
1729 goto retry; 1729 goto retry;
1730 } 1730 }
1731 1731
@@ -2350,7 +2350,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
2350 u32 uval, uninitialized_var(curval), newval; 2350 u32 uval, uninitialized_var(curval), newval;
2351 struct task_struct *oldowner, *newowner; 2351 struct task_struct *oldowner, *newowner;
2352 u32 newtid; 2352 u32 newtid;
2353 int ret; 2353 int ret, err = 0;
2354 2354
2355 lockdep_assert_held(q->lock_ptr); 2355 lockdep_assert_held(q->lock_ptr);
2356 2356
@@ -2421,14 +2421,17 @@ retry:
2421 if (!pi_state->owner) 2421 if (!pi_state->owner)
2422 newtid |= FUTEX_OWNER_DIED; 2422 newtid |= FUTEX_OWNER_DIED;
2423 2423
2424 if (get_futex_value_locked(&uval, uaddr)) 2424 err = get_futex_value_locked(&uval, uaddr);
2425 goto handle_fault; 2425 if (err)
2426 goto handle_err;
2426 2427
2427 for (;;) { 2428 for (;;) {
2428 newval = (uval & FUTEX_OWNER_DIED) | newtid; 2429 newval = (uval & FUTEX_OWNER_DIED) | newtid;
2429 2430
2430 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) 2431 err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
2431 goto handle_fault; 2432 if (err)
2433 goto handle_err;
2434
2432 if (curval == uval) 2435 if (curval == uval)
2433 break; 2436 break;
2434 uval = curval; 2437 uval = curval;
@@ -2456,23 +2459,37 @@ retry:
2456 return 0; 2459 return 0;
2457 2460
2458 /* 2461 /*
2459 * To handle the page fault we need to drop the locks here. That gives 2462 * In order to reschedule or handle a page fault, we need to drop the
2460 * the other task (either the highest priority waiter itself or the 2463 * locks here. In the case of a fault, this gives the other task
2461 * task which stole the rtmutex) the chance to try the fixup of the 2464 * (either the highest priority waiter itself or the task which stole
2462 * pi_state. So once we are back from handling the fault we need to 2465 * the rtmutex) the chance to try the fixup of the pi_state. So once we
2463 * check the pi_state after reacquiring the locks and before trying to 2466 * are back from handling the fault we need to check the pi_state after
2464 * do another fixup. When the fixup has been done already we simply 2467 * reacquiring the locks and before trying to do another fixup. When
2465 * return. 2468 * the fixup has been done already we simply return.
2466 * 2469 *
2467 * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely 2470 * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
2468 * drop hb->lock since the caller owns the hb -> futex_q relation. 2471 * drop hb->lock since the caller owns the hb -> futex_q relation.
2469 * Dropping the pi_mutex->wait_lock requires the state revalidate. 2472 * Dropping the pi_mutex->wait_lock requires the state revalidate.
2470 */ 2473 */
2471handle_fault: 2474handle_err:
2472 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); 2475 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
2473 spin_unlock(q->lock_ptr); 2476 spin_unlock(q->lock_ptr);
2474 2477
2475 ret = fault_in_user_writeable(uaddr); 2478 switch (err) {
2479 case -EFAULT:
2480 ret = fault_in_user_writeable(uaddr);
2481 break;
2482
2483 case -EAGAIN:
2484 cond_resched();
2485 ret = 0;
2486 break;
2487
2488 default:
2489 WARN_ON_ONCE(1);
2490 ret = err;
2491 break;
2492 }
2476 2493
2477 spin_lock(q->lock_ptr); 2494 spin_lock(q->lock_ptr);
2478 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); 2495 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
@@ -3041,10 +3058,8 @@ retry:
3041 * A unconditional UNLOCK_PI op raced against a waiter 3058 * A unconditional UNLOCK_PI op raced against a waiter
3042 * setting the FUTEX_WAITERS bit. Try again. 3059 * setting the FUTEX_WAITERS bit. Try again.
3043 */ 3060 */
3044 if (ret == -EAGAIN) { 3061 if (ret == -EAGAIN)
3045 put_futex_key(&key); 3062 goto pi_retry;
3046 goto retry;
3047 }
3048 /* 3063 /*
3049 * wake_futex_pi has detected invalid state. Tell user 3064 * wake_futex_pi has detected invalid state. Tell user
3050 * space. 3065 * space.
@@ -3059,9 +3074,19 @@ retry:
3059 * preserve the WAITERS bit not the OWNER_DIED one. We are the 3074 * preserve the WAITERS bit not the OWNER_DIED one. We are the
3060 * owner. 3075 * owner.
3061 */ 3076 */
3062 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) { 3077 if ((ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))) {
3063 spin_unlock(&hb->lock); 3078 spin_unlock(&hb->lock);
3064 goto pi_faulted; 3079 switch (ret) {
3080 case -EFAULT:
3081 goto pi_faulted;
3082
3083 case -EAGAIN:
3084 goto pi_retry;
3085
3086 default:
3087 WARN_ON_ONCE(1);
3088 goto out_putkey;
3089 }
3065 } 3090 }
3066 3091
3067 /* 3092 /*
@@ -3075,6 +3100,11 @@ out_putkey:
3075 put_futex_key(&key); 3100 put_futex_key(&key);
3076 return ret; 3101 return ret;
3077 3102
3103pi_retry:
3104 put_futex_key(&key);
3105 cond_resched();
3106 goto retry;
3107
3078pi_faulted: 3108pi_faulted:
3079 put_futex_key(&key); 3109 put_futex_key(&key);
3080 3110
@@ -3435,47 +3465,67 @@ err_unlock:
3435static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi) 3465static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
3436{ 3466{
3437 u32 uval, uninitialized_var(nval), mval; 3467 u32 uval, uninitialized_var(nval), mval;
3468 int err;
3469
3470 /* Futex address must be 32bit aligned */
3471 if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0)
3472 return -1;
3438 3473
3439retry: 3474retry:
3440 if (get_user(uval, uaddr)) 3475 if (get_user(uval, uaddr))
3441 return -1; 3476 return -1;
3442 3477
3443 if ((uval & FUTEX_TID_MASK) == task_pid_vnr(curr)) { 3478 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr))
3444 /* 3479 return 0;
3445 * Ok, this dying thread is truly holding a futex 3480
3446 * of interest. Set the OWNER_DIED bit atomically 3481 /*
3447 * via cmpxchg, and if the value had FUTEX_WAITERS 3482 * Ok, this dying thread is truly holding a futex
3448 * set, wake up a waiter (if any). (We have to do a 3483 * of interest. Set the OWNER_DIED bit atomically
3449 * futex_wake() even if OWNER_DIED is already set - 3484 * via cmpxchg, and if the value had FUTEX_WAITERS
3450 * to handle the rare but possible case of recursive 3485 * set, wake up a waiter (if any). (We have to do a
3451 * thread-death.) The rest of the cleanup is done in 3486 * futex_wake() even if OWNER_DIED is already set -
3452 * userspace. 3487 * to handle the rare but possible case of recursive
3453 */ 3488 * thread-death.) The rest of the cleanup is done in
3454 mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; 3489 * userspace.
3455 /* 3490 */
3456 * We are not holding a lock here, but we want to have 3491 mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
3457 * the pagefault_disable/enable() protection because 3492
3458 * we want to handle the fault gracefully. If the 3493 /*
3459 * access fails we try to fault in the futex with R/W 3494 * We are not holding a lock here, but we want to have
3460 * verification via get_user_pages. get_user() above 3495 * the pagefault_disable/enable() protection because
3461 * does not guarantee R/W access. If that fails we 3496 * we want to handle the fault gracefully. If the
3462 * give up and leave the futex locked. 3497 * access fails we try to fault in the futex with R/W
3463 */ 3498 * verification via get_user_pages. get_user() above
3464 if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) { 3499 * does not guarantee R/W access. If that fails we
3500 * give up and leave the futex locked.
3501 */
3502 if ((err = cmpxchg_futex_value_locked(&nval, uaddr, uval, mval))) {
3503 switch (err) {
3504 case -EFAULT:
3465 if (fault_in_user_writeable(uaddr)) 3505 if (fault_in_user_writeable(uaddr))
3466 return -1; 3506 return -1;
3467 goto retry; 3507 goto retry;
3468 } 3508
3469 if (nval != uval) 3509 case -EAGAIN:
3510 cond_resched();
3470 goto retry; 3511 goto retry;
3471 3512
3472 /* 3513 default:
3473 * Wake robust non-PI futexes here. The wakeup of 3514 WARN_ON_ONCE(1);
3474 * PI futexes happens in exit_pi_state(): 3515 return err;
3475 */ 3516 }
3476 if (!pi && (uval & FUTEX_WAITERS))
3477 futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
3478 } 3517 }
3518
3519 if (nval != uval)
3520 goto retry;
3521
3522 /*
3523 * Wake robust non-PI futexes here. The wakeup of
3524 * PI futexes happens in exit_pi_state():
3525 */
3526 if (!pi && (uval & FUTEX_WAITERS))
3527 futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
3528
3479 return 0; 3529 return 0;
3480} 3530}
3481 3531
diff --git a/kernel/gen_ikh_data.sh b/kernel/gen_ikh_data.sh
new file mode 100755
index 000000000000..591a94f7b387
--- /dev/null
+++ b/kernel/gen_ikh_data.sh
@@ -0,0 +1,89 @@
1#!/bin/bash
2# SPDX-License-Identifier: GPL-2.0
3
4# This script generates an archive consisting of kernel headers
5# for CONFIG_IKHEADERS_PROC.
6set -e
7spath="$(dirname "$(readlink -f "$0")")"
8kroot="$spath/.."
9outdir="$(pwd)"
10tarfile=$1
11cpio_dir=$outdir/$tarfile.tmp
12
13# Script filename relative to the kernel source root
14# We add it to the archive because it is small and any changes
15# to this script will also cause a rebuild of the archive.
16sfile="$(realpath --relative-to $kroot "$(readlink -f "$0")")"
17
18src_file_list="
19include/
20arch/$SRCARCH/include/
21$sfile
22"
23
24obj_file_list="
25include/
26arch/$SRCARCH/include/
27"
28
29# Support incremental builds by skipping archive generation
30# if timestamps of files being archived are not changed.
31
32# This block is useful for debugging the incremental builds.
33# Uncomment it for debugging.
34# iter=1
35# if [ ! -f /tmp/iter ]; then echo 1 > /tmp/iter;
36# else; iter=$(($(cat /tmp/iter) + 1)); fi
37# find $src_file_list -type f | xargs ls -lR > /tmp/src-ls-$iter
38# find $obj_file_list -type f | xargs ls -lR > /tmp/obj-ls-$iter
39
40# include/generated/compile.h is ignored because it is touched even when none
41# of the source files changed. This causes pointless regeneration, so let us
42# ignore them for md5 calculation.
43pushd $kroot > /dev/null
44src_files_md5="$(find $src_file_list -type f |
45 grep -v "include/generated/compile.h" |
46 xargs ls -lR | md5sum | cut -d ' ' -f1)"
47popd > /dev/null
48obj_files_md5="$(find $obj_file_list -type f |
49 grep -v "include/generated/compile.h" |
50 xargs ls -lR | md5sum | cut -d ' ' -f1)"
51
52if [ -f $tarfile ]; then tarfile_md5="$(md5sum $tarfile | cut -d ' ' -f1)"; fi
53if [ -f kernel/kheaders.md5 ] &&
54 [ "$(cat kernel/kheaders.md5|head -1)" == "$src_files_md5" ] &&
55 [ "$(cat kernel/kheaders.md5|head -2|tail -1)" == "$obj_files_md5" ] &&
56 [ "$(cat kernel/kheaders.md5|tail -1)" == "$tarfile_md5" ]; then
57 exit
58fi
59
60if [ "${quiet}" != "silent_" ]; then
61 echo " GEN $tarfile"
62fi
63
64rm -rf $cpio_dir
65mkdir $cpio_dir
66
67pushd $kroot > /dev/null
68for f in $src_file_list;
69 do find "$f" ! -name "*.cmd" ! -name ".*";
70done | cpio --quiet -pd $cpio_dir
71popd > /dev/null
72
73# The second CPIO can complain if files already exist which can
74# happen with out of tree builds. Just silence CPIO for now.
75for f in $obj_file_list;
76 do find "$f" ! -name "*.cmd" ! -name ".*";
77done | cpio --quiet -pd $cpio_dir >/dev/null 2>&1
78
 79# Remove comments except SPDX lines
80find $cpio_dir -type f -print0 |
81 xargs -0 -P8 -n1 perl -pi -e 'BEGIN {undef $/;}; s/\/\*((?!SPDX).)*?\*\///smg;'
82
83tar -Jcf $tarfile -C $cpio_dir/ . > /dev/null
84
85echo "$src_files_md5" > kernel/kheaders.md5
86echo "$obj_files_md5" >> kernel/kheaders.md5
87echo "$(md5sum $tarfile | cut -d ' ' -f1)" >> kernel/kheaders.md5
88
89rm -rf $cpio_dir
diff --git a/kernel/iomem.c b/kernel/iomem.c
index f7525e14ebc6..93c264444510 100644
--- a/kernel/iomem.c
+++ b/kernel/iomem.c
@@ -55,7 +55,7 @@ static void *try_ram_remap(resource_size_t offset, size_t size,
55 * 55 *
56 * MEMREMAP_WB - matches the default mapping for System RAM on 56 * MEMREMAP_WB - matches the default mapping for System RAM on
57 * the architecture. This is usually a read-allocate write-back cache. 57 * the architecture. This is usually a read-allocate write-back cache.
58 * Morever, if MEMREMAP_WB is specified and the requested remap region is RAM 58 * Moreover, if MEMREMAP_WB is specified and the requested remap region is RAM
59 * memremap() will bypass establishing a new mapping and instead return 59 * memremap() will bypass establishing a new mapping and instead return
60 * a pointer into the direct map. 60 * a pointer into the direct map.
61 * 61 *
@@ -86,7 +86,7 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags)
86 /* Try all mapping types requested until one returns non-NULL */ 86 /* Try all mapping types requested until one returns non-NULL */
87 if (flags & MEMREMAP_WB) { 87 if (flags & MEMREMAP_WB) {
88 /* 88 /*
89 * MEMREMAP_WB is special in that it can be satisifed 89 * MEMREMAP_WB is special in that it can be satisfied
90 * from the direct map. Some archs depend on the 90 * from the direct map. Some archs depend on the
91 * capability of memremap() to autodetect cases where 91 * capability of memremap() to autodetect cases where
92 * the requested range is potentially in System RAM. 92 * the requested range is potentially in System RAM.
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 3faef4a77f71..51128bea3846 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -1449,6 +1449,10 @@ int irq_chip_set_vcpu_affinity_parent(struct irq_data *data, void *vcpu_info)
1449int irq_chip_set_wake_parent(struct irq_data *data, unsigned int on) 1449int irq_chip_set_wake_parent(struct irq_data *data, unsigned int on)
1450{ 1450{
1451 data = data->parent_data; 1451 data = data->parent_data;
1452
1453 if (data->chip->flags & IRQCHIP_SKIP_SET_WAKE)
1454 return 0;
1455
1452 if (data->chip->irq_set_wake) 1456 if (data->chip->irq_set_wake)
1453 return data->chip->irq_set_wake(data, on); 1457 return data->chip->irq_set_wake(data, on);
1454 1458
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index 516c00a5e867..c1eccd4f6520 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -152,7 +152,7 @@ static int irq_debug_show(struct seq_file *m, void *p)
152 152
153 raw_spin_lock_irq(&desc->lock); 153 raw_spin_lock_irq(&desc->lock);
154 data = irq_desc_get_irq_data(desc); 154 data = irq_desc_get_irq_data(desc);
155 seq_printf(m, "handler: %pf\n", desc->handle_irq); 155 seq_printf(m, "handler: %ps\n", desc->handle_irq);
156 seq_printf(m, "device: %s\n", desc->dev_name); 156 seq_printf(m, "device: %s\n", desc->dev_name);
157 seq_printf(m, "status: 0x%08x\n", desc->status_use_accessors); 157 seq_printf(m, "status: 0x%08x\n", desc->status_use_accessors);
158 irq_debug_show_bits(m, 0, desc->status_use_accessors, irqdesc_states, 158 irq_debug_show_bits(m, 0, desc->status_use_accessors, irqdesc_states,
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index 5d5378ea0afe..f6e5515ee077 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -84,8 +84,6 @@ EXPORT_SYMBOL(devm_request_threaded_irq);
84 * @dev: device to request interrupt for 84 * @dev: device to request interrupt for
85 * @irq: Interrupt line to allocate 85 * @irq: Interrupt line to allocate
86 * @handler: Function to be called when the IRQ occurs 86 * @handler: Function to be called when the IRQ occurs
87 * @thread_fn: function to be called in a threaded interrupt context. NULL
88 * for devices which handle everything in @handler
89 * @irqflags: Interrupt type flags 87 * @irqflags: Interrupt type flags
90 * @devname: An ascii name for the claiming device, dev_name(dev) if NULL 88 * @devname: An ascii name for the claiming device, dev_name(dev) if NULL
91 * @dev_id: A cookie passed back to the handler function 89 * @dev_id: A cookie passed back to the handler function
@@ -222,9 +220,8 @@ devm_irq_alloc_generic_chip(struct device *dev, const char *name, int num_ct,
222 irq_flow_handler_t handler) 220 irq_flow_handler_t handler)
223{ 221{
224 struct irq_chip_generic *gc; 222 struct irq_chip_generic *gc;
225 unsigned long sz = sizeof(*gc) + num_ct * sizeof(struct irq_chip_type);
226 223
227 gc = devm_kzalloc(dev, sz, GFP_KERNEL); 224 gc = devm_kzalloc(dev, struct_size(gc, chip_types, num_ct), GFP_KERNEL);
228 if (gc) 225 if (gc)
229 irq_init_generic_chip(gc, name, num_ct, 226 irq_init_generic_chip(gc, name, num_ct,
230 irq_base, reg_base, handler); 227 irq_base, reg_base, handler);
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 6df5ddfdb0f8..a4ace611f47f 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -149,7 +149,7 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags
149 res = action->handler(irq, action->dev_id); 149 res = action->handler(irq, action->dev_id);
150 trace_irq_handler_exit(irq, action, res); 150 trace_irq_handler_exit(irq, action, res);
151 151
152 if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pF enabled interrupts\n", 152 if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pS enabled interrupts\n",
153 irq, action->handler)) 153 irq, action->handler))
154 local_irq_disable(); 154 local_irq_disable();
155 155
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 13539e12cd80..c52b737ab8e3 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -275,11 +275,12 @@ static struct attribute *irq_attrs[] = {
275 &actions_attr.attr, 275 &actions_attr.attr,
276 NULL 276 NULL
277}; 277};
278ATTRIBUTE_GROUPS(irq);
278 279
279static struct kobj_type irq_kobj_type = { 280static struct kobj_type irq_kobj_type = {
280 .release = irq_kobj_release, 281 .release = irq_kobj_release,
281 .sysfs_ops = &kobj_sysfs_ops, 282 .sysfs_ops = &kobj_sysfs_ops,
282 .default_attrs = irq_attrs, 283 .default_groups = irq_groups,
283}; 284};
284 285
285static void irq_sysfs_add(int irq, struct irq_desc *desc) 286static void irq_sysfs_add(int irq, struct irq_desc *desc)
@@ -558,6 +559,7 @@ int __init early_irq_init(void)
558 alloc_masks(&desc[i], node); 559 alloc_masks(&desc[i], node);
559 raw_spin_lock_init(&desc[i].lock); 560 raw_spin_lock_init(&desc[i].lock);
560 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); 561 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
562 mutex_init(&desc[i].request_mutex);
561 desc_set_defaults(i, &desc[i], node, NULL, NULL); 563 desc_set_defaults(i, &desc[i], node, NULL, NULL);
562 } 564 }
563 return arch_early_irq_init(); 565 return arch_early_irq_init();
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 9ec34a2a6638..78f3ddeb7fe4 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -196,6 +196,7 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
196 case IRQ_SET_MASK_OK: 196 case IRQ_SET_MASK_OK:
197 case IRQ_SET_MASK_OK_DONE: 197 case IRQ_SET_MASK_OK_DONE:
198 cpumask_copy(desc->irq_common_data.affinity, mask); 198 cpumask_copy(desc->irq_common_data.affinity, mask);
199 /* fall through */
199 case IRQ_SET_MASK_OK_NOCOPY: 200 case IRQ_SET_MASK_OK_NOCOPY:
200 irq_validate_effective_affinity(data); 201 irq_validate_effective_affinity(data);
201 irq_set_thread_affinity(desc); 202 irq_set_thread_affinity(desc);
@@ -356,8 +357,10 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
356 desc->affinity_notify = notify; 357 desc->affinity_notify = notify;
357 raw_spin_unlock_irqrestore(&desc->lock, flags); 358 raw_spin_unlock_irqrestore(&desc->lock, flags);
358 359
359 if (old_notify) 360 if (old_notify) {
361 cancel_work_sync(&old_notify->work);
360 kref_put(&old_notify->kref, old_notify->release); 362 kref_put(&old_notify->kref, old_notify->release);
363 }
361 364
362 return 0; 365 return 0;
363} 366}
@@ -778,7 +781,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned long flags)
778 ret = 0; 781 ret = 0;
779 break; 782 break;
780 default: 783 default:
781 pr_err("Setting trigger mode %lu for irq %u failed (%pF)\n", 784 pr_err("Setting trigger mode %lu for irq %u failed (%pS)\n",
782 flags, irq_desc_get_irq(desc), chip->irq_set_type); 785 flags, irq_desc_get_irq(desc), chip->irq_set_type);
783 } 786 }
784 if (unmask) 787 if (unmask)
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 6d2fa6914b30..2ed97a7c9b2a 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -212,9 +212,9 @@ static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret)
212 */ 212 */
213 raw_spin_lock_irqsave(&desc->lock, flags); 213 raw_spin_lock_irqsave(&desc->lock, flags);
214 for_each_action_of_desc(desc, action) { 214 for_each_action_of_desc(desc, action) {
215 printk(KERN_ERR "[<%p>] %pf", action->handler, action->handler); 215 printk(KERN_ERR "[<%p>] %ps", action->handler, action->handler);
216 if (action->thread_fn) 216 if (action->thread_fn)
217 printk(KERN_CONT " threaded [<%p>] %pf", 217 printk(KERN_CONT " threaded [<%p>] %ps",
218 action->thread_fn, action->thread_fn); 218 action->thread_fn, action->thread_fn);
219 printk(KERN_CONT "\n"); 219 printk(KERN_CONT "\n");
220 } 220 }
diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c
index 1e4cb63a5c82..90c735da15d0 100644
--- a/kernel/irq/timings.c
+++ b/kernel/irq/timings.c
@@ -9,6 +9,7 @@
9#include <linux/idr.h> 9#include <linux/idr.h>
10#include <linux/irq.h> 10#include <linux/irq.h>
11#include <linux/math64.h> 11#include <linux/math64.h>
12#include <linux/log2.h>
12 13
13#include <trace/events/irq.h> 14#include <trace/events/irq.h>
14 15
@@ -18,16 +19,6 @@ DEFINE_STATIC_KEY_FALSE(irq_timing_enabled);
18 19
19DEFINE_PER_CPU(struct irq_timings, irq_timings); 20DEFINE_PER_CPU(struct irq_timings, irq_timings);
20 21
21struct irqt_stat {
22 u64 next_evt;
23 u64 last_ts;
24 u64 variance;
25 u32 avg;
26 u32 nr_samples;
27 int anomalies;
28 int valid;
29};
30
31static DEFINE_IDR(irqt_stats); 22static DEFINE_IDR(irqt_stats);
32 23
33void irq_timings_enable(void) 24void irq_timings_enable(void)
@@ -40,75 +31,360 @@ void irq_timings_disable(void)
40 static_branch_disable(&irq_timing_enabled); 31 static_branch_disable(&irq_timing_enabled);
41} 32}
42 33
43/** 34/*
44 * irqs_update - update the irq timing statistics with a new timestamp 35 * The main goal of this algorithm is to predict the next interrupt
36 * occurrence on the current CPU.
37 *
38 * Currently, the interrupt timings are stored in a circular array
39 * buffer every time there is an interrupt, as a tuple: the interrupt
40 * number and the associated timestamp when the event occurred <irq,
41 * timestamp>.
42 *
43 * For every interrupt occurring in a short period of time, we can
44 * measure the elapsed time between the occurrences for the same
 45 * interrupt and we end up with a suite of intervals. Experience
 46 * has shown that interrupts often follow a periodic
 47 * pattern.
48 *
49 * The objective of the algorithm is to find out this periodic pattern
 50 * as fast as possible and use its period to predict the next irq event.
51 *
52 * When the next interrupt event is requested, we are in the situation
53 * where the interrupts are disabled and the circular buffer
54 * containing the timings is filled with the events which happened
55 * after the previous next-interrupt-event request.
56 *
57 * At this point, we read the circular buffer and we fill the irq
58 * related statistics structure. After this step, the circular array
59 * containing the timings is empty because all the values are
60 * dispatched in their corresponding buffers.
61 *
62 * Now for each interrupt, we can predict the next event by using the
 63 * suffix array, log interval and exponential moving average.
64 *
65 * 1. Suffix array
66 *
67 * Suffix array is an array of all the suffixes of a string. It is
68 * widely used as a data structure for compression, text search, ...
69 * For instance for the word 'banana', the suffixes will be: 'banana'
70 * 'anana' 'nana' 'ana' 'na' 'a'
71 *
72 * Usually, the suffix array is sorted but for our purpose it is
73 * not necessary and won't provide any improvement in the context of
74 * the solved problem where we clearly define the boundaries of the
75 * search by a max period and min period.
76 *
77 * The suffix array will build a suite of intervals of different
78 * length and will look for the repetition of each suite. If the suite
79 * is repeating then we have the period because it is the length of
80 * the suite whatever its position in the buffer.
81 *
82 * 2. Log interval
83 *
 84 * We saw the irq timings allow us to compute the interval of the
 85 * occurrences for a specific interrupt. We can reasonably assume the
 86 * longer the interval, the higher the error for the next event,
87 * and we can consider storing those interval values into an array
 88 * where each slot in the array corresponds to an interval at the power
89 * of 2 of the index. For example, index 12 will contain values
90 * between 2^11 and 2^12.
91 *
 92 * At the end we have an array of values where each index defines a
 93 * [2^(index - 1), 2^index] interval of values, allowing us to store a large
94 * number of values inside a small array.
95 *
96 * For example, if we have the value 1123, then we store it at
 97 * index ilog2(1123) = 10.
98 *
 99 * Storing those values at the specific index is done by computing an
 100 * exponential moving average for this specific slot. For instance,
 101 * the values 1800, 1123, 1453, ... all fall under the same slot (10) and
102 * the exponential moving average is computed every time a new value
103 * is stored at this slot.
104 *
105 * 3. Exponential Moving Average
106 *
107 * The EMA is largely used to track a signal for stocks or as a low
 108 * pass filter. The magic of the formula is that it is very simple and the
 109 * reactivity of the average can be tuned with a factor called
 110 * alpha.
111 *
 112 * The higher the alpha is, the faster the average responds to the
113 * signal change. In our case, if a slot in the array is a big
114 * interval, we can have numbers with a big difference between
115 * them. The impact of those differences in the average computation
116 * can be tuned by changing the alpha value.
117 *
118 *
119 * -- The algorithm --
120 *
121 * We saw the different processing above, now let's see how they are
122 * used together.
123 *
124 * For each interrupt:
125 * For each interval:
126 * Compute the index = ilog2(interval)
127 * Compute a new_ema(buffer[index], interval)
128 * Store the index in a circular buffer
129 *
130 * Compute the suffix array of the indexes
131 *
132 * For each suffix:
133 * If the suffix is reverse-found 3 times
134 * Return suffix
135 *
136 * Return Not found
137 *
 138 * However, we cannot build an endless suffix array; it would not
 139 * make sense and would add extra overhead, so we restrict
 140 * this to a maximum suffix length of 5 and a minimum suffix length of
 141 * 2. Experience has shown that 5 covers the maximum pattern
 142 * period found for most devices.
143 *
 144 * The result is that finding a pattern for an interrupt takes less than 1us.
45 * 145 *
46 * @irqs: an irqt_stat struct pointer 146 * Example based on real values:
47 * @ts: the new timestamp
48 * 147 *
49 * The statistics are computed online, in other words, the code is 148 * Example 1 : MMC write/read interrupt interval:
50 * designed to compute the statistics on a stream of values rather
51 * than doing multiple passes on the values to compute the average,
52 * then the variance. The integer division introduces a loss of
53 * precision but with an acceptable error margin regarding the results
54 * we would have with the double floating precision: we are dealing
55 * with nanosec, so big numbers, consequently the mantisse is
56 * negligeable, especially when converting the time in usec
57 * afterwards.
58 * 149 *
59 * The computation happens at idle time. When the CPU is not idle, the 150 * 223947, 1240, 1384, 1386, 1386,
60 * interrupts' timestamps are stored in the circular buffer, when the 151 * 217416, 1236, 1384, 1386, 1387,
61 * CPU goes idle and this routine is called, all the buffer's values 152 * 214719, 1241, 1386, 1387, 1384,
62 * are injected in the statistical model continuying to extend the 153 * 213696, 1234, 1384, 1386, 1388,
63 * statistics from the previous busy-idle cycle. 154 * 219904, 1240, 1385, 1389, 1385,
155 * 212240, 1240, 1386, 1386, 1386,
156 * 214415, 1236, 1384, 1386, 1387,
157 * 214276, 1234, 1384, 1388, ?
64 * 158 *
65 * The observations showed a device will trigger a burst of periodic 159 * For each element, apply ilog2(value)
66 * interrupts followed by one or two peaks of longer time, for
67 * instance when a SD card device flushes its cache, then the periodic
68 * intervals occur again. A one second inactivity period resets the
69 * stats, that gives us the certitude the statistical values won't
70 * exceed 1x10^9, thus the computation won't overflow.
71 * 160 *
72 * Basically, the purpose of the algorithm is to watch the periodic 161 * 15, 8, 8, 8, 8,
73 * interrupts and eliminate the peaks. 162 * 15, 8, 8, 8, 8,
163 * 15, 8, 8, 8, 8,
164 * 15, 8, 8, 8, 8,
165 * 15, 8, 8, 8, 8,
166 * 15, 8, 8, 8, 8,
167 * 15, 8, 8, 8, 8,
168 * 15, 8, 8, 8, ?
74 * 169 *
75 * An interrupt is considered periodically stable if the interval of 170 * Max period of 5, we take the last (max_period * 3) 15 elements as
76 * its occurences follow the normal distribution, thus the values 171 * we can be confident if the pattern repeats itself three times it is
77 * comply with: 172 * a repeating pattern.
78 * 173 *
79 * avg - 3 x stddev < value < avg + 3 x stddev 174 * 8,
175 * 15, 8, 8, 8, 8,
176 * 15, 8, 8, 8, 8,
177 * 15, 8, 8, 8, ?
80 * 178 *
81 * Which can be simplified to: 179 * Suffixes are:
82 * 180 *
83 * -3 x stddev < value - avg < 3 x stddev 181 * 1) 8, 15, 8, 8, 8 <- max period
182 * 2) 8, 15, 8, 8
183 * 3) 8, 15, 8
184 * 4) 8, 15 <- min period
84 * 185 *
85 * abs(value - avg) < 3 x stddev 186 * From there we search the repeating pattern for each suffix.
86 * 187 *
87 * In order to save a costly square root computation, we use the 188 * buffer: 8, 15, 8, 8, 8, 8, 15, 8, 8, 8, 8, 15, 8, 8, 8
88 * variance. For the record, stddev = sqrt(variance). The equation 189 * | | | | | | | | | | | | | | |
89 * above becomes: 190 * 8, 15, 8, 8, 8 | | | | | | | | | |
191 * 8, 15, 8, 8, 8 | | | | |
192 * 8, 15, 8, 8, 8
90 * 193 *
91 * abs(value - avg) < 3 x sqrt(variance) 194 * When moving the suffix, we found exactly 3 matches.
92 * 195 *
93 * And finally we square it: 196 * The first suffix with period 5 is repeating.
94 * 197 *
95 * (value - avg) ^ 2 < (3 x sqrt(variance)) ^ 2 198 * The next event is (3 * max_period) % suffix_period
96 * 199 *
97 * (value - avg) x (value - avg) < 9 x variance 200 * In this example, the result 0, so the next event is suffix[0] => 8
98 * 201 *
99 * Statistically speaking, any values out of this interval is 202 * However, 8 is the index in the array of exponential moving average
100 * considered as an anomaly and is discarded. However, a normal 203 * which was calculated on the fly when storing the values, so the
101 * distribution appears when the number of samples is 30 (it is the 204 * interval is ema[8] = 1366
102 * rule of thumb in statistics, cf. "30 samples" on Internet). When
103 * there are three consecutive anomalies, the statistics are resetted.
104 * 205 *
206 *
207 * Example 2:
208 *
209 * 4, 3, 5, 100,
210 * 3, 3, 5, 117,
211 * 4, 4, 5, 112,
212 * 4, 3, 4, 110,
213 * 3, 5, 3, 117,
214 * 4, 4, 5, 112,
215 * 4, 3, 4, 110,
216 * 3, 4, 5, 112,
217 * 4, 3, 4, 110
218 *
219 * ilog2
220 *
221 * 0, 0, 0, 4,
222 * 0, 0, 0, 4,
223 * 0, 0, 0, 4,
224 * 0, 0, 0, 4,
225 * 0, 0, 0, 4,
226 * 0, 0, 0, 4,
227 * 0, 0, 0, 4,
228 * 0, 0, 0, 4,
229 * 0, 0, 0, 4
230 *
231 * Max period 5:
232 * 0, 0, 4,
233 * 0, 0, 0, 4,
234 * 0, 0, 0, 4,
235 * 0, 0, 0, 4
236 *
237 * Suffixes:
238 *
239 * 1) 0, 0, 4, 0, 0
240 * 2) 0, 0, 4, 0
241 * 3) 0, 0, 4
242 * 4) 0, 0
243 *
244 * buffer: 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4
245 * | | | | | | X
246 * 0, 0, 4, 0, 0, | X
247 * 0, 0
248 *
249 * buffer: 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4
250 * | | | | | | | | | | | | | | |
251 * 0, 0, 4, 0, | | | | | | | | | | |
252 * 0, 0, 4, 0, | | | | | | |
253 * 0, 0, 4, 0, | | |
254 * 0 0 4
255 *
 256 * The pattern is found 3 times, and the remainder is 1, which results
 257 * from (max_period * 3) % suffix_period. This value is the index in the
 258 * suffix array. The suffix array for a period of 4 has the value 4
259 * at index 1.
260 */
261#define EMA_ALPHA_VAL 64
262#define EMA_ALPHA_SHIFT 7
263
264#define PREDICTION_PERIOD_MIN 2
265#define PREDICTION_PERIOD_MAX 5
266#define PREDICTION_FACTOR 4
267#define PREDICTION_MAX 10 /* 2 ^ PREDICTION_MAX useconds */
268#define PREDICTION_BUFFER_SIZE 16 /* slots for EMAs, hardly more than 16 */
269
270struct irqt_stat {
271 u64 last_ts;
272 u64 ema_time[PREDICTION_BUFFER_SIZE];
273 int timings[IRQ_TIMINGS_SIZE];
274 int circ_timings[IRQ_TIMINGS_SIZE];
275 int count;
276};
277
278/*
279 * Exponential moving average computation
105 */ 280 */
106static void irqs_update(struct irqt_stat *irqs, u64 ts) 281static u64 irq_timings_ema_new(u64 value, u64 ema_old)
282{
283 s64 diff;
284
285 if (unlikely(!ema_old))
286 return value;
287
288 diff = (value - ema_old) * EMA_ALPHA_VAL;
289 /*
290 * We can use a s64 type variable to be added with the u64
291 * ema_old variable as this one will never have its topmost
292 * bit set, it will be always smaller than 2^63 nanosec
293 * interrupt interval (292 years).
294 */
295 return ema_old + (diff >> EMA_ALPHA_SHIFT);
296}
297
298static int irq_timings_next_event_index(int *buffer, size_t len, int period_max)
299{
300 int i;
301
302 /*
 303 * The buffer contains the suite of intervals on an ilog2
 304 * basis, and we are looking for a repetition. We point the
 305 * beginning of the search at three times the length of the
 306 * period, counting back from the end of the buffer. We do that
 307 * for each suffix.
308 */
309 for (i = period_max; i >= PREDICTION_PERIOD_MIN ; i--) {
310
311 int *begin = &buffer[len - (i * 3)];
312 int *ptr = begin;
313
314 /*
 315 * We check whether the suite with period 'i' repeats
 316 * itself. If it is truncated at the end, since it
 317 * repeats we can use the period to find out the next
 318 * element.
319 */
320 while (!memcmp(ptr, begin, i * sizeof(*ptr))) {
321 ptr += i;
322 if (ptr >= &buffer[len])
323 return begin[((i * 3) % i)];
324 }
325 }
326
327 return -1;
328}
329
330static u64 __irq_timings_next_event(struct irqt_stat *irqs, int irq, u64 now)
331{
332 int index, i, period_max, count, start, min = INT_MAX;
333
334 if ((now - irqs->last_ts) >= NSEC_PER_SEC) {
335 irqs->count = irqs->last_ts = 0;
336 return U64_MAX;
337 }
338
339 /*
 340 * As we want to find the repetition three times, we need a
 341 * number of intervals greater than or equal to three times the
 342 * maximum period, otherwise we truncate the max period.
343 */
344 period_max = irqs->count > (3 * PREDICTION_PERIOD_MAX) ?
345 PREDICTION_PERIOD_MAX : irqs->count / 3;
346
347 /*
348 * If we don't have enough irq timings for this prediction,
349 * just bail out.
350 */
351 if (period_max <= PREDICTION_PERIOD_MIN)
352 return U64_MAX;
353
354 /*
 355 * 'count' depends on whether the circular buffer wrapped or not
356 */
357 count = irqs->count < IRQ_TIMINGS_SIZE ?
358 irqs->count : IRQ_TIMINGS_SIZE;
359
360 start = irqs->count < IRQ_TIMINGS_SIZE ?
361 0 : (irqs->count & IRQ_TIMINGS_MASK);
362
363 /*
364 * Copy the content of the circular buffer into another buffer
365 * in order to linearize the buffer instead of dealing with
 366 * wrapping indexes and a shifted array, which would be prone to
 367 * error and extremely difficult to debug.
368 */
369 for (i = 0; i < count; i++) {
370 int index = (start + i) & IRQ_TIMINGS_MASK;
371
372 irqs->timings[i] = irqs->circ_timings[index];
373 min = min_t(int, irqs->timings[i], min);
374 }
375
376 index = irq_timings_next_event_index(irqs->timings, count, period_max);
377 if (index < 0)
378 return irqs->last_ts + irqs->ema_time[min];
379
380 return irqs->last_ts + irqs->ema_time[index];
381}
382
383static inline void irq_timings_store(int irq, struct irqt_stat *irqs, u64 ts)
107{ 384{
108 u64 old_ts = irqs->last_ts; 385 u64 old_ts = irqs->last_ts;
109 u64 variance = 0;
110 u64 interval; 386 u64 interval;
111 s64 diff; 387 int index;
112 388
113 /* 389 /*
114 * The timestamps are absolute time values, we need to compute 390 * The timestamps are absolute time values, we need to compute
@@ -135,87 +411,28 @@ static void irqs_update(struct irqt_stat *irqs, u64 ts)
135 * want as we need another timestamp to compute an interval. 411 * want as we need another timestamp to compute an interval.
136 */ 412 */
137 if (interval >= NSEC_PER_SEC) { 413 if (interval >= NSEC_PER_SEC) {
138 memset(irqs, 0, sizeof(*irqs)); 414 irqs->count = 0;
139 irqs->last_ts = ts;
140 return; 415 return;
141 } 416 }
142 417
143 /* 418 /*
144 * Pre-compute the delta with the average as the result is 419 * Get the index in the ema table for this interrupt. The
145 * used several times in this function. 420 * PREDICTION_FACTOR increase the interval size for the array
146 */ 421 * of exponential average.
147 diff = interval - irqs->avg;
148
149 /*
150 * Increment the number of samples.
151 */
152 irqs->nr_samples++;
153
154 /*
155 * Online variance divided by the number of elements if there
156 * is more than one sample. Normally the formula is division
157 * by nr_samples - 1 but we assume the number of element will be
158 * more than 32 and dividing by 32 instead of 31 is enough
159 * precise.
160 */
161 if (likely(irqs->nr_samples > 1))
162 variance = irqs->variance >> IRQ_TIMINGS_SHIFT;
163
164 /*
165 * The rule of thumb in statistics for the normal distribution
166 * is having at least 30 samples in order to have the model to
167 * apply. Values outside the interval are considered as an
168 * anomaly.
169 */
170 if ((irqs->nr_samples >= 30) && ((diff * diff) > (9 * variance))) {
171 /*
172 * After three consecutive anomalies, we reset the
173 * stats as it is no longer stable enough.
174 */
175 if (irqs->anomalies++ >= 3) {
176 memset(irqs, 0, sizeof(*irqs));
177 irqs->last_ts = ts;
178 return;
179 }
180 } else {
181 /*
182 * The anomalies must be consecutives, so at this
183 * point, we reset the anomalies counter.
184 */
185 irqs->anomalies = 0;
186 }
187
188 /*
189 * The interrupt is considered stable enough to try to predict
190 * the next event on it.
191 */ 422 */
192 irqs->valid = 1; 423 index = likely(interval) ?
424 ilog2((interval >> 10) / PREDICTION_FACTOR) : 0;
193 425
194 /* 426 /*
195 * Online average algorithm: 427 * Store the index as an element of the pattern in another
196 * 428 * circular array.
197 * new_average = average + ((value - average) / count)
198 *
199 * The variance computation depends on the new average
200 * to be computed here first.
201 *
202 */ 429 */
203 irqs->avg = irqs->avg + (diff >> IRQ_TIMINGS_SHIFT); 430 irqs->circ_timings[irqs->count & IRQ_TIMINGS_MASK] = index;
204 431
205 /* 432 irqs->ema_time[index] = irq_timings_ema_new(interval,
206 * Online variance algorithm: 433 irqs->ema_time[index]);
207 *
208 * new_variance = variance + (value - average) x (value - new_average)
209 *
210 * Warning: irqs->avg is updated with the line above, hence
211 * 'interval - irqs->avg' is no longer equal to 'diff'
212 */
213 irqs->variance = irqs->variance + (diff * (interval - irqs->avg));
214 434
215 /* 435 irqs->count++;
216 * Update the next event
217 */
218 irqs->next_evt = ts + irqs->avg;
219} 436}
220 437
221/** 438/**
@@ -259,6 +476,9 @@ u64 irq_timings_next_event(u64 now)
259 */ 476 */
260 lockdep_assert_irqs_disabled(); 477 lockdep_assert_irqs_disabled();
261 478
479 if (!irqts->count)
480 return next_evt;
481
262 /* 482 /*
263	 * Number of elements in the circular buffer: If it happens it 483	 * Number of elements in the circular buffer: If it happens it
264	 * was flushed before, then the number of elements could be 484	 * was flushed before, then the number of elements could be
@@ -269,21 +489,19 @@ u64 irq_timings_next_event(u64 now)
269 * type but with the cost of extra computation in the 489 * type but with the cost of extra computation in the
270 * interrupt handler hot path. We choose efficiency. 490 * interrupt handler hot path. We choose efficiency.
271 * 491 *
272 * Inject measured irq/timestamp to the statistical model 492 * Inject measured irq/timestamp to the pattern prediction
273 * while decrementing the counter because we consume the data 493 * model while decrementing the counter because we consume the
274 * from our circular buffer. 494 * data from our circular buffer.
275 */ 495 */
276 for (i = irqts->count & IRQ_TIMINGS_MASK,
277 irqts->count = min(IRQ_TIMINGS_SIZE, irqts->count);
278 irqts->count > 0; irqts->count--, i = (i + 1) & IRQ_TIMINGS_MASK) {
279 496
280 irq = irq_timing_decode(irqts->values[i], &ts); 497 i = (irqts->count & IRQ_TIMINGS_MASK) - 1;
498 irqts->count = min(IRQ_TIMINGS_SIZE, irqts->count);
281 499
500 for (; irqts->count > 0; irqts->count--, i = (i + 1) & IRQ_TIMINGS_MASK) {
501 irq = irq_timing_decode(irqts->values[i], &ts);
282 s = idr_find(&irqt_stats, irq); 502 s = idr_find(&irqt_stats, irq);
283 if (s) { 503 if (s)
284 irqs = this_cpu_ptr(s); 504 irq_timings_store(irq, this_cpu_ptr(s), ts);
285 irqs_update(irqs, ts);
286 }
287 } 505 }
288 506
289 /* 507 /*
@@ -294,26 +512,12 @@ u64 irq_timings_next_event(u64 now)
294 512
295 irqs = this_cpu_ptr(s); 513 irqs = this_cpu_ptr(s);
296 514
297 if (!irqs->valid) 515 ts = __irq_timings_next_event(irqs, i, now);
298 continue; 516 if (ts <= now)
517 return now;
299 518
300 if (irqs->next_evt <= now) { 519 if (ts < next_evt)
301 irq = i; 520 next_evt = ts;
302 next_evt = now;
303
304 /*
305 * This interrupt mustn't use in the future
306 * until new events occur and update the
307 * statistics.
308 */
309 irqs->valid = 0;
310 break;
311 }
312
313 if (irqs->next_evt < next_evt) {
314 irq = i;
315 next_evt = irqs->next_evt;
316 }
317 } 521 }
318 522
319 return next_evt; 523 return next_evt;
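
The two ideas this rework leans on - linearizing a power-of-two circular buffer before processing it, and keeping one integer exponential moving average per interval bucket - are easy to model outside the kernel. The sketch below is a minimal user-space illustration under simplified assumptions: the buffer size, the EMA weight and the ema_new() helper are stand-ins, not the kernel's own constants or functions.

/* Minimal user-space model of the circular-buffer linearization and a
 * per-bucket integer EMA; sizes, weights and names are simplified stand-ins. */
#include <stdio.h>
#include <stdint.h>

#define TIMINGS_SIZE	8			/* must stay a power of two */
#define TIMINGS_MASK	(TIMINGS_SIZE - 1)
#define EMA_SHIFT	3			/* weight of a new sample: 1/8 */

/* new = old + (value - old) / 2^EMA_SHIFT, integer arithmetic only */
static uint64_t ema_new(uint64_t value, uint64_t old)
{
	if (!old)
		return value;
	return old + ((int64_t)value - (int64_t)old) / (1 << EMA_SHIFT);
}

int main(void)
{
	uint64_t circ[TIMINGS_SIZE] = { 0 }, linear[TIMINGS_SIZE];
	uint64_t count = 13;	/* more writes than slots: the buffer wrapped */
	int i, n, start;

	for (i = 0; i < (int)count; i++)	/* producer: masked writes */
		circ[i & TIMINGS_MASK] = 100 + i;

	/* consumer: copy oldest..newest into a flat array, as the hunk does */
	n = count < TIMINGS_SIZE ? (int)count : TIMINGS_SIZE;
	start = count < TIMINGS_SIZE ? 0 : (int)(count & TIMINGS_MASK);
	for (i = 0; i < n; i++)
		linear[i] = circ[(start + i) & TIMINGS_MASK];

	for (i = 0; i < n; i++)
		printf("%llu ", (unsigned long long)linear[i]);
	printf("\nema(1000, 800) = %llu\n",
	       (unsigned long long)ema_new(1000, 800));
	return 0;
}

Compiled with a plain C compiler, this prints the last eight intervals in arrival order (105..112) followed by ema(1000, 800) = 825.
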
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 6b7cdf17ccf8..73288914ed5e 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -56,61 +56,70 @@ void __weak arch_irq_work_raise(void)
56 */ 56 */
57} 57}
58 58
59/* 59/* Enqueue on current CPU, work must already be claimed and preempt disabled */
60 * Enqueue the irq_work @work on @cpu unless it's already pending 60static void __irq_work_queue_local(struct irq_work *work)
61 * somewhere.
62 *
63 * Can be re-enqueued while the callback is still in progress.
64 */
65bool irq_work_queue_on(struct irq_work *work, int cpu)
66{ 61{
67 /* All work should have been flushed before going offline */ 62 /* If the work is "lazy", handle it from next tick if any */
68 WARN_ON_ONCE(cpu_is_offline(cpu)); 63 if (work->flags & IRQ_WORK_LAZY) {
69 64 if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) &&
70#ifdef CONFIG_SMP 65 tick_nohz_tick_stopped())
71 66 arch_irq_work_raise();
 72	 /* Arch remote IPI send/receive backends aren't NMI safe */ 67	 } else {
73 WARN_ON_ONCE(in_nmi()); 68 if (llist_add(&work->llnode, this_cpu_ptr(&raised_list)))
69 arch_irq_work_raise();
70 }
71}
74 72
73/* Enqueue the irq work @work on the current CPU */
74bool irq_work_queue(struct irq_work *work)
75{
75 /* Only queue if not already pending */ 76 /* Only queue if not already pending */
76 if (!irq_work_claim(work)) 77 if (!irq_work_claim(work))
77 return false; 78 return false;
78 79
79 if (llist_add(&work->llnode, &per_cpu(raised_list, cpu))) 80 /* Queue the entry and raise the IPI if needed. */
80 arch_send_call_function_single_ipi(cpu); 81 preempt_disable();
81 82 __irq_work_queue_local(work);
82#else /* #ifdef CONFIG_SMP */ 83 preempt_enable();
83 irq_work_queue(work);
84#endif /* #else #ifdef CONFIG_SMP */
85 84
86 return true; 85 return true;
87} 86}
87EXPORT_SYMBOL_GPL(irq_work_queue);
88 88
89/* Enqueue the irq work @work on the current CPU */ 89/*
90bool irq_work_queue(struct irq_work *work) 90 * Enqueue the irq_work @work on @cpu unless it's already pending
91 * somewhere.
92 *
93 * Can be re-enqueued while the callback is still in progress.
94 */
95bool irq_work_queue_on(struct irq_work *work, int cpu)
91{ 96{
97#ifndef CONFIG_SMP
98 return irq_work_queue(work);
99
100#else /* CONFIG_SMP: */
101 /* All work should have been flushed before going offline */
102 WARN_ON_ONCE(cpu_is_offline(cpu));
103
92 /* Only queue if not already pending */ 104 /* Only queue if not already pending */
93 if (!irq_work_claim(work)) 105 if (!irq_work_claim(work))
94 return false; 106 return false;
95 107
96 /* Queue the entry and raise the IPI if needed. */
97 preempt_disable(); 108 preempt_disable();
98 109 if (cpu != smp_processor_id()) {
 99	 /* If the work is "lazy", handle it from next tick if any */ 110	 /* Arch remote IPI send/receive backends aren't NMI safe */
100 if (work->flags & IRQ_WORK_LAZY) { 111 WARN_ON_ONCE(in_nmi());
101 if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) && 112 if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
102 tick_nohz_tick_stopped()) 113 arch_send_call_function_single_ipi(cpu);
103 arch_irq_work_raise();
104 } else { 114 } else {
105 if (llist_add(&work->llnode, this_cpu_ptr(&raised_list))) 115 __irq_work_queue_local(work);
106 arch_irq_work_raise();
107 } 116 }
108
109 preempt_enable(); 117 preempt_enable();
110 118
111 return true; 119 return true;
120#endif /* CONFIG_SMP */
112} 121}
113EXPORT_SYMBOL_GPL(irq_work_queue); 122
114 123
115bool irq_work_needs_cpu(void) 124bool irq_work_needs_cpu(void)
116{ 125{
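
Seen from a caller, the reshuffle above does not change the API: irq_work_queue() stays purely local, and irq_work_queue_on() now short-circuits to the same local helper when the target happens to be the current CPU. The fragment below is a hedged kernel-module style sketch (it cannot run outside a kernel build); the callback, the printed message and the module boilerplate are invented, only the irq_work calls themselves come from the existing API.

#include <linux/module.h>
#include <linux/init.h>
#include <linux/irq_work.h>
#include <linux/smp.h>
#include <linux/cpumask.h>

static void demo_irq_work_fn(struct irq_work *work)
{
	/* Runs from the IPI or tick path with interrupts disabled */
	pr_info("irq_work ran on CPU %d\n", smp_processor_id());
}

static struct irq_work demo_work;

static int __init demo_init(void)
{
	init_irq_work(&demo_work, demo_irq_work_fn);

	/* Local enqueue: ends up in __irq_work_queue_local() */
	irq_work_queue(&demo_work);

	/*
	 * Remote enqueue: raises an IPI, unless the chosen CPU is the
	 * current one, in which case the new code takes the local path.
	 */
	irq_work_queue_on(&demo_work, cpumask_first(cpu_online_mask));
	return 0;
}

static void __exit demo_exit(void)
{
	irq_work_sync(&demo_work);	/* wait out any pending callback */
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
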
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index bad96b476eb6..de6efdecc70d 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -202,11 +202,13 @@ void static_key_disable(struct static_key *key)
202} 202}
203EXPORT_SYMBOL_GPL(static_key_disable); 203EXPORT_SYMBOL_GPL(static_key_disable);
204 204
205static void __static_key_slow_dec_cpuslocked(struct static_key *key, 205static bool static_key_slow_try_dec(struct static_key *key)
206 unsigned long rate_limit,
207 struct delayed_work *work)
208{ 206{
209 lockdep_assert_cpus_held(); 207 int val;
208
209 val = atomic_fetch_add_unless(&key->enabled, -1, 1);
210 if (val == 1)
211 return false;
210 212
211 /* 213 /*
212 * The negative count check is valid even when a negative 214 * The negative count check is valid even when a negative
@@ -215,63 +217,70 @@ static void __static_key_slow_dec_cpuslocked(struct static_key *key,
215 * returns is unbalanced, because all other static_key_slow_inc() 217 * returns is unbalanced, because all other static_key_slow_inc()
216 * instances block while the update is in progress. 218 * instances block while the update is in progress.
217 */ 219 */
218 if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) { 220 WARN(val < 0, "jump label: negative count!\n");
219 WARN(atomic_read(&key->enabled) < 0, 221 return true;
220 "jump label: negative count!\n"); 222}
223
224static void __static_key_slow_dec_cpuslocked(struct static_key *key)
225{
226 lockdep_assert_cpus_held();
227
228 if (static_key_slow_try_dec(key))
221 return; 229 return;
222 }
223 230
224 if (rate_limit) { 231 jump_label_lock();
225 atomic_inc(&key->enabled); 232 if (atomic_dec_and_test(&key->enabled))
226 schedule_delayed_work(work, rate_limit);
227 } else {
228 jump_label_update(key); 233 jump_label_update(key);
229 }
230 jump_label_unlock(); 234 jump_label_unlock();
231} 235}
232 236
233static void __static_key_slow_dec(struct static_key *key, 237static void __static_key_slow_dec(struct static_key *key)
234 unsigned long rate_limit,
235 struct delayed_work *work)
236{ 238{
237 cpus_read_lock(); 239 cpus_read_lock();
238 __static_key_slow_dec_cpuslocked(key, rate_limit, work); 240 __static_key_slow_dec_cpuslocked(key);
239 cpus_read_unlock(); 241 cpus_read_unlock();
240} 242}
241 243
242static void jump_label_update_timeout(struct work_struct *work) 244void jump_label_update_timeout(struct work_struct *work)
243{ 245{
244 struct static_key_deferred *key = 246 struct static_key_deferred *key =
245 container_of(work, struct static_key_deferred, work.work); 247 container_of(work, struct static_key_deferred, work.work);
246 __static_key_slow_dec(&key->key, 0, NULL); 248 __static_key_slow_dec(&key->key);
247} 249}
250EXPORT_SYMBOL_GPL(jump_label_update_timeout);
248 251
249void static_key_slow_dec(struct static_key *key) 252void static_key_slow_dec(struct static_key *key)
250{ 253{
251 STATIC_KEY_CHECK_USE(key); 254 STATIC_KEY_CHECK_USE(key);
252 __static_key_slow_dec(key, 0, NULL); 255 __static_key_slow_dec(key);
253} 256}
254EXPORT_SYMBOL_GPL(static_key_slow_dec); 257EXPORT_SYMBOL_GPL(static_key_slow_dec);
255 258
256void static_key_slow_dec_cpuslocked(struct static_key *key) 259void static_key_slow_dec_cpuslocked(struct static_key *key)
257{ 260{
258 STATIC_KEY_CHECK_USE(key); 261 STATIC_KEY_CHECK_USE(key);
259 __static_key_slow_dec_cpuslocked(key, 0, NULL); 262 __static_key_slow_dec_cpuslocked(key);
260} 263}
261 264
262void static_key_slow_dec_deferred(struct static_key_deferred *key) 265void __static_key_slow_dec_deferred(struct static_key *key,
266 struct delayed_work *work,
267 unsigned long timeout)
263{ 268{
264 STATIC_KEY_CHECK_USE(key); 269 STATIC_KEY_CHECK_USE(key);
265 __static_key_slow_dec(&key->key, key->timeout, &key->work); 270
271 if (static_key_slow_try_dec(key))
272 return;
273
274 schedule_delayed_work(work, timeout);
266} 275}
267EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred); 276EXPORT_SYMBOL_GPL(__static_key_slow_dec_deferred);
268 277
269void static_key_deferred_flush(struct static_key_deferred *key) 278void __static_key_deferred_flush(void *key, struct delayed_work *work)
270{ 279{
271 STATIC_KEY_CHECK_USE(key); 280 STATIC_KEY_CHECK_USE(key);
272 flush_delayed_work(&key->work); 281 flush_delayed_work(work);
273} 282}
274EXPORT_SYMBOL_GPL(static_key_deferred_flush); 283EXPORT_SYMBOL_GPL(__static_key_deferred_flush);
275 284
276void jump_label_rate_limit(struct static_key_deferred *key, 285void jump_label_rate_limit(struct static_key_deferred *key,
277 unsigned long rl) 286 unsigned long rl)
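
The lockless fast path introduced above hinges on atomic_fetch_add_unless(&key->enabled, -1, 1): decrement the count unless that would take it to zero, and only fall back to the mutex-protected slow path for the 1 -> 0 transition. The stand-alone sketch below models just that primitive with C11 atomics; the variable names and the starting value 2 are purely illustrative.

#include <stdatomic.h>
#include <stdio.h>

/* Model of atomic_fetch_add_unless(v, a, u): add a unless the value is u,
 * returning the old value either way. */
static int fetch_add_unless(atomic_int *v, int a, int u)
{
	int c = atomic_load(v);

	while (c != u) {
		if (atomic_compare_exchange_weak(v, &c, c + a))
			break;
	}
	return c;
}

int main(void)
{
	atomic_int enabled = 2;
	int old;

	/* Fast path: 2 -> 1 succeeds without any lock */
	old = fetch_add_unless(&enabled, -1, 1);
	printf("old=%d now=%d\n", old, atomic_load(&enabled));

	/* Would be 1 -> 0: nothing happens, the locked slow path must run */
	old = fetch_add_unless(&enabled, -1, 1);
	printf("old=%d now=%d\n", old, atomic_load(&enabled));
	return 0;
}

The first call returns old=2 and leaves the count at 1; the second returns old=1 without touching it, which is exactly the case where static_key_slow_try_dec() reports false and the caller takes the slow path.
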
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index d7140447be75..fd5c95ff9251 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -1150,7 +1150,7 @@ int kernel_kexec(void)
1150 error = dpm_suspend_end(PMSG_FREEZE); 1150 error = dpm_suspend_end(PMSG_FREEZE);
1151 if (error) 1151 if (error)
1152 goto Resume_devices; 1152 goto Resume_devices;
1153 error = disable_nonboot_cpus(); 1153 error = suspend_disable_secondary_cpus();
1154 if (error) 1154 if (error)
1155 goto Enable_cpus; 1155 goto Enable_cpus;
1156 local_irq_disable(); 1156 local_irq_disable();
@@ -1183,7 +1183,7 @@ int kernel_kexec(void)
1183 Enable_irqs: 1183 Enable_irqs:
1184 local_irq_enable(); 1184 local_irq_enable();
1185 Enable_cpus: 1185 Enable_cpus:
1186 enable_nonboot_cpus(); 1186 suspend_enable_secondary_cpus();
1187 dpm_resume_start(PMSG_RESTORE); 1187 dpm_resume_start(PMSG_RESTORE);
1188 Resume_devices: 1188 Resume_devices:
1189 dpm_resume_end(PMSG_RESTORE); 1189 dpm_resume_end(PMSG_RESTORE);
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index f1d0e00a3971..f7fb8f6a688f 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -688,7 +688,6 @@ static int kexec_calculate_store_digests(struct kimage *image)
688 goto out_free_desc; 688 goto out_free_desc;
689 689
690 desc->tfm = tfm; 690 desc->tfm = tfm;
691 desc->flags = 0;
692 691
693 ret = crypto_shash_init(desc); 692 ret = crypto_shash_init(desc);
694 if (ret < 0) 693 if (ret < 0)
diff --git a/kernel/kheaders.c b/kernel/kheaders.c
new file mode 100644
index 000000000000..70ae6052920d
--- /dev/null
+++ b/kernel/kheaders.c
@@ -0,0 +1,74 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Provide kernel headers useful to build tracing programs
4 * such as for running eBPF tracing tools.
5 *
6 * (Borrowed code from kernel/configs.c)
7 */
8
9#include <linux/kernel.h>
10#include <linux/module.h>
11#include <linux/proc_fs.h>
12#include <linux/init.h>
13#include <linux/uaccess.h>
14
15/*
16 * Define kernel_headers_data and kernel_headers_data_end, within which the
17 * compressed kernel headers are stored. The file is first compressed with xz.
18 */
19
20asm (
21" .pushsection .rodata, \"a\" \n"
22" .global kernel_headers_data \n"
23"kernel_headers_data: \n"
24" .incbin \"kernel/kheaders_data.tar.xz\" \n"
25" .global kernel_headers_data_end \n"
26"kernel_headers_data_end: \n"
27" .popsection \n"
28);
29
30extern char kernel_headers_data;
31extern char kernel_headers_data_end;
32
33static ssize_t
34ikheaders_read_current(struct file *file, char __user *buf,
35 size_t len, loff_t *offset)
36{
37 return simple_read_from_buffer(buf, len, offset,
38 &kernel_headers_data,
39 &kernel_headers_data_end -
40 &kernel_headers_data);
41}
42
43static const struct file_operations ikheaders_file_ops = {
44 .read = ikheaders_read_current,
45 .llseek = default_llseek,
46};
47
48static int __init ikheaders_init(void)
49{
50 struct proc_dir_entry *entry;
51
52 /* create the current headers file */
53 entry = proc_create("kheaders.tar.xz", S_IRUGO, NULL,
54 &ikheaders_file_ops);
55 if (!entry)
56 return -ENOMEM;
57
58 proc_set_size(entry,
59 &kernel_headers_data_end -
60 &kernel_headers_data);
61 return 0;
62}
63
64static void __exit ikheaders_cleanup(void)
65{
66 remove_proc_entry("kheaders.tar.xz", NULL);
67}
68
69module_init(ikheaders_init);
70module_exit(ikheaders_cleanup);
71
72MODULE_LICENSE("GPL v2");
73MODULE_AUTHOR("Joel Fernandes");
74MODULE_DESCRIPTION("Echo the kernel header artifacts used to build the kernel");
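
On a kernel built with this support enabled, the archive is consumed like any other procfs file. The user-space check below only opens the file and prints its first six bytes, which should match the xz stream magic; the path comes from the code above, everything else is an assumption about the running system.

#include <stdio.h>

int main(void)
{
	unsigned char buf[6];
	FILE *f = fopen("/proc/kheaders.tar.xz", "rb");

	if (!f) {
		perror("fopen");	/* feature not built in or module not loaded */
		return 1;
	}
	if (fread(buf, 1, sizeof(buf), f) == sizeof(buf))
		/* an xz stream starts with fd 37 7a 58 5a 00 */
		printf("magic: %02x %02x %02x %02x %02x %02x\n",
		       buf[0], buf[1], buf[2], buf[3], buf[4], buf[5]);
	fclose(f);
	return 0;
}
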
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index c83e54727131..b1ea30a5540e 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -709,7 +709,6 @@ static void unoptimize_kprobe(struct kprobe *p, bool force)
709static int reuse_unused_kprobe(struct kprobe *ap) 709static int reuse_unused_kprobe(struct kprobe *ap)
710{ 710{
711 struct optimized_kprobe *op; 711 struct optimized_kprobe *op;
712 int ret;
713 712
714 /* 713 /*
715 * Unused kprobe MUST be on the way of delayed unoptimizing (means 714 * Unused kprobe MUST be on the way of delayed unoptimizing (means
@@ -720,9 +719,8 @@ static int reuse_unused_kprobe(struct kprobe *ap)
720 /* Enable the probe again */ 719 /* Enable the probe again */
721 ap->flags &= ~KPROBE_FLAG_DISABLED; 720 ap->flags &= ~KPROBE_FLAG_DISABLED;
722 /* Optimize it again (remove from op->list) */ 721 /* Optimize it again (remove from op->list) */
723 ret = kprobe_optready(ap); 722 if (!kprobe_optready(ap))
724 if (ret) 723 return -EINVAL;
725 return ret;
726 724
727 optimize_kprobe(ap); 725 optimize_kprobe(ap);
728 return 0; 726 return 0;
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 96b4179cee6a..99a5b5f46dc5 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -120,8 +120,8 @@ account_global_scheduler_latency(struct task_struct *tsk,
120 break; 120 break;
121 } 121 }
122 122
123 /* 0 and ULONG_MAX entries mean end of backtrace: */ 123 /* 0 entry marks end of backtrace: */
124 if (record == 0 || record == ULONG_MAX) 124 if (!record)
125 break; 125 break;
126 } 126 }
127 if (same) { 127 if (same) {
@@ -141,20 +141,6 @@ account_global_scheduler_latency(struct task_struct *tsk,
141 memcpy(&latency_record[i], lat, sizeof(struct latency_record)); 141 memcpy(&latency_record[i], lat, sizeof(struct latency_record));
142} 142}
143 143
144/*
145 * Iterator to store a backtrace into a latency record entry
146 */
147static inline void store_stacktrace(struct task_struct *tsk,
148 struct latency_record *lat)
149{
150 struct stack_trace trace;
151
152 memset(&trace, 0, sizeof(trace));
153 trace.max_entries = LT_BACKTRACEDEPTH;
154 trace.entries = &lat->backtrace[0];
155 save_stack_trace_tsk(tsk, &trace);
156}
157
158/** 144/**
159 * __account_scheduler_latency - record an occurred latency 145 * __account_scheduler_latency - record an occurred latency
160 * @tsk - the task struct of the task hitting the latency 146 * @tsk - the task struct of the task hitting the latency
@@ -191,7 +177,8 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
191 lat.count = 1; 177 lat.count = 1;
192 lat.time = usecs; 178 lat.time = usecs;
193 lat.max = usecs; 179 lat.max = usecs;
194 store_stacktrace(tsk, &lat); 180
181 stack_trace_save_tsk(tsk, lat.backtrace, LT_BACKTRACEDEPTH, 0);
195 182
196 raw_spin_lock_irqsave(&latency_lock, flags); 183 raw_spin_lock_irqsave(&latency_lock, flags);
197 184
@@ -210,8 +197,8 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
210 break; 197 break;
211 } 198 }
212 199
213 /* 0 and ULONG_MAX entries mean end of backtrace: */ 200 /* 0 entry is end of backtrace */
214 if (record == 0 || record == ULONG_MAX) 201 if (!record)
215 break; 202 break;
216 } 203 }
217 if (same) { 204 if (same) {
@@ -252,10 +239,10 @@ static int lstats_show(struct seq_file *m, void *v)
252 lr->count, lr->time, lr->max); 239 lr->count, lr->time, lr->max);
253 for (q = 0; q < LT_BACKTRACEDEPTH; q++) { 240 for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
254 unsigned long bt = lr->backtrace[q]; 241 unsigned long bt = lr->backtrace[q];
242
255 if (!bt) 243 if (!bt)
256 break; 244 break;
257 if (bt == ULONG_MAX) 245
258 break;
259 seq_printf(m, " %ps", (void *)bt); 246 seq_printf(m, " %ps", (void *)bt);
260 } 247 }
261 seq_puts(m, "\n"); 248 seq_puts(m, "\n");
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index eb0ee10a1981..f6fbaff10e71 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -419,6 +419,7 @@ static struct attribute *klp_patch_attrs[] = {
419 &force_kobj_attr.attr, 419 &force_kobj_attr.attr,
420 NULL 420 NULL
421}; 421};
422ATTRIBUTE_GROUPS(klp_patch);
422 423
423static void klp_free_object_dynamic(struct klp_object *obj) 424static void klp_free_object_dynamic(struct klp_object *obj)
424{ 425{
@@ -426,7 +427,13 @@ static void klp_free_object_dynamic(struct klp_object *obj)
426 kfree(obj); 427 kfree(obj);
427} 428}
428 429
429static struct klp_object *klp_alloc_object_dynamic(const char *name) 430static void klp_init_func_early(struct klp_object *obj,
431 struct klp_func *func);
432static void klp_init_object_early(struct klp_patch *patch,
433 struct klp_object *obj);
434
435static struct klp_object *klp_alloc_object_dynamic(const char *name,
436 struct klp_patch *patch)
430{ 437{
431 struct klp_object *obj; 438 struct klp_object *obj;
432 439
@@ -442,7 +449,7 @@ static struct klp_object *klp_alloc_object_dynamic(const char *name)
442 } 449 }
443 } 450 }
444 451
445 INIT_LIST_HEAD(&obj->func_list); 452 klp_init_object_early(patch, obj);
446 obj->dynamic = true; 453 obj->dynamic = true;
447 454
448 return obj; 455 return obj;
@@ -471,6 +478,7 @@ static struct klp_func *klp_alloc_func_nop(struct klp_func *old_func,
471 } 478 }
472 } 479 }
473 480
481 klp_init_func_early(obj, func);
474 /* 482 /*
475 * func->new_func is same as func->old_func. These addresses are 483 * func->new_func is same as func->old_func. These addresses are
476 * set when the object is loaded, see klp_init_object_loaded(). 484 * set when the object is loaded, see klp_init_object_loaded().
@@ -490,11 +498,9 @@ static int klp_add_object_nops(struct klp_patch *patch,
490 obj = klp_find_object(patch, old_obj); 498 obj = klp_find_object(patch, old_obj);
491 499
492 if (!obj) { 500 if (!obj) {
493 obj = klp_alloc_object_dynamic(old_obj->name); 501 obj = klp_alloc_object_dynamic(old_obj->name, patch);
494 if (!obj) 502 if (!obj)
495 return -ENOMEM; 503 return -ENOMEM;
496
497 list_add_tail(&obj->node, &patch->obj_list);
498 } 504 }
499 505
500 klp_for_each_func(old_obj, old_func) { 506 klp_for_each_func(old_obj, old_func) {
@@ -505,8 +511,6 @@ static int klp_add_object_nops(struct klp_patch *patch,
505 func = klp_alloc_func_nop(old_func, obj); 511 func = klp_alloc_func_nop(old_func, obj);
506 if (!func) 512 if (!func)
507 return -ENOMEM; 513 return -ENOMEM;
508
509 list_add_tail(&func->node, &obj->func_list);
510 } 514 }
511 515
512 return 0; 516 return 0;
@@ -546,7 +550,7 @@ static void klp_kobj_release_patch(struct kobject *kobj)
546static struct kobj_type klp_ktype_patch = { 550static struct kobj_type klp_ktype_patch = {
547 .release = klp_kobj_release_patch, 551 .release = klp_kobj_release_patch,
548 .sysfs_ops = &kobj_sysfs_ops, 552 .sysfs_ops = &kobj_sysfs_ops,
549 .default_attrs = klp_patch_attrs, 553 .default_groups = klp_patch_groups,
550}; 554};
551 555
552static void klp_kobj_release_object(struct kobject *kobj) 556static void klp_kobj_release_object(struct kobject *kobj)
@@ -588,13 +592,7 @@ static void __klp_free_funcs(struct klp_object *obj, bool nops_only)
588 continue; 592 continue;
589 593
590 list_del(&func->node); 594 list_del(&func->node);
591 595 kobject_put(&func->kobj);
592 /* Might be called from klp_init_patch() error path. */
593 if (func->kobj_added) {
594 kobject_put(&func->kobj);
595 } else if (func->nop) {
596 klp_free_func_nop(func);
597 }
598 } 596 }
599} 597}
600 598
@@ -624,13 +622,7 @@ static void __klp_free_objects(struct klp_patch *patch, bool nops_only)
624 continue; 622 continue;
625 623
626 list_del(&obj->node); 624 list_del(&obj->node);
627 625 kobject_put(&obj->kobj);
628 /* Might be called from klp_init_patch() error path. */
629 if (obj->kobj_added) {
630 kobject_put(&obj->kobj);
631 } else if (obj->dynamic) {
632 klp_free_object_dynamic(obj);
633 }
634 } 626 }
635} 627}
636 628
@@ -675,10 +667,8 @@ static void klp_free_patch_finish(struct klp_patch *patch)
675 * this is called when the patch gets disabled and it 667 * this is called when the patch gets disabled and it
676 * cannot get enabled again. 668 * cannot get enabled again.
677 */ 669 */
678 if (patch->kobj_added) { 670 kobject_put(&patch->kobj);
679 kobject_put(&patch->kobj); 671 wait_for_completion(&patch->finish);
680 wait_for_completion(&patch->finish);
681 }
682 672
683 /* Put the module after the last access to struct klp_patch. */ 673 /* Put the module after the last access to struct klp_patch. */
684 if (!patch->forced) 674 if (!patch->forced)
@@ -700,8 +690,6 @@ static void klp_free_patch_work_fn(struct work_struct *work)
700 690
701static int klp_init_func(struct klp_object *obj, struct klp_func *func) 691static int klp_init_func(struct klp_object *obj, struct klp_func *func)
702{ 692{
703 int ret;
704
705 if (!func->old_name) 693 if (!func->old_name)
706 return -EINVAL; 694 return -EINVAL;
707 695
@@ -724,13 +712,9 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func)
724 * object. If the user selects 0 for old_sympos, then 1 will be used 712 * object. If the user selects 0 for old_sympos, then 1 will be used
725 * since a unique symbol will be the first occurrence. 713 * since a unique symbol will be the first occurrence.
726 */ 714 */
727 ret = kobject_init_and_add(&func->kobj, &klp_ktype_func, 715 return kobject_add(&func->kobj, &obj->kobj, "%s,%lu",
728 &obj->kobj, "%s,%lu", func->old_name, 716 func->old_name,
729 func->old_sympos ? func->old_sympos : 1); 717 func->old_sympos ? func->old_sympos : 1);
730 if (!ret)
731 func->kobj_added = true;
732
733 return ret;
734} 718}
735 719
736/* Arches may override this to finish any remaining arch-specific tasks */ 720/* Arches may override this to finish any remaining arch-specific tasks */
@@ -801,11 +785,9 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj)
801 klp_find_object_module(obj); 785 klp_find_object_module(obj);
802 786
803 name = klp_is_module(obj) ? obj->name : "vmlinux"; 787 name = klp_is_module(obj) ? obj->name : "vmlinux";
804 ret = kobject_init_and_add(&obj->kobj, &klp_ktype_object, 788 ret = kobject_add(&obj->kobj, &patch->kobj, "%s", name);
805 &patch->kobj, "%s", name);
806 if (ret) 789 if (ret)
807 return ret; 790 return ret;
808 obj->kobj_added = true;
809 791
810 klp_for_each_func(obj, func) { 792 klp_for_each_func(obj, func) {
811 ret = klp_init_func(obj, func); 793 ret = klp_init_func(obj, func);
@@ -819,6 +801,21 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj)
819 return ret; 801 return ret;
820} 802}
821 803
804static void klp_init_func_early(struct klp_object *obj,
805 struct klp_func *func)
806{
807 kobject_init(&func->kobj, &klp_ktype_func);
808 list_add_tail(&func->node, &obj->func_list);
809}
810
811static void klp_init_object_early(struct klp_patch *patch,
812 struct klp_object *obj)
813{
814 INIT_LIST_HEAD(&obj->func_list);
815 kobject_init(&obj->kobj, &klp_ktype_object);
816 list_add_tail(&obj->node, &patch->obj_list);
817}
818
822static int klp_init_patch_early(struct klp_patch *patch) 819static int klp_init_patch_early(struct klp_patch *patch)
823{ 820{
824 struct klp_object *obj; 821 struct klp_object *obj;
@@ -829,7 +826,7 @@ static int klp_init_patch_early(struct klp_patch *patch)
829 826
830 INIT_LIST_HEAD(&patch->list); 827 INIT_LIST_HEAD(&patch->list);
831 INIT_LIST_HEAD(&patch->obj_list); 828 INIT_LIST_HEAD(&patch->obj_list);
832 patch->kobj_added = false; 829 kobject_init(&patch->kobj, &klp_ktype_patch);
833 patch->enabled = false; 830 patch->enabled = false;
834 patch->forced = false; 831 patch->forced = false;
835 INIT_WORK(&patch->free_work, klp_free_patch_work_fn); 832 INIT_WORK(&patch->free_work, klp_free_patch_work_fn);
@@ -839,13 +836,10 @@ static int klp_init_patch_early(struct klp_patch *patch)
839 if (!obj->funcs) 836 if (!obj->funcs)
840 return -EINVAL; 837 return -EINVAL;
841 838
842 INIT_LIST_HEAD(&obj->func_list); 839 klp_init_object_early(patch, obj);
843 obj->kobj_added = false;
844 list_add_tail(&obj->node, &patch->obj_list);
845 840
846 klp_for_each_func_static(obj, func) { 841 klp_for_each_func_static(obj, func) {
847 func->kobj_added = false; 842 klp_init_func_early(obj, func);
848 list_add_tail(&func->node, &obj->func_list);
849 } 843 }
850 } 844 }
851 845
@@ -860,11 +854,9 @@ static int klp_init_patch(struct klp_patch *patch)
860 struct klp_object *obj; 854 struct klp_object *obj;
861 int ret; 855 int ret;
862 856
863 ret = kobject_init_and_add(&patch->kobj, &klp_ktype_patch, 857 ret = kobject_add(&patch->kobj, klp_root_kobj, "%s", patch->mod->name);
864 klp_root_kobj, "%s", patch->mod->name);
865 if (ret) 858 if (ret)
866 return ret; 859 return ret;
867 patch->kobj_added = true;
868 860
869 if (patch->replace) { 861 if (patch->replace) {
870 ret = klp_add_nops(patch); 862 ret = klp_add_nops(patch);
@@ -926,9 +918,6 @@ static int __klp_enable_patch(struct klp_patch *patch)
926 if (WARN_ON(patch->enabled)) 918 if (WARN_ON(patch->enabled))
927 return -EINVAL; 919 return -EINVAL;
928 920
929 if (!patch->kobj_added)
930 return -EINVAL;
931
932 pr_notice("enabling patch '%s'\n", patch->mod->name); 921 pr_notice("enabling patch '%s'\n", patch->mod->name);
933 922
934 klp_init_transition(patch, KLP_PATCHED); 923 klp_init_transition(patch, KLP_PATCHED);
@@ -1003,11 +992,10 @@ int klp_enable_patch(struct klp_patch *patch)
1003 return -ENODEV; 992 return -ENODEV;
1004 993
1005 if (!klp_have_reliable_stack()) { 994 if (!klp_have_reliable_stack()) {
1006 pr_err("This architecture doesn't have support for the livepatch consistency model.\n"); 995 pr_warn("This architecture doesn't have support for the livepatch consistency model.\n");
1007 return -EOPNOTSUPP; 996 pr_warn("The livepatch transition may never complete.\n");
1008 } 997 }
1009 998
1010
1011 mutex_lock(&klp_mutex); 999 mutex_lock(&klp_mutex);
1012 1000
1013 ret = klp_init_patch_early(patch); 1001 ret = klp_init_patch_early(patch);
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
index 9c89ae8b337a..c53370d596be 100644
--- a/kernel/livepatch/transition.c
+++ b/kernel/livepatch/transition.c
@@ -202,15 +202,15 @@ void klp_update_patch_state(struct task_struct *task)
202 * Determine whether the given stack trace includes any references to a 202 * Determine whether the given stack trace includes any references to a
203 * to-be-patched or to-be-unpatched function. 203 * to-be-patched or to-be-unpatched function.
204 */ 204 */
205static int klp_check_stack_func(struct klp_func *func, 205static int klp_check_stack_func(struct klp_func *func, unsigned long *entries,
206 struct stack_trace *trace) 206 unsigned int nr_entries)
207{ 207{
208 unsigned long func_addr, func_size, address; 208 unsigned long func_addr, func_size, address;
209 struct klp_ops *ops; 209 struct klp_ops *ops;
210 int i; 210 int i;
211 211
212 for (i = 0; i < trace->nr_entries; i++) { 212 for (i = 0; i < nr_entries; i++) {
213 address = trace->entries[i]; 213 address = entries[i];
214 214
215 if (klp_target_state == KLP_UNPATCHED) { 215 if (klp_target_state == KLP_UNPATCHED) {
216 /* 216 /*
@@ -254,29 +254,25 @@ static int klp_check_stack_func(struct klp_func *func,
254static int klp_check_stack(struct task_struct *task, char *err_buf) 254static int klp_check_stack(struct task_struct *task, char *err_buf)
255{ 255{
256 static unsigned long entries[MAX_STACK_ENTRIES]; 256 static unsigned long entries[MAX_STACK_ENTRIES];
257 struct stack_trace trace;
258 struct klp_object *obj; 257 struct klp_object *obj;
259 struct klp_func *func; 258 struct klp_func *func;
260 int ret; 259 int ret, nr_entries;
261 260
262 trace.skip = 0; 261 ret = stack_trace_save_tsk_reliable(task, entries, ARRAY_SIZE(entries));
263 trace.nr_entries = 0;
264 trace.max_entries = MAX_STACK_ENTRIES;
265 trace.entries = entries;
266 ret = save_stack_trace_tsk_reliable(task, &trace);
267 WARN_ON_ONCE(ret == -ENOSYS); 262 WARN_ON_ONCE(ret == -ENOSYS);
268 if (ret) { 263 if (ret < 0) {
269 snprintf(err_buf, STACK_ERR_BUF_SIZE, 264 snprintf(err_buf, STACK_ERR_BUF_SIZE,
270 "%s: %s:%d has an unreliable stack\n", 265 "%s: %s:%d has an unreliable stack\n",
271 __func__, task->comm, task->pid); 266 __func__, task->comm, task->pid);
272 return ret; 267 return ret;
273 } 268 }
269 nr_entries = ret;
274 270
275 klp_for_each_object(klp_transition_patch, obj) { 271 klp_for_each_object(klp_transition_patch, obj) {
276 if (!obj->patched) 272 if (!obj->patched)
277 continue; 273 continue;
278 klp_for_each_func(obj, func) { 274 klp_for_each_func(obj, func) {
279 ret = klp_check_stack_func(func, &trace); 275 ret = klp_check_stack_func(func, entries, nr_entries);
280 if (ret) { 276 if (ret) {
281 snprintf(err_buf, STACK_ERR_BUF_SIZE, 277 snprintf(err_buf, STACK_ERR_BUF_SIZE,
282 "%s: %s:%d is sleeping on function %s\n", 278 "%s: %s:%d is sleeping on function %s\n",
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index 392c7f23af76..6fe2f333aecb 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -3,7 +3,7 @@
3# and is generally not a function of system call inputs. 3# and is generally not a function of system call inputs.
4KCOV_INSTRUMENT := n 4KCOV_INSTRUMENT := n
5 5
6obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o 6obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o rwsem-xadd.o
7 7
8ifdef CONFIG_FUNCTION_TRACER 8ifdef CONFIG_FUNCTION_TRACER
9CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE) 9CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
@@ -25,8 +25,7 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
25obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o 25obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
26obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o 26obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
27obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o 27obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
28obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
29obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
30obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o 28obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o
31obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o 29obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
32obj-$(CONFIG_WW_MUTEX_SELFTEST) += test-ww_mutex.o 30obj-$(CONFIG_WW_MUTEX_SELFTEST) += test-ww_mutex.o
31obj-$(CONFIG_LOCK_EVENT_COUNTS) += lock_events.o
diff --git a/kernel/locking/lock_events.c b/kernel/locking/lock_events.c
new file mode 100644
index 000000000000..fa2c2f951c6b
--- /dev/null
+++ b/kernel/locking/lock_events.c
@@ -0,0 +1,179 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2/*
3 * This program is free software; you can redistribute it and/or modify
4 * it under the terms of the GNU General Public License as published by
5 * the Free Software Foundation; either version 2 of the License, or
6 * (at your option) any later version.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * Authors: Waiman Long <waiman.long@hpe.com>
14 */
15
16/*
17 * Collect locking event counts
18 */
19#include <linux/debugfs.h>
20#include <linux/sched.h>
21#include <linux/sched/clock.h>
22#include <linux/fs.h>
23
24#include "lock_events.h"
25
26#undef LOCK_EVENT
27#define LOCK_EVENT(name) [LOCKEVENT_ ## name] = #name,
28
29#define LOCK_EVENTS_DIR "lock_event_counts"
30
31/*
32 * When CONFIG_LOCK_EVENT_COUNTS is enabled, event counts of different
33 * types of locks will be reported under the <debugfs>/lock_event_counts/
34 * directory. See lock_events_list.h for the list of available locking
35 * events.
36 *
37 * Writing to the special ".reset_counts" file will reset all the above
38 * locking event counts. This is a very slow operation and so should not
39 * be done frequently.
40 *
41 * These event counts are implemented as per-cpu variables which are
42 * summed and computed whenever the corresponding debugfs files are read. This
43 * minimizes added overhead making the counts usable even in a production
44 * environment.
45 */
46static const char * const lockevent_names[lockevent_num + 1] = {
47
48#include "lock_events_list.h"
49
50 [LOCKEVENT_reset_cnts] = ".reset_counts",
51};
52
53/*
54 * Per-cpu counts
55 */
56DEFINE_PER_CPU(unsigned long, lockevents[lockevent_num]);
57
58/*
59 * The lockevent_read() function can be overridden.
60 */
61ssize_t __weak lockevent_read(struct file *file, char __user *user_buf,
62 size_t count, loff_t *ppos)
63{
64 char buf[64];
65 int cpu, id, len;
66 u64 sum = 0;
67
68 /*
69 * Get the counter ID stored in file->f_inode->i_private
70 */
71 id = (long)file_inode(file)->i_private;
72
73 if (id >= lockevent_num)
74 return -EBADF;
75
76 for_each_possible_cpu(cpu)
77 sum += per_cpu(lockevents[id], cpu);
78 len = snprintf(buf, sizeof(buf) - 1, "%llu\n", sum);
79
80 return simple_read_from_buffer(user_buf, count, ppos, buf, len);
81}
82
83/*
84 * Function to handle write request
85 *
86 * When idx = reset_cnts, reset all the counts.
87 */
88static ssize_t lockevent_write(struct file *file, const char __user *user_buf,
89 size_t count, loff_t *ppos)
90{
91 int cpu;
92
93 /*
94 * Get the counter ID stored in file->f_inode->i_private
95 */
96 if ((long)file_inode(file)->i_private != LOCKEVENT_reset_cnts)
97 return count;
98
99 for_each_possible_cpu(cpu) {
100 int i;
101 unsigned long *ptr = per_cpu_ptr(lockevents, cpu);
102
103 for (i = 0 ; i < lockevent_num; i++)
104 WRITE_ONCE(ptr[i], 0);
105 }
106 return count;
107}
108
109/*
110 * Debugfs data structures
111 */
112static const struct file_operations fops_lockevent = {
113 .read = lockevent_read,
114 .write = lockevent_write,
115 .llseek = default_llseek,
116};
117
118#ifdef CONFIG_PARAVIRT_SPINLOCKS
119#include <asm/paravirt.h>
120
121static bool __init skip_lockevent(const char *name)
122{
123 static int pv_on __initdata = -1;
124
125 if (pv_on < 0)
126 pv_on = !pv_is_native_spin_unlock();
127 /*
128 * Skip PV qspinlock events on bare metal.
129 */
130 if (!pv_on && !memcmp(name, "pv_", 3))
131 return true;
132 return false;
133}
134#else
135static inline bool skip_lockevent(const char *name)
136{
137 return false;
138}
139#endif
140
141/*
142 * Initialize debugfs for the locking event counts.
143 */
144static int __init init_lockevent_counts(void)
145{
146 struct dentry *d_counts = debugfs_create_dir(LOCK_EVENTS_DIR, NULL);
147 int i;
148
149 if (!d_counts)
150 goto out;
151
152 /*
153 * Create the debugfs files
154 *
155 * As reading from and writing to the stat files can be slow, only
156 * root is allowed to do the read/write to limit impact to system
157 * performance.
158 */
159 for (i = 0; i < lockevent_num; i++) {
160 if (skip_lockevent(lockevent_names[i]))
161 continue;
162 if (!debugfs_create_file(lockevent_names[i], 0400, d_counts,
163 (void *)(long)i, &fops_lockevent))
164 goto fail_undo;
165 }
166
167 if (!debugfs_create_file(lockevent_names[LOCKEVENT_reset_cnts], 0200,
168 d_counts, (void *)(long)LOCKEVENT_reset_cnts,
169 &fops_lockevent))
170 goto fail_undo;
171
172 return 0;
173fail_undo:
174 debugfs_remove_recursive(d_counts);
175out:
176 pr_warn("Could not create '%s' debugfs entries\n", LOCK_EVENTS_DIR);
177 return -ENOMEM;
178}
179fs_initcall(init_lockevent_counts);
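
The comment block above describes the whole trick: on the hot path each CPU increments only its own slot, and the expensive cross-CPU sum happens only when the debugfs file is read. A stand-alone approximation, with threads standing in for CPUs and a plain 2-D array for the per-cpu storage (all names and sizes are invented for the demo):

#include <pthread.h>
#include <stdio.h>

#define NR_CPUS   4
#define NR_EVENTS 2

static unsigned long counts[NR_CPUS][NR_EVENTS];

static void *worker(void *arg)
{
	long cpu = (long)arg;
	int i;

	/* hot path: touch only this "CPU"'s slot, no shared cache line */
	for (i = 0; i < 100000; i++)
		counts[cpu][0]++;
	return NULL;
}

int main(void)
{
	pthread_t t[NR_CPUS];
	unsigned long sum = 0;
	long cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		pthread_create(&t[cpu], NULL, worker, (void *)cpu);
	for (cpu = 0; cpu < NR_CPUS; cpu++)
		pthread_join(&t[cpu], NULL);

	/* slow path: the "debugfs read" walks every CPU and sums the slots */
	for (cpu = 0; cpu < NR_CPUS; cpu++)
		sum += counts[cpu][0];
	printf("event 0 total: %lu\n", sum);
	return 0;
}

Each worker owns its row, so the increments need no atomics; that is the property the per-cpu counters give the kernel version.
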
diff --git a/kernel/locking/lock_events.h b/kernel/locking/lock_events.h
new file mode 100644
index 000000000000..feb1acc54611
--- /dev/null
+++ b/kernel/locking/lock_events.h
@@ -0,0 +1,59 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2/*
3 * This program is free software; you can redistribute it and/or modify
4 * it under the terms of the GNU General Public License as published by
5 * the Free Software Foundation; either version 2 of the License, or
6 * (at your option) any later version.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * Authors: Waiman Long <longman@redhat.com>
14 */
15
16#ifndef __LOCKING_LOCK_EVENTS_H
17#define __LOCKING_LOCK_EVENTS_H
18
19enum lock_events {
20
21#include "lock_events_list.h"
22
23 lockevent_num, /* Total number of lock event counts */
24 LOCKEVENT_reset_cnts = lockevent_num,
25};
26
27#ifdef CONFIG_LOCK_EVENT_COUNTS
28/*
29 * Per-cpu counters
30 */
31DECLARE_PER_CPU(unsigned long, lockevents[lockevent_num]);
32
33/*
34 * Increment the PV qspinlock statistical counters
35 */
36static inline void __lockevent_inc(enum lock_events event, bool cond)
37{
38 if (cond)
39 __this_cpu_inc(lockevents[event]);
40}
41
42#define lockevent_inc(ev) __lockevent_inc(LOCKEVENT_ ##ev, true)
43#define lockevent_cond_inc(ev, c) __lockevent_inc(LOCKEVENT_ ##ev, c)
44
45static inline void __lockevent_add(enum lock_events event, int inc)
46{
47 __this_cpu_add(lockevents[event], inc);
48}
49
50#define lockevent_add(ev, c) __lockevent_add(LOCKEVENT_ ##ev, c)
51
52#else /* CONFIG_LOCK_EVENT_COUNTS */
53
54#define lockevent_inc(ev)
55#define lockevent_add(ev, c)
56#define lockevent_cond_inc(ev, c)
57
58#endif /* CONFIG_LOCK_EVENT_COUNTS */
59#endif /* __LOCKING_LOCK_EVENTS_H */
diff --git a/kernel/locking/lock_events_list.h b/kernel/locking/lock_events_list.h
new file mode 100644
index 000000000000..ad7668cfc9da
--- /dev/null
+++ b/kernel/locking/lock_events_list.h
@@ -0,0 +1,67 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2/*
3 * This program is free software; you can redistribute it and/or modify
4 * it under the terms of the GNU General Public License as published by
5 * the Free Software Foundation; either version 2 of the License, or
6 * (at your option) any later version.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * Authors: Waiman Long <longman@redhat.com>
14 */
15
16#ifndef LOCK_EVENT
17#define LOCK_EVENT(name) LOCKEVENT_ ## name,
18#endif
19
20#ifdef CONFIG_QUEUED_SPINLOCKS
21#ifdef CONFIG_PARAVIRT_SPINLOCKS
22/*
23 * Locking events for PV qspinlock.
24 */
25LOCK_EVENT(pv_hash_hops) /* Average # of hops per hashing operation */
26LOCK_EVENT(pv_kick_unlock) /* # of vCPU kicks issued at unlock time */
27LOCK_EVENT(pv_kick_wake) /* # of vCPU kicks for pv_latency_wake */
28LOCK_EVENT(pv_latency_kick) /* Average latency (ns) of vCPU kick */
29LOCK_EVENT(pv_latency_wake) /* Average latency (ns) of kick-to-wakeup */
30LOCK_EVENT(pv_lock_stealing) /* # of lock stealing operations */
31LOCK_EVENT(pv_spurious_wakeup) /* # of spurious wakeups in non-head vCPUs */
32LOCK_EVENT(pv_wait_again) /* # of wait's after queue head vCPU kick */
33LOCK_EVENT(pv_wait_early) /* # of early vCPU wait's */
34LOCK_EVENT(pv_wait_head) /* # of vCPU wait's at the queue head */
35LOCK_EVENT(pv_wait_node) /* # of vCPU wait's at non-head queue node */
36#endif /* CONFIG_PARAVIRT_SPINLOCKS */
37
38/*
39 * Locking events for qspinlock
40 *
41 * Subtracting lock_use_node[234] from lock_slowpath will give you
42 * lock_use_node1.
43 */
44LOCK_EVENT(lock_pending) /* # of locking ops via pending code */
45LOCK_EVENT(lock_slowpath) /* # of locking ops via MCS lock queue */
46LOCK_EVENT(lock_use_node2) /* # of locking ops that use 2nd percpu node */
47LOCK_EVENT(lock_use_node3) /* # of locking ops that use 3rd percpu node */
48LOCK_EVENT(lock_use_node4) /* # of locking ops that use 4th percpu node */
49LOCK_EVENT(lock_no_node) /* # of locking ops w/o using percpu node */
50#endif /* CONFIG_QUEUED_SPINLOCKS */
51
52/*
53 * Locking events for rwsem
54 */
55LOCK_EVENT(rwsem_sleep_reader) /* # of reader sleeps */
56LOCK_EVENT(rwsem_sleep_writer) /* # of writer sleeps */
57LOCK_EVENT(rwsem_wake_reader) /* # of reader wakeups */
58LOCK_EVENT(rwsem_wake_writer) /* # of writer wakeups */
59LOCK_EVENT(rwsem_opt_wlock) /* # of write locks opt-spin acquired */
60LOCK_EVENT(rwsem_opt_fail) /* # of failed opt-spinnings */
61LOCK_EVENT(rwsem_rlock) /* # of read locks acquired */
62LOCK_EVENT(rwsem_rlock_fast) /* # of fast read locks acquired */
63LOCK_EVENT(rwsem_rlock_fail) /* # of failed read lock acquisitions */
64LOCK_EVENT(rwsem_rtrylock) /* # of read trylock calls */
65LOCK_EVENT(rwsem_wlock) /* # of write locks acquired */
66LOCK_EVENT(rwsem_wlock_fail) /* # of failed write lock acquisitions */
67LOCK_EVENT(rwsem_wtrylock) /* # of write trylock calls */
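
lock_events_list.h is the single source of truth and is expanded twice with different definitions of LOCK_EVENT(): once into enum values (lock_events.h) and once into a string table (lock_events.c). The self-contained sketch below shows the same X-macro technique in miniature, with an in-file macro list standing in for the shared header include; the event names are invented.

#include <stdio.h>

/* One list, expanded twice with different meanings of EVENT() */
#define EVENT_LIST \
	EVENT(foo_taken) \
	EVENT(foo_contended)

enum events {
#define EVENT(name) EV_ ## name,
	EVENT_LIST
#undef EVENT
	NR_EVENTS
};

static const char * const event_names[NR_EVENTS] = {
#define EVENT(name) [EV_ ## name] = #name,
	EVENT_LIST
#undef EVENT
};

int main(void)
{
	int i;

	for (i = 0; i < NR_EVENTS; i++)
		printf("%d: %s\n", i, event_names[i]);
	return 0;
}
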
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 34cdcbedda49..d06190fa5082 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -434,29 +434,14 @@ static void print_lockdep_off(const char *bug_msg)
434#endif 434#endif
435} 435}
436 436
437static int save_trace(struct stack_trace *trace) 437static int save_trace(struct lock_trace *trace)
438{ 438{
439 trace->nr_entries = 0; 439 unsigned long *entries = stack_trace + nr_stack_trace_entries;
440 trace->max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries; 440 unsigned int max_entries;
441 trace->entries = stack_trace + nr_stack_trace_entries;
442
443 trace->skip = 3;
444
445 save_stack_trace(trace);
446
447 /*
448 * Some daft arches put -1 at the end to indicate its a full trace.
449 *
450 * <rant> this is buggy anyway, since it takes a whole extra entry so a
451 * complete trace that maxes out the entries provided will be reported
452 * as incomplete, friggin useless </rant>
453 */
454 if (trace->nr_entries != 0 &&
455 trace->entries[trace->nr_entries-1] == ULONG_MAX)
456 trace->nr_entries--;
457
458 trace->max_entries = trace->nr_entries;
459 441
442 trace->offset = nr_stack_trace_entries;
443 max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries;
444 trace->nr_entries = stack_trace_save(entries, max_entries, 3);
460 nr_stack_trace_entries += trace->nr_entries; 445 nr_stack_trace_entries += trace->nr_entries;
461 446
462 if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) { 447 if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) {
@@ -516,11 +501,11 @@ static char get_usage_char(struct lock_class *class, enum lock_usage_bit bit)
516{ 501{
517 char c = '.'; 502 char c = '.';
518 503
519 if (class->usage_mask & lock_flag(bit + 2)) 504 if (class->usage_mask & lock_flag(bit + LOCK_USAGE_DIR_MASK))
520 c = '+'; 505 c = '+';
521 if (class->usage_mask & lock_flag(bit)) { 506 if (class->usage_mask & lock_flag(bit)) {
522 c = '-'; 507 c = '-';
523 if (class->usage_mask & lock_flag(bit + 2)) 508 if (class->usage_mask & lock_flag(bit + LOCK_USAGE_DIR_MASK))
524 c = '?'; 509 c = '?';
525 } 510 }
526 511
@@ -649,6 +634,9 @@ static int static_obj(const void *obj)
649 end = (unsigned long) &_end, 634 end = (unsigned long) &_end,
650 addr = (unsigned long) obj; 635 addr = (unsigned long) obj;
651 636
637 if (arch_is_kernel_initmem_freed(addr))
638 return 0;
639
652 /* 640 /*
653 * static variable? 641 * static variable?
654 */ 642 */
@@ -1207,7 +1195,7 @@ static struct lock_list *alloc_list_entry(void)
1207static int add_lock_to_list(struct lock_class *this, 1195static int add_lock_to_list(struct lock_class *this,
1208 struct lock_class *links_to, struct list_head *head, 1196 struct lock_class *links_to, struct list_head *head,
1209 unsigned long ip, int distance, 1197 unsigned long ip, int distance,
1210 struct stack_trace *trace) 1198 struct lock_trace *trace)
1211{ 1199{
1212 struct lock_list *entry; 1200 struct lock_list *entry;
1213 /* 1201 /*
@@ -1426,6 +1414,13 @@ static inline int __bfs_backwards(struct lock_list *src_entry,
1426 * checking. 1414 * checking.
1427 */ 1415 */
1428 1416
1417static void print_lock_trace(struct lock_trace *trace, unsigned int spaces)
1418{
1419 unsigned long *entries = stack_trace + trace->offset;
1420
1421 stack_trace_print(entries, trace->nr_entries, spaces);
1422}
1423
1429/* 1424/*
1430 * Print a dependency chain entry (this is only done when a deadlock 1425 * Print a dependency chain entry (this is only done when a deadlock
1431 * has been detected): 1426 * has been detected):
@@ -1438,8 +1433,7 @@ print_circular_bug_entry(struct lock_list *target, int depth)
1438 printk("\n-> #%u", depth); 1433 printk("\n-> #%u", depth);
1439 print_lock_name(target->class); 1434 print_lock_name(target->class);
1440 printk(KERN_CONT ":\n"); 1435 printk(KERN_CONT ":\n");
1441 print_stack_trace(&target->trace, 6); 1436 print_lock_trace(&target->trace, 6);
1442
1443 return 0; 1437 return 0;
1444} 1438}
1445 1439
@@ -1533,10 +1527,9 @@ static inline int class_equal(struct lock_list *entry, void *data)
1533} 1527}
1534 1528
1535static noinline int print_circular_bug(struct lock_list *this, 1529static noinline int print_circular_bug(struct lock_list *this,
1536 struct lock_list *target, 1530 struct lock_list *target,
1537 struct held_lock *check_src, 1531 struct held_lock *check_src,
1538 struct held_lock *check_tgt, 1532 struct held_lock *check_tgt)
1539 struct stack_trace *trace)
1540{ 1533{
1541 struct task_struct *curr = current; 1534 struct task_struct *curr = current;
1542 struct lock_list *parent; 1535 struct lock_list *parent;
@@ -1676,19 +1669,25 @@ check_redundant(struct lock_list *root, struct lock_class *target,
1676} 1669}
1677 1670
1678#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) 1671#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
1672
1673static inline int usage_accumulate(struct lock_list *entry, void *mask)
1674{
1675 *(unsigned long *)mask |= entry->class->usage_mask;
1676
1677 return 0;
1678}
1679
1679/* 1680/*
1680 * Forwards and backwards subgraph searching, for the purposes of 1681 * Forwards and backwards subgraph searching, for the purposes of
1681 * proving that two subgraphs can be connected by a new dependency 1682 * proving that two subgraphs can be connected by a new dependency
1682 * without creating any illegal irq-safe -> irq-unsafe lock dependency. 1683 * without creating any illegal irq-safe -> irq-unsafe lock dependency.
1683 */ 1684 */
1684 1685
1685static inline int usage_match(struct lock_list *entry, void *bit) 1686static inline int usage_match(struct lock_list *entry, void *mask)
1686{ 1687{
1687 return entry->class->usage_mask & (1 << (enum lock_usage_bit)bit); 1688 return entry->class->usage_mask & *(unsigned long *)mask;
1688} 1689}
1689 1690
1690
1691
1692/* 1691/*
1693 * Find a node in the forwards-direction dependency sub-graph starting 1692 * Find a node in the forwards-direction dependency sub-graph starting
1694 * at @root->class that matches @bit. 1693 * at @root->class that matches @bit.
@@ -1700,14 +1699,14 @@ static inline int usage_match(struct lock_list *entry, void *bit)
1700 * Return <0 on error. 1699 * Return <0 on error.
1701 */ 1700 */
1702static int 1701static int
1703find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit, 1702find_usage_forwards(struct lock_list *root, unsigned long usage_mask,
1704 struct lock_list **target_entry) 1703 struct lock_list **target_entry)
1705{ 1704{
1706 int result; 1705 int result;
1707 1706
1708 debug_atomic_inc(nr_find_usage_forwards_checks); 1707 debug_atomic_inc(nr_find_usage_forwards_checks);
1709 1708
1710 result = __bfs_forwards(root, (void *)bit, usage_match, target_entry); 1709 result = __bfs_forwards(root, &usage_mask, usage_match, target_entry);
1711 1710
1712 return result; 1711 return result;
1713} 1712}
@@ -1723,14 +1722,14 @@ find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit,
1723 * Return <0 on error. 1722 * Return <0 on error.
1724 */ 1723 */
1725static int 1724static int
1726find_usage_backwards(struct lock_list *root, enum lock_usage_bit bit, 1725find_usage_backwards(struct lock_list *root, unsigned long usage_mask,
1727 struct lock_list **target_entry) 1726 struct lock_list **target_entry)
1728{ 1727{
1729 int result; 1728 int result;
1730 1729
1731 debug_atomic_inc(nr_find_usage_backwards_checks); 1730 debug_atomic_inc(nr_find_usage_backwards_checks);
1732 1731
1733 result = __bfs_backwards(root, (void *)bit, usage_match, target_entry); 1732 result = __bfs_backwards(root, &usage_mask, usage_match, target_entry);
1734 1733
1735 return result; 1734 return result;
1736} 1735}
@@ -1752,7 +1751,7 @@ static void print_lock_class_header(struct lock_class *class, int depth)
1752 1751
1753 len += printk("%*s %s", depth, "", usage_str[bit]); 1752 len += printk("%*s %s", depth, "", usage_str[bit]);
1754 len += printk(KERN_CONT " at:\n"); 1753 len += printk(KERN_CONT " at:\n");
1755 print_stack_trace(class->usage_traces + bit, len); 1754 print_lock_trace(class->usage_traces + bit, len);
1756 } 1755 }
1757 } 1756 }
1758 printk("%*s }\n", depth, ""); 1757 printk("%*s }\n", depth, "");
@@ -1777,7 +1776,7 @@ print_shortest_lock_dependencies(struct lock_list *leaf,
1777 do { 1776 do {
1778 print_lock_class_header(entry->class, depth); 1777 print_lock_class_header(entry->class, depth);
1779 printk("%*s ... acquired at:\n", depth, ""); 1778 printk("%*s ... acquired at:\n", depth, "");
1780 print_stack_trace(&entry->trace, 2); 1779 print_lock_trace(&entry->trace, 2);
1781 printk("\n"); 1780 printk("\n");
1782 1781
1783 if (depth == 0 && (entry != root)) { 1782 if (depth == 0 && (entry != root)) {
@@ -1890,14 +1889,14 @@ print_bad_irq_dependency(struct task_struct *curr,
1890 print_lock_name(backwards_entry->class); 1889 print_lock_name(backwards_entry->class);
1891 pr_warn("\n... which became %s-irq-safe at:\n", irqclass); 1890 pr_warn("\n... which became %s-irq-safe at:\n", irqclass);
1892 1891
1893 print_stack_trace(backwards_entry->class->usage_traces + bit1, 1); 1892 print_lock_trace(backwards_entry->class->usage_traces + bit1, 1);
1894 1893
1895 pr_warn("\nto a %s-irq-unsafe lock:\n", irqclass); 1894 pr_warn("\nto a %s-irq-unsafe lock:\n", irqclass);
1896 print_lock_name(forwards_entry->class); 1895 print_lock_name(forwards_entry->class);
1897 pr_warn("\n... which became %s-irq-unsafe at:\n", irqclass); 1896 pr_warn("\n... which became %s-irq-unsafe at:\n", irqclass);
1898 pr_warn("..."); 1897 pr_warn("...");
1899 1898
1900 print_stack_trace(forwards_entry->class->usage_traces + bit2, 1); 1899 print_lock_trace(forwards_entry->class->usage_traces + bit2, 1);
1901 1900
1902 pr_warn("\nother info that might help us debug this:\n\n"); 1901 pr_warn("\nother info that might help us debug this:\n\n");
1903 print_irq_lock_scenario(backwards_entry, forwards_entry, 1902 print_irq_lock_scenario(backwards_entry, forwards_entry,
@@ -1922,39 +1921,6 @@ print_bad_irq_dependency(struct task_struct *curr,
1922 return 0; 1921 return 0;
1923} 1922}
1924 1923
1925static int
1926check_usage(struct task_struct *curr, struct held_lock *prev,
1927 struct held_lock *next, enum lock_usage_bit bit_backwards,
1928 enum lock_usage_bit bit_forwards, const char *irqclass)
1929{
1930 int ret;
1931 struct lock_list this, that;
1932 struct lock_list *uninitialized_var(target_entry);
1933 struct lock_list *uninitialized_var(target_entry1);
1934
1935 this.parent = NULL;
1936
1937 this.class = hlock_class(prev);
1938 ret = find_usage_backwards(&this, bit_backwards, &target_entry);
1939 if (ret < 0)
1940 return print_bfs_bug(ret);
1941 if (ret == 1)
1942 return ret;
1943
1944 that.parent = NULL;
1945 that.class = hlock_class(next);
1946 ret = find_usage_forwards(&that, bit_forwards, &target_entry1);
1947 if (ret < 0)
1948 return print_bfs_bug(ret);
1949 if (ret == 1)
1950 return ret;
1951
1952 return print_bad_irq_dependency(curr, &this, &that,
1953 target_entry, target_entry1,
1954 prev, next,
1955 bit_backwards, bit_forwards, irqclass);
1956}
1957
1958static const char *state_names[] = { 1924static const char *state_names[] = {
1959#define LOCKDEP_STATE(__STATE) \ 1925#define LOCKDEP_STATE(__STATE) \
1960 __stringify(__STATE), 1926 __stringify(__STATE),
@@ -1971,9 +1937,19 @@ static const char *state_rnames[] = {
1971 1937
1972static inline const char *state_name(enum lock_usage_bit bit) 1938static inline const char *state_name(enum lock_usage_bit bit)
1973{ 1939{
1974 return (bit & LOCK_USAGE_READ_MASK) ? state_rnames[bit >> 2] : state_names[bit >> 2]; 1940 if (bit & LOCK_USAGE_READ_MASK)
1941 return state_rnames[bit >> LOCK_USAGE_DIR_MASK];
1942 else
1943 return state_names[bit >> LOCK_USAGE_DIR_MASK];
1975} 1944}
1976 1945
1946/*
1947 * The bit number is encoded like:
1948 *
1949 * bit0: 0 exclusive, 1 read lock
1950 * bit1: 0 used in irq, 1 irq enabled
1951 * bit2-n: state
1952 */
1977static int exclusive_bit(int new_bit) 1953static int exclusive_bit(int new_bit)
1978{ 1954{
1979 int state = new_bit & LOCK_USAGE_STATE_MASK; 1955 int state = new_bit & LOCK_USAGE_STATE_MASK;
@@ -1985,45 +1961,160 @@ static int exclusive_bit(int new_bit)
1985 return state | (dir ^ LOCK_USAGE_DIR_MASK); 1961 return state | (dir ^ LOCK_USAGE_DIR_MASK);
1986} 1962}
1987 1963
1964/*
1965 * Observe that when given a bitmask where each bitnr is encoded as above, a
 1966 * right shift of the mask transforms each individual bitnr by -1 and,
 1967 * conversely, a left shift transforms each bitnr by +1.
1968 *
 1969 * So for all bits whose number has LOCK_ENABLED_* set (bitnr1 == 1), we can
 1970 * create the mask with those bit numbers using LOCK_USED_IN_* (bitnr1 == 0)
 1971 * instead by subtracting 2 from the bit number, or shifting the mask right by 2.
1972 *
1973 * Similarly, bitnr1 == 0 becomes bitnr1 == 1 by adding 2, or shifting left 2.
1974 *
1975 * So split the mask (note that LOCKF_ENABLED_IRQ_ALL|LOCKF_USED_IN_IRQ_ALL is
1976 * all bits set) and recompose with bitnr1 flipped.
1977 */
1978static unsigned long invert_dir_mask(unsigned long mask)
1979{
1980 unsigned long excl = 0;
1981
1982 /* Invert dir */
1983 excl |= (mask & LOCKF_ENABLED_IRQ_ALL) >> LOCK_USAGE_DIR_MASK;
1984 excl |= (mask & LOCKF_USED_IN_IRQ_ALL) << LOCK_USAGE_DIR_MASK;
1985
1986 return excl;
1987}
1988
1989/*
1990 * As above, we clear bitnr0 (LOCK_*_READ off) with bitmask ops. First, for all
1991 * bits with bitnr0 set (LOCK_*_READ), add those with bitnr0 cleared (LOCK_*).
1992 * And then mask out all bitnr0.
1993 */
1994static unsigned long exclusive_mask(unsigned long mask)
1995{
1996 unsigned long excl = invert_dir_mask(mask);
1997
1998 /* Strip read */
1999 excl |= (excl & LOCKF_IRQ_READ) >> LOCK_USAGE_READ_MASK;
2000 excl &= ~LOCKF_IRQ_READ;
2001
2002 return excl;
2003}
2004
2005/*
2006 * Retrieve the _possible_ original mask to which @mask is
 2007 * exclusive. I.e., this is the opposite of exclusive_mask().
2008 * Note that 2 possible original bits can match an exclusive
2009 * bit: one has LOCK_USAGE_READ_MASK set, the other has it
2010 * cleared. So both are returned for each exclusive bit.
2011 */
2012static unsigned long original_mask(unsigned long mask)
2013{
2014 unsigned long excl = invert_dir_mask(mask);
2015
2016 /* Include read in existing usages */
2017 excl |= (excl & LOCKF_IRQ) << LOCK_USAGE_READ_MASK;
2018
2019 return excl;
2020}
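The three mask transforms can likewise be checked in isolation. Below is a small sketch under the assumption of a two-state (hardirq/softirq) layout; the LOCKF_*-style masks are hand-built stand-ins for the kernel's generated constants. It verifies that exclusive_mask() turns USED_IN_HARDIRQ_READ into ENABLED_HARDIRQ, and that original_mask() of the result covers the bit we started from.

#include <stdio.h>
#include <assert.h>

/* Toy two-state model of the LOCKF_* masks (assumed values, not the kernel's
 * generated constants).  Bit number = state * 4 + dir * 2 + read. */
enum {
        USED_IN_HARDIRQ,        /* 0 */
        USED_IN_HARDIRQ_READ,   /* 1 */
        ENABLED_HARDIRQ,        /* 2 */
        ENABLED_HARDIRQ_READ,   /* 3 */
        USED_IN_SOFTIRQ,        /* 4 */
        USED_IN_SOFTIRQ_READ,   /* 5 */
        ENABLED_SOFTIRQ,        /* 6 */
        ENABLED_SOFTIRQ_READ,   /* 7 */
};
#define F(b) (1UL << (b))

#define ENABLED_IRQ_ALL (F(ENABLED_HARDIRQ) | F(ENABLED_HARDIRQ_READ) | \
                         F(ENABLED_SOFTIRQ) | F(ENABLED_SOFTIRQ_READ))
#define USED_IN_IRQ_ALL (F(USED_IN_HARDIRQ) | F(USED_IN_HARDIRQ_READ) | \
                         F(USED_IN_SOFTIRQ) | F(USED_IN_SOFTIRQ_READ))
#define IRQ             (F(USED_IN_HARDIRQ) | F(ENABLED_HARDIRQ) | \
                         F(USED_IN_SOFTIRQ) | F(ENABLED_SOFTIRQ))
#define IRQ_READ        (F(USED_IN_HARDIRQ_READ) | F(ENABLED_HARDIRQ_READ) | \
                         F(USED_IN_SOFTIRQ_READ) | F(ENABLED_SOFTIRQ_READ))

/* Flip bitnr1 (dir) for every bit in the mask: shift by 2 either way. */
static unsigned long invert_dir_mask(unsigned long mask)
{
        return ((mask & ENABLED_IRQ_ALL) >> 2) | ((mask & USED_IN_IRQ_ALL) << 2);
}

/* Flip dir, then fold the read bits onto their non-read counterparts. */
static unsigned long exclusive_mask(unsigned long mask)
{
        unsigned long excl = invert_dir_mask(mask);

        excl |= (excl & IRQ_READ) >> 1;
        excl &= ~IRQ_READ;
        return excl;
}

/* Flip dir, then re-add the read variant of every bit (two originals each). */
static unsigned long original_mask(unsigned long mask)
{
        unsigned long excl = invert_dir_mask(mask);

        excl |= (excl & IRQ) << 1;
        return excl;
}

int main(void)
{
        unsigned long m = F(USED_IN_HARDIRQ_READ);

        assert(exclusive_mask(m) == F(ENABLED_HARDIRQ));
        /* original_mask() of the exclusive mask covers the starting bit. */
        assert(original_mask(exclusive_mask(m)) & m);

        printf("mask transforms behave as described\n");
        return 0;
}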
2021
2022/*
 2023 * Find the first matching pair of bits between an original
 2024 * usage mask and an exclusive usage mask.
2025 */
2026static int find_exclusive_match(unsigned long mask,
2027 unsigned long excl_mask,
2028 enum lock_usage_bit *bitp,
2029 enum lock_usage_bit *excl_bitp)
2030{
2031 int bit, excl;
2032
2033 for_each_set_bit(bit, &mask, LOCK_USED) {
2034 excl = exclusive_bit(bit);
2035 if (excl_mask & lock_flag(excl)) {
2036 *bitp = bit;
2037 *excl_bitp = excl;
2038 return 0;
2039 }
2040 }
2041 return -1;
2042}
2043
2044/*
2045 * Prove that the new dependency does not connect a hardirq-safe(-read)
2046 * lock with a hardirq-unsafe lock - to achieve this we search
2047 * the backwards-subgraph starting at <prev>, and the
2048 * forwards-subgraph starting at <next>:
2049 */
1988static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, 2050static int check_irq_usage(struct task_struct *curr, struct held_lock *prev,
1989 struct held_lock *next, enum lock_usage_bit bit) 2051 struct held_lock *next)
1990{ 2052{
2053 unsigned long usage_mask = 0, forward_mask, backward_mask;
2054 enum lock_usage_bit forward_bit = 0, backward_bit = 0;
2055 struct lock_list *uninitialized_var(target_entry1);
2056 struct lock_list *uninitialized_var(target_entry);
2057 struct lock_list this, that;
2058 int ret;
2059
1991 /* 2060 /*
1992 * Prove that the new dependency does not connect a hardirq-safe 2061 * Step 1: gather all hard/soft IRQs usages backward in an
1993 * lock with a hardirq-unsafe lock - to achieve this we search 2062 * accumulated usage mask.
1994 * the backwards-subgraph starting at <prev>, and the
1995 * forwards-subgraph starting at <next>:
1996 */ 2063 */
1997 if (!check_usage(curr, prev, next, bit, 2064 this.parent = NULL;
1998 exclusive_bit(bit), state_name(bit))) 2065 this.class = hlock_class(prev);
1999 return 0; 2066
2067 ret = __bfs_backwards(&this, &usage_mask, usage_accumulate, NULL);
2068 if (ret < 0)
2069 return print_bfs_bug(ret);
2000 2070
2001 bit++; /* _READ */ 2071 usage_mask &= LOCKF_USED_IN_IRQ_ALL;
2072 if (!usage_mask)
2073 return 1;
2002 2074
2003 /* 2075 /*
2004 * Prove that the new dependency does not connect a hardirq-safe-read 2076 * Step 2: find exclusive uses forward that match the previous
2005 * lock with a hardirq-unsafe lock - to achieve this we search 2077 * backward accumulated mask.
2006 * the backwards-subgraph starting at <prev>, and the
2007 * forwards-subgraph starting at <next>:
2008 */ 2078 */
2009 if (!check_usage(curr, prev, next, bit, 2079 forward_mask = exclusive_mask(usage_mask);
2010 exclusive_bit(bit), state_name(bit)))
2011 return 0;
2012 2080
2013 return 1; 2081 that.parent = NULL;
2014} 2082 that.class = hlock_class(next);
2015 2083
2016static int 2084 ret = find_usage_forwards(&that, forward_mask, &target_entry1);
2017check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, 2085 if (ret < 0)
2018 struct held_lock *next) 2086 return print_bfs_bug(ret);
2019{ 2087 if (ret == 1)
2020#define LOCKDEP_STATE(__STATE) \ 2088 return ret;
2021 if (!check_irq_usage(curr, prev, next, LOCK_USED_IN_##__STATE)) \
2022 return 0;
2023#include "lockdep_states.h"
2024#undef LOCKDEP_STATE
2025 2089
2026 return 1; 2090 /*
2091 * Step 3: we found a bad match! Now retrieve a lock from the backward
2092 * list whose usage mask matches the exclusive usage mask from the
2093 * lock found on the forward list.
2094 */
2095 backward_mask = original_mask(target_entry1->class->usage_mask);
2096
2097 ret = find_usage_backwards(&this, backward_mask, &target_entry);
2098 if (ret < 0)
2099 return print_bfs_bug(ret);
2100 if (DEBUG_LOCKS_WARN_ON(ret == 1))
2101 return 1;
2102
2103 /*
2104 * Step 4: narrow down to a pair of incompatible usage bits
2105 * and report it.
2106 */
2107 ret = find_exclusive_match(target_entry->class->usage_mask,
2108 target_entry1->class->usage_mask,
2109 &backward_bit, &forward_bit);
2110 if (DEBUG_LOCKS_WARN_ON(ret == -1))
2111 return 1;
2112
2113 return print_bad_irq_dependency(curr, &this, &that,
2114 target_entry, target_entry1,
2115 prev, next,
2116 backward_bit, forward_bit,
2117 state_name(backward_bit));
2027} 2118}
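Leaving the two graph walks aside, the four steps of check_irq_usage() reduce to mask arithmetic. A hedged sketch with a single state (hardirq only) and hypothetical usage masks standing in for the BFS results:

#include <stdio.h>

/* One-state toy encoding (hardirq only): bit = dir * 2 + read.  These are
 * illustrative values, not the kernel's LOCKF_* constants. */
#define USED_IN_HARDIRQ       (1UL << 0)
#define USED_IN_HARDIRQ_READ  (1UL << 1)
#define ENABLED_HARDIRQ       (1UL << 2)
#define ENABLED_HARDIRQ_READ  (1UL << 3)

#define USED_IN_ALL (USED_IN_HARDIRQ | USED_IN_HARDIRQ_READ)
#define ENABLED_ALL (ENABLED_HARDIRQ | ENABLED_HARDIRQ_READ)
#define READ_BITS   (USED_IN_HARDIRQ_READ | ENABLED_HARDIRQ_READ)

/* Flip the direction of every bit, then fold read bits onto non-read ones. */
static unsigned long exclusive_mask(unsigned long m)
{
        unsigned long excl = ((m & ENABLED_ALL) >> 2) | ((m & USED_IN_ALL) << 2);

        excl |= (excl & READ_BITS) >> 1;
        return excl & ~READ_BITS;
}

int main(void)
{
        /* Hypothetical results standing in for the two graph walks. */
        unsigned long backward_usage = USED_IN_HARDIRQ_READ;          /* step 1 */
        unsigned long forward_mask = exclusive_mask(backward_usage);  /* step 2 */
        unsigned long forward_class_usage = ENABLED_HARDIRQ;  /* found in step 3 */

        if (forward_class_usage & forward_mask)               /* steps 3 and 4 */
                printf("bad irq dependency: USED_IN_HARDIRQ_READ vs ENABLED_HARDIRQ\n");
        else
                printf("no conflicting usage\n");
        return 0;
}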
2028 2119
2029static void inc_chains(void) 2120static void inc_chains(void)
@@ -2040,9 +2131,8 @@ static void inc_chains(void)
2040 2131
2041#else 2132#else
2042 2133
2043static inline int 2134static inline int check_irq_usage(struct task_struct *curr,
2044check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, 2135 struct held_lock *prev, struct held_lock *next)
2045 struct held_lock *next)
2046{ 2136{
2047 return 1; 2137 return 1;
2048} 2138}
@@ -2170,8 +2260,7 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
2170 */ 2260 */
2171static int 2261static int
2172check_prev_add(struct task_struct *curr, struct held_lock *prev, 2262check_prev_add(struct task_struct *curr, struct held_lock *prev,
2173 struct held_lock *next, int distance, struct stack_trace *trace, 2263 struct held_lock *next, int distance, struct lock_trace *trace)
2174 int (*save)(struct stack_trace *trace))
2175{ 2264{
2176 struct lock_list *uninitialized_var(target_entry); 2265 struct lock_list *uninitialized_var(target_entry);
2177 struct lock_list *entry; 2266 struct lock_list *entry;
@@ -2209,20 +2298,20 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
2209 this.parent = NULL; 2298 this.parent = NULL;
2210 ret = check_noncircular(&this, hlock_class(prev), &target_entry); 2299 ret = check_noncircular(&this, hlock_class(prev), &target_entry);
2211 if (unlikely(!ret)) { 2300 if (unlikely(!ret)) {
2212 if (!trace->entries) { 2301 if (!trace->nr_entries) {
2213 /* 2302 /*
2214 * If @save fails here, the printing might trigger 2303 * If save_trace fails here, the printing might
2215 * a WARN but because of the !nr_entries it should 2304 * trigger a WARN but because of the !nr_entries it
2216 * not do bad things. 2305 * should not do bad things.
2217 */ 2306 */
2218 save(trace); 2307 save_trace(trace);
2219 } 2308 }
2220 return print_circular_bug(&this, target_entry, next, prev, trace); 2309 return print_circular_bug(&this, target_entry, next, prev);
2221 } 2310 }
2222 else if (unlikely(ret < 0)) 2311 else if (unlikely(ret < 0))
2223 return print_bfs_bug(ret); 2312 return print_bfs_bug(ret);
2224 2313
2225 if (!check_prev_add_irq(curr, prev, next)) 2314 if (!check_irq_usage(curr, prev, next))
2226 return 0; 2315 return 0;
2227 2316
2228 /* 2317 /*
@@ -2265,7 +2354,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
2265 return print_bfs_bug(ret); 2354 return print_bfs_bug(ret);
2266 2355
2267 2356
2268 if (!trace->entries && !save(trace)) 2357 if (!trace->nr_entries && !save_trace(trace))
2269 return 0; 2358 return 0;
2270 2359
2271 /* 2360 /*
@@ -2297,14 +2386,9 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
2297static int 2386static int
2298check_prevs_add(struct task_struct *curr, struct held_lock *next) 2387check_prevs_add(struct task_struct *curr, struct held_lock *next)
2299{ 2388{
2389 struct lock_trace trace = { .nr_entries = 0 };
2300 int depth = curr->lockdep_depth; 2390 int depth = curr->lockdep_depth;
2301 struct held_lock *hlock; 2391 struct held_lock *hlock;
2302 struct stack_trace trace = {
2303 .nr_entries = 0,
2304 .max_entries = 0,
2305 .entries = NULL,
2306 .skip = 0,
2307 };
2308 2392
2309 /* 2393 /*
2310 * Debugging checks. 2394 * Debugging checks.
@@ -2330,7 +2414,8 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
2330 * added: 2414 * added:
2331 */ 2415 */
2332 if (hlock->read != 2 && hlock->check) { 2416 if (hlock->read != 2 && hlock->check) {
2333 int ret = check_prev_add(curr, hlock, next, distance, &trace, save_trace); 2417 int ret = check_prev_add(curr, hlock, next, distance,
2418 &trace);
2334 if (!ret) 2419 if (!ret)
2335 return 0; 2420 return 0;
2336 2421
@@ -2731,6 +2816,10 @@ static inline int validate_chain(struct task_struct *curr,
2731{ 2816{
2732 return 1; 2817 return 1;
2733} 2818}
2819
2820static void print_lock_trace(struct lock_trace *trace, unsigned int spaces)
2821{
2822}
2734#endif 2823#endif
2735 2824
2736/* 2825/*
@@ -2784,6 +2873,12 @@ static void check_chain_key(struct task_struct *curr)
2784#endif 2873#endif
2785} 2874}
2786 2875
2876static int mark_lock(struct task_struct *curr, struct held_lock *this,
2877 enum lock_usage_bit new_bit);
2878
2879#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
2880
2881
2787static void 2882static void
2788print_usage_bug_scenario(struct held_lock *lock) 2883print_usage_bug_scenario(struct held_lock *lock)
2789{ 2884{
@@ -2827,7 +2922,7 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
2827 print_lock(this); 2922 print_lock(this);
2828 2923
2829 pr_warn("{%s} state was registered at:\n", usage_str[prev_bit]); 2924 pr_warn("{%s} state was registered at:\n", usage_str[prev_bit]);
2830 print_stack_trace(hlock_class(this)->usage_traces + prev_bit, 1); 2925 print_lock_trace(hlock_class(this)->usage_traces + prev_bit, 1);
2831 2926
2832 print_irqtrace_events(curr); 2927 print_irqtrace_events(curr);
2833 pr_warn("\nother info that might help us debug this:\n"); 2928 pr_warn("\nother info that might help us debug this:\n");
@@ -2853,10 +2948,6 @@ valid_state(struct task_struct *curr, struct held_lock *this,
2853 return 1; 2948 return 1;
2854} 2949}
2855 2950
2856static int mark_lock(struct task_struct *curr, struct held_lock *this,
2857 enum lock_usage_bit new_bit);
2858
2859#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
2860 2951
2861/* 2952/*
2862 * print irq inversion bug: 2953 * print irq inversion bug:
@@ -2936,7 +3027,7 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this,
2936 3027
2937 root.parent = NULL; 3028 root.parent = NULL;
2938 root.class = hlock_class(this); 3029 root.class = hlock_class(this);
2939 ret = find_usage_forwards(&root, bit, &target_entry); 3030 ret = find_usage_forwards(&root, lock_flag(bit), &target_entry);
2940 if (ret < 0) 3031 if (ret < 0)
2941 return print_bfs_bug(ret); 3032 return print_bfs_bug(ret);
2942 if (ret == 1) 3033 if (ret == 1)
@@ -2960,7 +3051,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
2960 3051
2961 root.parent = NULL; 3052 root.parent = NULL;
2962 root.class = hlock_class(this); 3053 root.class = hlock_class(this);
2963 ret = find_usage_backwards(&root, bit, &target_entry); 3054 ret = find_usage_backwards(&root, lock_flag(bit), &target_entry);
2964 if (ret < 0) 3055 if (ret < 0)
2965 return print_bfs_bug(ret); 3056 return print_bfs_bug(ret);
2966 if (ret == 1) 3057 if (ret == 1)
@@ -3015,7 +3106,7 @@ static int (*state_verbose_f[])(struct lock_class *class) = {
3015static inline int state_verbose(enum lock_usage_bit bit, 3106static inline int state_verbose(enum lock_usage_bit bit,
3016 struct lock_class *class) 3107 struct lock_class *class)
3017{ 3108{
3018 return state_verbose_f[bit >> 2](class); 3109 return state_verbose_f[bit >> LOCK_USAGE_DIR_MASK](class);
3019} 3110}
3020 3111
3021typedef int (*check_usage_f)(struct task_struct *, struct held_lock *, 3112typedef int (*check_usage_f)(struct task_struct *, struct held_lock *,
@@ -3157,7 +3248,7 @@ void lockdep_hardirqs_on(unsigned long ip)
3157 /* 3248 /*
3158 * See the fine text that goes along with this variable definition. 3249 * See the fine text that goes along with this variable definition.
3159 */ 3250 */
3160 if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled))) 3251 if (DEBUG_LOCKS_WARN_ON(early_boot_irqs_disabled))
3161 return; 3252 return;
3162 3253
3163 /* 3254 /*
@@ -4689,8 +4780,8 @@ static void free_zapped_rcu(struct rcu_head *ch)
4689 return; 4780 return;
4690 4781
4691 raw_local_irq_save(flags); 4782 raw_local_irq_save(flags);
4692 if (!graph_lock()) 4783 arch_spin_lock(&lockdep_lock);
4693 goto out_irq; 4784 current->lockdep_recursion = 1;
4694 4785
4695 /* closed head */ 4786 /* closed head */
4696 pf = delayed_free.pf + (delayed_free.index ^ 1); 4787 pf = delayed_free.pf + (delayed_free.index ^ 1);
@@ -4702,8 +4793,8 @@ static void free_zapped_rcu(struct rcu_head *ch)
4702 */ 4793 */
4703 call_rcu_zapped(delayed_free.pf + delayed_free.index); 4794 call_rcu_zapped(delayed_free.pf + delayed_free.index);
4704 4795
4705 graph_unlock(); 4796 current->lockdep_recursion = 0;
4706out_irq: 4797 arch_spin_unlock(&lockdep_lock);
4707 raw_local_irq_restore(flags); 4798 raw_local_irq_restore(flags);
4708} 4799}
4709 4800
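Replacing graph_lock() with a raw arch_spin_lock() plus current->lockdep_recursion follows a common recursion-guard pattern: raise a per-task flag around the critical section so that any lockdep hook fired from inside it bails out instead of re-entering the machinery. A minimal user-space sketch of that pattern, with a C11 thread-local flag standing in for current->lockdep_recursion:

#include <stdio.h>

/* Thread-local stand-in for current->lockdep_recursion. */
static _Thread_local int lockdep_recursion;

/* Instrumentation hook: must not re-enter the core while it runs. */
static void lockdep_hook(const char *what)
{
        if (lockdep_recursion)
                return;                 /* called from inside the core: bail out */
        printf("tracking: %s\n", what);
}

/* Core operation whose internals may themselves trigger hooks. */
static void zap_classes(void)
{
        lockdep_recursion = 1;          /* nothing below is tracked */
        lockdep_hook("internal call (ignored)");
        /* ... tear down graph data structures ... */
        lockdep_recursion = 0;
}

int main(void)
{
        lockdep_hook("normal acquire"); /* tracked */
        zap_classes();                  /* hooks fired from inside are ignored */
        return 0;
}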
@@ -4744,21 +4835,17 @@ static void lockdep_free_key_range_reg(void *start, unsigned long size)
4744{ 4835{
4745 struct pending_free *pf; 4836 struct pending_free *pf;
4746 unsigned long flags; 4837 unsigned long flags;
4747 int locked;
4748 4838
4749 init_data_structures_once(); 4839 init_data_structures_once();
4750 4840
4751 raw_local_irq_save(flags); 4841 raw_local_irq_save(flags);
4752 locked = graph_lock(); 4842 arch_spin_lock(&lockdep_lock);
4753 if (!locked) 4843 current->lockdep_recursion = 1;
4754 goto out_irq;
4755
4756 pf = get_pending_free(); 4844 pf = get_pending_free();
4757 __lockdep_free_key_range(pf, start, size); 4845 __lockdep_free_key_range(pf, start, size);
4758 call_rcu_zapped(pf); 4846 call_rcu_zapped(pf);
4759 4847 current->lockdep_recursion = 0;
4760 graph_unlock(); 4848 arch_spin_unlock(&lockdep_lock);
4761out_irq:
4762 raw_local_irq_restore(flags); 4849 raw_local_irq_restore(flags);
4763 4850
4764 /* 4851 /*
diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h
index d4c197425f68..150ec3f0c5b5 100644
--- a/kernel/locking/lockdep_internals.h
+++ b/kernel/locking/lockdep_internals.h
@@ -42,13 +42,35 @@ enum {
42 __LOCKF(USED) 42 __LOCKF(USED)
43}; 43};
44 44
45#define LOCKF_ENABLED_IRQ (LOCKF_ENABLED_HARDIRQ | LOCKF_ENABLED_SOFTIRQ) 45#define LOCKDEP_STATE(__STATE) LOCKF_ENABLED_##__STATE |
46#define LOCKF_USED_IN_IRQ (LOCKF_USED_IN_HARDIRQ | LOCKF_USED_IN_SOFTIRQ) 46static const unsigned long LOCKF_ENABLED_IRQ =
47#include "lockdep_states.h"
48 0;
49#undef LOCKDEP_STATE
50
51#define LOCKDEP_STATE(__STATE) LOCKF_USED_IN_##__STATE |
52static const unsigned long LOCKF_USED_IN_IRQ =
53#include "lockdep_states.h"
54 0;
55#undef LOCKDEP_STATE
56
57#define LOCKDEP_STATE(__STATE) LOCKF_ENABLED_##__STATE##_READ |
58static const unsigned long LOCKF_ENABLED_IRQ_READ =
59#include "lockdep_states.h"
60 0;
61#undef LOCKDEP_STATE
62
63#define LOCKDEP_STATE(__STATE) LOCKF_USED_IN_##__STATE##_READ |
64static const unsigned long LOCKF_USED_IN_IRQ_READ =
65#include "lockdep_states.h"
66 0;
67#undef LOCKDEP_STATE
68
69#define LOCKF_ENABLED_IRQ_ALL (LOCKF_ENABLED_IRQ | LOCKF_ENABLED_IRQ_READ)
70#define LOCKF_USED_IN_IRQ_ALL (LOCKF_USED_IN_IRQ | LOCKF_USED_IN_IRQ_READ)
47 71
48#define LOCKF_ENABLED_IRQ_READ \ 72#define LOCKF_IRQ (LOCKF_ENABLED_IRQ | LOCKF_USED_IN_IRQ)
49 (LOCKF_ENABLED_HARDIRQ_READ | LOCKF_ENABLED_SOFTIRQ_READ) 73#define LOCKF_IRQ_READ (LOCKF_ENABLED_IRQ_READ | LOCKF_USED_IN_IRQ_READ)
50#define LOCKF_USED_IN_IRQ_READ \
51 (LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ)
52 74
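The new constants rely on the usual x-macro construction: each re-expansion of LOCKDEP_STATE() pastes one OR-term in front of the terminating 0. A standalone sketch of the same trick follows; since lockdep_states.h is not shown here, a local LOCK_STATES() list plays its role and the LOCKF_* values are made up for illustration.

#include <stdio.h>

/* Stand-in for lockdep_states.h: one X() per state.  The kernel #includes
 * the real header repeatedly; the expansion idea is the same. */
#define LOCK_STATES(X) X(HARDIRQ) X(SOFTIRQ)

/* Made-up flag values for illustration. */
enum { LOCKF_ENABLED_HARDIRQ = 1, LOCKF_ENABLED_SOFTIRQ = 2,
       LOCKF_USED_IN_HARDIRQ = 4, LOCKF_USED_IN_SOFTIRQ = 8 };

/* Each expansion pastes "LOCKF_ENABLED_<state> |" in front of the final 0. */
#define LOCKDEP_STATE(s) LOCKF_ENABLED_##s |
static const unsigned long LOCKF_ENABLED_IRQ = LOCK_STATES(LOCKDEP_STATE) 0;
#undef LOCKDEP_STATE

#define LOCKDEP_STATE(s) LOCKF_USED_IN_##s |
static const unsigned long LOCKF_USED_IN_IRQ = LOCK_STATES(LOCKDEP_STATE) 0;
#undef LOCKDEP_STATE

int main(void)
{
        printf("LOCKF_ENABLED_IRQ = 0x%lx, LOCKF_USED_IN_IRQ = 0x%lx\n",
               LOCKF_ENABLED_IRQ, LOCKF_USED_IN_IRQ);
        return 0;
}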
53/* 75/*
54 * CONFIG_LOCKDEP_SMALL is defined for sparc. Sparc requires .text, 76 * CONFIG_LOCKDEP_SMALL is defined for sparc. Sparc requires .text,
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index ad40a2617063..80a463d31a8d 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -829,7 +829,9 @@ static void lock_torture_cleanup(void)
829 "End of test: SUCCESS"); 829 "End of test: SUCCESS");
830 830
831 kfree(cxt.lwsa); 831 kfree(cxt.lwsa);
832 cxt.lwsa = NULL;
832 kfree(cxt.lrsa); 833 kfree(cxt.lrsa);
834 cxt.lrsa = NULL;
833 835
834end: 836end:
835 torture_cleanup_end(); 837 torture_cleanup_end();
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index 883cf1b92d90..f17dad99eec8 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -7,6 +7,8 @@
7#include <linux/sched.h> 7#include <linux/sched.h>
8#include <linux/errno.h> 8#include <linux/errno.h>
9 9
10#include "rwsem.h"
11
10int __percpu_init_rwsem(struct percpu_rw_semaphore *sem, 12int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
11 const char *name, struct lock_class_key *rwsem_key) 13 const char *name, struct lock_class_key *rwsem_key)
12{ 14{
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 5e9247dc2515..e14b32c69639 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -395,7 +395,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
395 * 0,1,0 -> 0,0,1 395 * 0,1,0 -> 0,0,1
396 */ 396 */
397 clear_pending_set_locked(lock); 397 clear_pending_set_locked(lock);
398 qstat_inc(qstat_lock_pending, true); 398 lockevent_inc(lock_pending);
399 return; 399 return;
400 400
401 /* 401 /*
@@ -403,7 +403,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
403 * queuing. 403 * queuing.
404 */ 404 */
405queue: 405queue:
406 qstat_inc(qstat_lock_slowpath, true); 406 lockevent_inc(lock_slowpath);
407pv_queue: 407pv_queue:
408 node = this_cpu_ptr(&qnodes[0].mcs); 408 node = this_cpu_ptr(&qnodes[0].mcs);
409 idx = node->count++; 409 idx = node->count++;
@@ -419,7 +419,7 @@ pv_queue:
419 * simple enough. 419 * simple enough.
420 */ 420 */
421 if (unlikely(idx >= MAX_NODES)) { 421 if (unlikely(idx >= MAX_NODES)) {
422 qstat_inc(qstat_lock_no_node, true); 422 lockevent_inc(lock_no_node);
423 while (!queued_spin_trylock(lock)) 423 while (!queued_spin_trylock(lock))
424 cpu_relax(); 424 cpu_relax();
425 goto release; 425 goto release;
@@ -430,7 +430,7 @@ pv_queue:
430 /* 430 /*
431 * Keep counts of non-zero index values: 431 * Keep counts of non-zero index values:
432 */ 432 */
433 qstat_inc(qstat_lock_use_node2 + idx - 1, idx); 433 lockevent_cond_inc(lock_use_node2 + idx - 1, idx);
434 434
435 /* 435 /*
436 * Ensure that we increment the head node->count before initialising 436 * Ensure that we increment the head node->count before initialising
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 8f36c27c1794..89bab079e7a4 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -89,7 +89,7 @@ static inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock)
89 89
90 if (!(val & _Q_LOCKED_PENDING_MASK) && 90 if (!(val & _Q_LOCKED_PENDING_MASK) &&
91 (cmpxchg_acquire(&lock->locked, 0, _Q_LOCKED_VAL) == 0)) { 91 (cmpxchg_acquire(&lock->locked, 0, _Q_LOCKED_VAL) == 0)) {
92 qstat_inc(qstat_pv_lock_stealing, true); 92 lockevent_inc(pv_lock_stealing);
93 return true; 93 return true;
94 } 94 }
95 if (!(val & _Q_TAIL_MASK) || (val & _Q_PENDING_MASK)) 95 if (!(val & _Q_TAIL_MASK) || (val & _Q_PENDING_MASK))
@@ -219,7 +219,7 @@ static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node)
219 hopcnt++; 219 hopcnt++;
220 if (!cmpxchg(&he->lock, NULL, lock)) { 220 if (!cmpxchg(&he->lock, NULL, lock)) {
221 WRITE_ONCE(he->node, node); 221 WRITE_ONCE(he->node, node);
222 qstat_hop(hopcnt); 222 lockevent_pv_hop(hopcnt);
223 return &he->lock; 223 return &he->lock;
224 } 224 }
225 } 225 }
@@ -320,8 +320,8 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
320 smp_store_mb(pn->state, vcpu_halted); 320 smp_store_mb(pn->state, vcpu_halted);
321 321
322 if (!READ_ONCE(node->locked)) { 322 if (!READ_ONCE(node->locked)) {
323 qstat_inc(qstat_pv_wait_node, true); 323 lockevent_inc(pv_wait_node);
324 qstat_inc(qstat_pv_wait_early, wait_early); 324 lockevent_cond_inc(pv_wait_early, wait_early);
325 pv_wait(&pn->state, vcpu_halted); 325 pv_wait(&pn->state, vcpu_halted);
326 } 326 }
327 327
@@ -339,7 +339,8 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
339 * So it is better to spin for a while in the hope that the 339 * So it is better to spin for a while in the hope that the
340 * MCS lock will be released soon. 340 * MCS lock will be released soon.
341 */ 341 */
342 qstat_inc(qstat_pv_spurious_wakeup, !READ_ONCE(node->locked)); 342 lockevent_cond_inc(pv_spurious_wakeup,
343 !READ_ONCE(node->locked));
343 } 344 }
344 345
345 /* 346 /*
@@ -416,7 +417,7 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
416 /* 417 /*
417 * Tracking # of slowpath locking operations 418 * Tracking # of slowpath locking operations
418 */ 419 */
419 qstat_inc(qstat_lock_slowpath, true); 420 lockevent_inc(lock_slowpath);
420 421
421 for (;; waitcnt++) { 422 for (;; waitcnt++) {
422 /* 423 /*
@@ -464,8 +465,8 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
464 } 465 }
465 } 466 }
466 WRITE_ONCE(pn->state, vcpu_hashed); 467 WRITE_ONCE(pn->state, vcpu_hashed);
467 qstat_inc(qstat_pv_wait_head, true); 468 lockevent_inc(pv_wait_head);
468 qstat_inc(qstat_pv_wait_again, waitcnt); 469 lockevent_cond_inc(pv_wait_again, waitcnt);
469 pv_wait(&lock->locked, _Q_SLOW_VAL); 470 pv_wait(&lock->locked, _Q_SLOW_VAL);
470 471
471 /* 472 /*
@@ -528,7 +529,7 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
528 * vCPU is harmless other than the additional latency in completing 529 * vCPU is harmless other than the additional latency in completing
529 * the unlock. 530 * the unlock.
530 */ 531 */
531 qstat_inc(qstat_pv_kick_unlock, true); 532 lockevent_inc(pv_kick_unlock);
532 pv_kick(node->cpu); 533 pv_kick(node->cpu);
533} 534}
534 535
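The conversions in this file follow a mechanical rule: qstat_inc(ev, true) becomes lockevent_inc(ev) and qstat_inc(ev, cond) becomes lockevent_cond_inc(ev, cond). A rough user-space model of what those helpers are assumed to boil down to, a per-CPU counter increment flattened here to a plain array (the real macros compile away entirely when CONFIG_LOCK_EVENT_COUNTS is off):

#include <stdio.h>

/* Hypothetical event ids and a flat array standing in for per-CPU counters. */
enum lockevent { LOCKEVENT_lock_pending, LOCKEVENT_pv_wait_early, LOCKEVENT_num };
static unsigned long lockevents[LOCKEVENT_num];

/* Unconditional increment: replaces qstat_inc(ev, true). */
#define lockevent_inc(ev)            (lockevents[LOCKEVENT_##ev]++)
/* Conditional increment: replaces qstat_inc(ev, cond). */
#define lockevent_cond_inc(ev, cond) do { if (cond) lockevent_inc(ev); } while (0)

int main(void)
{
        int wait_early = 0;

        lockevent_inc(lock_pending);                   /* always counted      */
        lockevent_cond_inc(pv_wait_early, wait_early); /* counted only if set */

        printf("lock_pending=%lu pv_wait_early=%lu\n",
               lockevents[LOCKEVENT_lock_pending],
               lockevents[LOCKEVENT_pv_wait_early]);
        return 0;
}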
diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h
index d73f85388d5c..54152670ff24 100644
--- a/kernel/locking/qspinlock_stat.h
+++ b/kernel/locking/qspinlock_stat.h
@@ -9,262 +9,105 @@
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 * GNU General Public License for more details. 10 * GNU General Public License for more details.
11 * 11 *
12 * Authors: Waiman Long <waiman.long@hpe.com> 12 * Authors: Waiman Long <longman@redhat.com>
13 */ 13 */
14 14
15/* 15#include "lock_events.h"
16 * When queued spinlock statistical counters are enabled, the following
17 * debugfs files will be created for reporting the counter values:
18 *
19 * <debugfs>/qlockstat/
20 * pv_hash_hops - average # of hops per hashing operation
21 * pv_kick_unlock - # of vCPU kicks issued at unlock time
22 * pv_kick_wake - # of vCPU kicks used for computing pv_latency_wake
23 * pv_latency_kick - average latency (ns) of vCPU kick operation
24 * pv_latency_wake - average latency (ns) from vCPU kick to wakeup
25 * pv_lock_stealing - # of lock stealing operations
26 * pv_spurious_wakeup - # of spurious wakeups in non-head vCPUs
27 * pv_wait_again - # of wait's after a queue head vCPU kick
28 * pv_wait_early - # of early vCPU wait's
29 * pv_wait_head - # of vCPU wait's at the queue head
30 * pv_wait_node - # of vCPU wait's at a non-head queue node
31 * lock_pending - # of locking operations via pending code
32 * lock_slowpath - # of locking operations via MCS lock queue
33 * lock_use_node2 - # of locking operations that use 2nd per-CPU node
34 * lock_use_node3 - # of locking operations that use 3rd per-CPU node
35 * lock_use_node4 - # of locking operations that use 4th per-CPU node
36 * lock_no_node - # of locking operations without using per-CPU node
37 *
38 * Subtracting lock_use_node[234] from lock_slowpath will give you
39 * lock_use_node1.
40 *
41 * Writing to the "reset_counters" file will reset all the above counter
42 * values.
43 *
44 * These statistical counters are implemented as per-cpu variables which are
45 * summed and computed whenever the corresponding debugfs files are read. This
46 * minimizes added overhead making the counters usable even in a production
47 * environment.
48 *
49 * There may be slight difference between pv_kick_wake and pv_kick_unlock.
50 */
51enum qlock_stats {
52 qstat_pv_hash_hops,
53 qstat_pv_kick_unlock,
54 qstat_pv_kick_wake,
55 qstat_pv_latency_kick,
56 qstat_pv_latency_wake,
57 qstat_pv_lock_stealing,
58 qstat_pv_spurious_wakeup,
59 qstat_pv_wait_again,
60 qstat_pv_wait_early,
61 qstat_pv_wait_head,
62 qstat_pv_wait_node,
63 qstat_lock_pending,
64 qstat_lock_slowpath,
65 qstat_lock_use_node2,
66 qstat_lock_use_node3,
67 qstat_lock_use_node4,
68 qstat_lock_no_node,
69 qstat_num, /* Total number of statistical counters */
70 qstat_reset_cnts = qstat_num,
71};
72 16
73#ifdef CONFIG_QUEUED_LOCK_STAT 17#ifdef CONFIG_LOCK_EVENT_COUNTS
18#ifdef CONFIG_PARAVIRT_SPINLOCKS
74/* 19/*
75 * Collect pvqspinlock statistics 20 * Collect pvqspinlock locking event counts
76 */ 21 */
77#include <linux/debugfs.h>
78#include <linux/sched.h> 22#include <linux/sched.h>
79#include <linux/sched/clock.h> 23#include <linux/sched/clock.h>
80#include <linux/fs.h> 24#include <linux/fs.h>
81 25
82static const char * const qstat_names[qstat_num + 1] = { 26#define EVENT_COUNT(ev) lockevents[LOCKEVENT_ ## ev]
83 [qstat_pv_hash_hops] = "pv_hash_hops",
84 [qstat_pv_kick_unlock] = "pv_kick_unlock",
85 [qstat_pv_kick_wake] = "pv_kick_wake",
86 [qstat_pv_spurious_wakeup] = "pv_spurious_wakeup",
87 [qstat_pv_latency_kick] = "pv_latency_kick",
88 [qstat_pv_latency_wake] = "pv_latency_wake",
89 [qstat_pv_lock_stealing] = "pv_lock_stealing",
90 [qstat_pv_wait_again] = "pv_wait_again",
91 [qstat_pv_wait_early] = "pv_wait_early",
92 [qstat_pv_wait_head] = "pv_wait_head",
93 [qstat_pv_wait_node] = "pv_wait_node",
94 [qstat_lock_pending] = "lock_pending",
95 [qstat_lock_slowpath] = "lock_slowpath",
96 [qstat_lock_use_node2] = "lock_use_node2",
97 [qstat_lock_use_node3] = "lock_use_node3",
98 [qstat_lock_use_node4] = "lock_use_node4",
99 [qstat_lock_no_node] = "lock_no_node",
100 [qstat_reset_cnts] = "reset_counters",
101};
102 27
103/* 28/*
104 * Per-cpu counters 29 * PV specific per-cpu counter
105 */ 30 */
106static DEFINE_PER_CPU(unsigned long, qstats[qstat_num]);
107static DEFINE_PER_CPU(u64, pv_kick_time); 31static DEFINE_PER_CPU(u64, pv_kick_time);
108 32
109/* 33/*
110 * Function to read and return the qlock statistical counter values 34 * Function to read and return the PV qspinlock counts.
111 * 35 *
112 * The following counters are handled specially: 36 * The following counters are handled specially:
113 * 1. qstat_pv_latency_kick 37 * 1. pv_latency_kick
114 * Average kick latency (ns) = pv_latency_kick/pv_kick_unlock 38 * Average kick latency (ns) = pv_latency_kick/pv_kick_unlock
115 * 2. qstat_pv_latency_wake 39 * 2. pv_latency_wake
116 * Average wake latency (ns) = pv_latency_wake/pv_kick_wake 40 * Average wake latency (ns) = pv_latency_wake/pv_kick_wake
117 * 3. qstat_pv_hash_hops 41 * 3. pv_hash_hops
118 * Average hops/hash = pv_hash_hops/pv_kick_unlock 42 * Average hops/hash = pv_hash_hops/pv_kick_unlock
119 */ 43 */
120static ssize_t qstat_read(struct file *file, char __user *user_buf, 44ssize_t lockevent_read(struct file *file, char __user *user_buf,
121 size_t count, loff_t *ppos) 45 size_t count, loff_t *ppos)
122{ 46{
123 char buf[64]; 47 char buf[64];
124 int cpu, counter, len; 48 int cpu, id, len;
125 u64 stat = 0, kicks = 0; 49 u64 sum = 0, kicks = 0;
126 50
127 /* 51 /*
128 * Get the counter ID stored in file->f_inode->i_private 52 * Get the counter ID stored in file->f_inode->i_private
129 */ 53 */
130 counter = (long)file_inode(file)->i_private; 54 id = (long)file_inode(file)->i_private;
131 55
132 if (counter >= qstat_num) 56 if (id >= lockevent_num)
133 return -EBADF; 57 return -EBADF;
134 58
135 for_each_possible_cpu(cpu) { 59 for_each_possible_cpu(cpu) {
136 stat += per_cpu(qstats[counter], cpu); 60 sum += per_cpu(lockevents[id], cpu);
137 /* 61 /*
138 * Need to sum additional counter for some of them 62 * Need to sum additional counters for some of them
139 */ 63 */
140 switch (counter) { 64 switch (id) {
141 65
142 case qstat_pv_latency_kick: 66 case LOCKEVENT_pv_latency_kick:
143 case qstat_pv_hash_hops: 67 case LOCKEVENT_pv_hash_hops:
144 kicks += per_cpu(qstats[qstat_pv_kick_unlock], cpu); 68 kicks += per_cpu(EVENT_COUNT(pv_kick_unlock), cpu);
145 break; 69 break;
146 70
147 case qstat_pv_latency_wake: 71 case LOCKEVENT_pv_latency_wake:
148 kicks += per_cpu(qstats[qstat_pv_kick_wake], cpu); 72 kicks += per_cpu(EVENT_COUNT(pv_kick_wake), cpu);
149 break; 73 break;
150 } 74 }
151 } 75 }
152 76
153 if (counter == qstat_pv_hash_hops) { 77 if (id == LOCKEVENT_pv_hash_hops) {
154 u64 frac = 0; 78 u64 frac = 0;
155 79
156 if (kicks) { 80 if (kicks) {
157 frac = 100ULL * do_div(stat, kicks); 81 frac = 100ULL * do_div(sum, kicks);
158 frac = DIV_ROUND_CLOSEST_ULL(frac, kicks); 82 frac = DIV_ROUND_CLOSEST_ULL(frac, kicks);
159 } 83 }
160 84
161 /* 85 /*
162 * Return a X.XX decimal number 86 * Return a X.XX decimal number
163 */ 87 */
164 len = snprintf(buf, sizeof(buf) - 1, "%llu.%02llu\n", stat, frac); 88 len = snprintf(buf, sizeof(buf) - 1, "%llu.%02llu\n",
89 sum, frac);
165 } else { 90 } else {
166 /* 91 /*
167 * Round to the nearest ns 92 * Round to the nearest ns
168 */ 93 */
169 if ((counter == qstat_pv_latency_kick) || 94 if ((id == LOCKEVENT_pv_latency_kick) ||
170 (counter == qstat_pv_latency_wake)) { 95 (id == LOCKEVENT_pv_latency_wake)) {
171 if (kicks) 96 if (kicks)
172 stat = DIV_ROUND_CLOSEST_ULL(stat, kicks); 97 sum = DIV_ROUND_CLOSEST_ULL(sum, kicks);
173 } 98 }
174 len = snprintf(buf, sizeof(buf) - 1, "%llu\n", stat); 99 len = snprintf(buf, sizeof(buf) - 1, "%llu\n", sum);
175 } 100 }
176 101
177 return simple_read_from_buffer(user_buf, count, ppos, buf, len); 102 return simple_read_from_buffer(user_buf, count, ppos, buf, len);
178} 103}
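The pv_hash_hops case prints an average with two decimals using integer arithmetic only: do_div() leaves the quotient in sum and returns the remainder, which is then scaled by 100 and rounded. A user-space equivalent, with the kernel helpers do_div() and DIV_ROUND_CLOSEST_ULL() spelled out by hand:

#include <stdio.h>
#include <inttypes.h>

/* Print sum/kicks as "X.XX" with plain 64-bit arithmetic, mirroring the
 * do_div() + DIV_ROUND_CLOSEST_ULL() sequence in lockevent_read(). */
static void print_avg(uint64_t sum, uint64_t kicks)
{
        uint64_t frac = 0;

        if (kicks) {
                frac = 100 * (sum % kicks);         /* remainder, scaled */
                frac = (frac + kicks / 2) / kicks;  /* round to nearest  */
                sum  = sum / kicks;                 /* integer quotient  */
        }
        printf("%" PRIu64 ".%02" PRIu64 "\n", sum, frac);
}

int main(void)
{
        print_avg(257, 100);    /* prints 2.57 */
        print_avg(10, 3);       /* prints 3.33 */
        print_avg(5, 0);        /* prints 5.00: no kicks recorded yet */
        return 0;
}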
179 104
180/* 105/*
181 * Function to handle write request
182 *
183 * When counter = reset_cnts, reset all the counter values.
184 * Since the counter updates aren't atomic, the resetting is done twice
185 * to make sure that the counters are very likely to be all cleared.
186 */
187static ssize_t qstat_write(struct file *file, const char __user *user_buf,
188 size_t count, loff_t *ppos)
189{
190 int cpu;
191
192 /*
193 * Get the counter ID stored in file->f_inode->i_private
194 */
195 if ((long)file_inode(file)->i_private != qstat_reset_cnts)
196 return count;
197
198 for_each_possible_cpu(cpu) {
199 int i;
200 unsigned long *ptr = per_cpu_ptr(qstats, cpu);
201
202 for (i = 0 ; i < qstat_num; i++)
203 WRITE_ONCE(ptr[i], 0);
204 }
205 return count;
206}
207
208/*
209 * Debugfs data structures
210 */
211static const struct file_operations fops_qstat = {
212 .read = qstat_read,
213 .write = qstat_write,
214 .llseek = default_llseek,
215};
216
217/*
218 * Initialize debugfs for the qspinlock statistical counters
219 */
220static int __init init_qspinlock_stat(void)
221{
222 struct dentry *d_qstat = debugfs_create_dir("qlockstat", NULL);
223 int i;
224
225 if (!d_qstat)
226 goto out;
227
228 /*
229 * Create the debugfs files
230 *
231 * As reading from and writing to the stat files can be slow, only
232 * root is allowed to do the read/write to limit impact to system
233 * performance.
234 */
235 for (i = 0; i < qstat_num; i++)
236 if (!debugfs_create_file(qstat_names[i], 0400, d_qstat,
237 (void *)(long)i, &fops_qstat))
238 goto fail_undo;
239
240 if (!debugfs_create_file(qstat_names[qstat_reset_cnts], 0200, d_qstat,
241 (void *)(long)qstat_reset_cnts, &fops_qstat))
242 goto fail_undo;
243
244 return 0;
245fail_undo:
246 debugfs_remove_recursive(d_qstat);
247out:
248 pr_warn("Could not create 'qlockstat' debugfs entries\n");
249 return -ENOMEM;
250}
251fs_initcall(init_qspinlock_stat);
252
253/*
254 * Increment the PV qspinlock statistical counters
255 */
256static inline void qstat_inc(enum qlock_stats stat, bool cond)
257{
258 if (cond)
259 this_cpu_inc(qstats[stat]);
260}
261
262/*
263 * PV hash hop count 106 * PV hash hop count
264 */ 107 */
265static inline void qstat_hop(int hopcnt) 108static inline void lockevent_pv_hop(int hopcnt)
266{ 109{
267 this_cpu_add(qstats[qstat_pv_hash_hops], hopcnt); 110 this_cpu_add(EVENT_COUNT(pv_hash_hops), hopcnt);
268} 111}
269 112
270/* 113/*
@@ -276,7 +119,7 @@ static inline void __pv_kick(int cpu)
276 119
277 per_cpu(pv_kick_time, cpu) = start; 120 per_cpu(pv_kick_time, cpu) = start;
278 pv_kick(cpu); 121 pv_kick(cpu);
279 this_cpu_add(qstats[qstat_pv_latency_kick], sched_clock() - start); 122 this_cpu_add(EVENT_COUNT(pv_latency_kick), sched_clock() - start);
280} 123}
281 124
282/* 125/*
@@ -289,18 +132,19 @@ static inline void __pv_wait(u8 *ptr, u8 val)
289 *pkick_time = 0; 132 *pkick_time = 0;
290 pv_wait(ptr, val); 133 pv_wait(ptr, val);
291 if (*pkick_time) { 134 if (*pkick_time) {
292 this_cpu_add(qstats[qstat_pv_latency_wake], 135 this_cpu_add(EVENT_COUNT(pv_latency_wake),
293 sched_clock() - *pkick_time); 136 sched_clock() - *pkick_time);
294 qstat_inc(qstat_pv_kick_wake, true); 137 lockevent_inc(pv_kick_wake);
295 } 138 }
296} 139}
297 140
298#define pv_kick(c) __pv_kick(c) 141#define pv_kick(c) __pv_kick(c)
299#define pv_wait(p, v) __pv_wait(p, v) 142#define pv_wait(p, v) __pv_wait(p, v)
300 143
301#else /* CONFIG_QUEUED_LOCK_STAT */ 144#endif /* CONFIG_PARAVIRT_SPINLOCKS */
145
146#else /* CONFIG_LOCK_EVENT_COUNTS */
302 147
303static inline void qstat_inc(enum qlock_stats stat, bool cond) { } 148static inline void lockevent_pv_hop(int hopcnt) { }
304static inline void qstat_hop(int hopcnt) { }
305 149
306#endif /* CONFIG_QUEUED_LOCK_STAT */ 150#endif /* CONFIG_LOCK_EVENT_COUNTS */
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
deleted file mode 100644
index a7ffb2a96ede..000000000000
--- a/kernel/locking/rwsem-spinlock.c
+++ /dev/null
@@ -1,339 +0,0 @@
1// SPDX-License-Identifier: GPL-2.0
2/* rwsem-spinlock.c: R/W semaphores: contention handling functions for
3 * generic spinlock implementation
4 *
5 * Copyright (c) 2001 David Howells (dhowells@redhat.com).
6 * - Derived partially from idea by Andrea Arcangeli <andrea@suse.de>
7 * - Derived also from comments by Linus
8 */
9#include <linux/rwsem.h>
10#include <linux/sched/signal.h>
11#include <linux/sched/debug.h>
12#include <linux/export.h>
13
14enum rwsem_waiter_type {
15 RWSEM_WAITING_FOR_WRITE,
16 RWSEM_WAITING_FOR_READ
17};
18
19struct rwsem_waiter {
20 struct list_head list;
21 struct task_struct *task;
22 enum rwsem_waiter_type type;
23};
24
25int rwsem_is_locked(struct rw_semaphore *sem)
26{
27 int ret = 1;
28 unsigned long flags;
29
30 if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) {
31 ret = (sem->count != 0);
32 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
33 }
34 return ret;
35}
36EXPORT_SYMBOL(rwsem_is_locked);
37
38/*
39 * initialise the semaphore
40 */
41void __init_rwsem(struct rw_semaphore *sem, const char *name,
42 struct lock_class_key *key)
43{
44#ifdef CONFIG_DEBUG_LOCK_ALLOC
45 /*
46 * Make sure we are not reinitializing a held semaphore:
47 */
48 debug_check_no_locks_freed((void *)sem, sizeof(*sem));
49 lockdep_init_map(&sem->dep_map, name, key, 0);
50#endif
51 sem->count = 0;
52 raw_spin_lock_init(&sem->wait_lock);
53 INIT_LIST_HEAD(&sem->wait_list);
54}
55EXPORT_SYMBOL(__init_rwsem);
56
57/*
58 * handle the lock release when processes blocked on it that can now run
59 * - if we come here, then:
60 * - the 'active count' _reached_ zero
61 * - the 'waiting count' is non-zero
62 * - the spinlock must be held by the caller
63 * - woken process blocks are discarded from the list after having task zeroed
64 * - writers are only woken if wakewrite is non-zero
65 */
66static inline struct rw_semaphore *
67__rwsem_do_wake(struct rw_semaphore *sem, int wakewrite)
68{
69 struct rwsem_waiter *waiter;
70 struct task_struct *tsk;
71 int woken;
72
73 waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
74
75 if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
76 if (wakewrite)
77 /* Wake up a writer. Note that we do not grant it the
78 * lock - it will have to acquire it when it runs. */
79 wake_up_process(waiter->task);
80 goto out;
81 }
82
83 /* grant an infinite number of read locks to the front of the queue */
84 woken = 0;
85 do {
86 struct list_head *next = waiter->list.next;
87
88 list_del(&waiter->list);
89 tsk = waiter->task;
90 /*
91 * Make sure we do not wakeup the next reader before
92 * setting the nil condition to grant the next reader;
93 * otherwise we could miss the wakeup on the other
94 * side and end up sleeping again. See the pairing
95 * in rwsem_down_read_failed().
96 */
97 smp_mb();
98 waiter->task = NULL;
99 wake_up_process(tsk);
100 put_task_struct(tsk);
101 woken++;
102 if (next == &sem->wait_list)
103 break;
104 waiter = list_entry(next, struct rwsem_waiter, list);
105 } while (waiter->type != RWSEM_WAITING_FOR_WRITE);
106
107 sem->count += woken;
108
109 out:
110 return sem;
111}
112
113/*
114 * wake a single writer
115 */
116static inline struct rw_semaphore *
117__rwsem_wake_one_writer(struct rw_semaphore *sem)
118{
119 struct rwsem_waiter *waiter;
120
121 waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
122 wake_up_process(waiter->task);
123
124 return sem;
125}
126
127/*
128 * get a read lock on the semaphore
129 */
130int __sched __down_read_common(struct rw_semaphore *sem, int state)
131{
132 struct rwsem_waiter waiter;
133 unsigned long flags;
134
135 raw_spin_lock_irqsave(&sem->wait_lock, flags);
136
137 if (sem->count >= 0 && list_empty(&sem->wait_list)) {
138 /* granted */
139 sem->count++;
140 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
141 goto out;
142 }
143
144 /* set up my own style of waitqueue */
145 waiter.task = current;
146 waiter.type = RWSEM_WAITING_FOR_READ;
147 get_task_struct(current);
148
149 list_add_tail(&waiter.list, &sem->wait_list);
150
151 /* wait to be given the lock */
152 for (;;) {
153 if (!waiter.task)
154 break;
155 if (signal_pending_state(state, current))
156 goto out_nolock;
157 set_current_state(state);
158 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
159 schedule();
160 raw_spin_lock_irqsave(&sem->wait_lock, flags);
161 }
162
163 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
164 out:
165 return 0;
166
167out_nolock:
168 /*
169 * We didn't take the lock, so that there is a writer, which
170 * is owner or the first waiter of the sem. If it's a waiter,
171 * it will be woken by current owner. Not need to wake anybody.
172 */
173 list_del(&waiter.list);
174 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
175 return -EINTR;
176}
177
178void __sched __down_read(struct rw_semaphore *sem)
179{
180 __down_read_common(sem, TASK_UNINTERRUPTIBLE);
181}
182
183int __sched __down_read_killable(struct rw_semaphore *sem)
184{
185 return __down_read_common(sem, TASK_KILLABLE);
186}
187
188/*
189 * trylock for reading -- returns 1 if successful, 0 if contention
190 */
191int __down_read_trylock(struct rw_semaphore *sem)
192{
193 unsigned long flags;
194 int ret = 0;
195
196
197 raw_spin_lock_irqsave(&sem->wait_lock, flags);
198
199 if (sem->count >= 0 && list_empty(&sem->wait_list)) {
200 /* granted */
201 sem->count++;
202 ret = 1;
203 }
204
205 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
206
207 return ret;
208}
209
210/*
211 * get a write lock on the semaphore
212 */
213int __sched __down_write_common(struct rw_semaphore *sem, int state)
214{
215 struct rwsem_waiter waiter;
216 unsigned long flags;
217 int ret = 0;
218
219 raw_spin_lock_irqsave(&sem->wait_lock, flags);
220
221 /* set up my own style of waitqueue */
222 waiter.task = current;
223 waiter.type = RWSEM_WAITING_FOR_WRITE;
224 list_add_tail(&waiter.list, &sem->wait_list);
225
226 /* wait for someone to release the lock */
227 for (;;) {
228 /*
229 * That is the key to support write lock stealing: allows the
230 * task already on CPU to get the lock soon rather than put
231 * itself into sleep and waiting for system woke it or someone
232 * else in the head of the wait list up.
233 */
234 if (sem->count == 0)
235 break;
236 if (signal_pending_state(state, current))
237 goto out_nolock;
238
239 set_current_state(state);
240 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
241 schedule();
242 raw_spin_lock_irqsave(&sem->wait_lock, flags);
243 }
244 /* got the lock */
245 sem->count = -1;
246 list_del(&waiter.list);
247
248 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
249
250 return ret;
251
252out_nolock:
253 list_del(&waiter.list);
254 if (!list_empty(&sem->wait_list) && sem->count >= 0)
255 __rwsem_do_wake(sem, 0);
256 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
257
258 return -EINTR;
259}
260
261void __sched __down_write(struct rw_semaphore *sem)
262{
263 __down_write_common(sem, TASK_UNINTERRUPTIBLE);
264}
265
266int __sched __down_write_killable(struct rw_semaphore *sem)
267{
268 return __down_write_common(sem, TASK_KILLABLE);
269}
270
271/*
272 * trylock for writing -- returns 1 if successful, 0 if contention
273 */
274int __down_write_trylock(struct rw_semaphore *sem)
275{
276 unsigned long flags;
277 int ret = 0;
278
279 raw_spin_lock_irqsave(&sem->wait_lock, flags);
280
281 if (sem->count == 0) {
282 /* got the lock */
283 sem->count = -1;
284 ret = 1;
285 }
286
287 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
288
289 return ret;
290}
291
292/*
293 * release a read lock on the semaphore
294 */
295void __up_read(struct rw_semaphore *sem)
296{
297 unsigned long flags;
298
299 raw_spin_lock_irqsave(&sem->wait_lock, flags);
300
301 if (--sem->count == 0 && !list_empty(&sem->wait_list))
302 sem = __rwsem_wake_one_writer(sem);
303
304 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
305}
306
307/*
308 * release a write lock on the semaphore
309 */
310void __up_write(struct rw_semaphore *sem)
311{
312 unsigned long flags;
313
314 raw_spin_lock_irqsave(&sem->wait_lock, flags);
315
316 sem->count = 0;
317 if (!list_empty(&sem->wait_list))
318 sem = __rwsem_do_wake(sem, 1);
319
320 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
321}
322
323/*
324 * downgrade a write lock into a read lock
325 * - just wake up any readers at the front of the queue
326 */
327void __downgrade_write(struct rw_semaphore *sem)
328{
329 unsigned long flags;
330
331 raw_spin_lock_irqsave(&sem->wait_lock, flags);
332
333 sem->count = 1;
334 if (!list_empty(&sem->wait_list))
335 sem = __rwsem_do_wake(sem, 0);
336
337 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
338}
339
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index fbe96341beee..6b3ee9948bf1 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -147,6 +147,7 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,
147 * will notice the queued writer. 147 * will notice the queued writer.
148 */ 148 */
149 wake_q_add(wake_q, waiter->task); 149 wake_q_add(wake_q, waiter->task);
150 lockevent_inc(rwsem_wake_writer);
150 } 151 }
151 152
152 return; 153 return;
@@ -176,9 +177,8 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,
176 goto try_reader_grant; 177 goto try_reader_grant;
177 } 178 }
178 /* 179 /*
179 * It is not really necessary to set it to reader-owned here, 180 * Set it to reader-owned to give spinners an early
180 * but it gives the spinners an early indication that the 181 * indication that readers now have the lock.
181 * readers now have the lock.
182 */ 182 */
183 __rwsem_set_reader_owned(sem, waiter->task); 183 __rwsem_set_reader_owned(sem, waiter->task);
184 } 184 }
@@ -215,6 +215,7 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,
215 } 215 }
216 216
217 adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; 217 adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment;
218 lockevent_cond_inc(rwsem_wake_reader, woken);
218 if (list_empty(&sem->wait_list)) { 219 if (list_empty(&sem->wait_list)) {
219 /* hit end of list above */ 220 /* hit end of list above */
220 adjustment -= RWSEM_WAITING_BIAS; 221 adjustment -= RWSEM_WAITING_BIAS;
@@ -225,92 +226,6 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,
225} 226}
226 227
227/* 228/*
228 * Wait for the read lock to be granted
229 */
230static inline struct rw_semaphore __sched *
231__rwsem_down_read_failed_common(struct rw_semaphore *sem, int state)
232{
233 long count, adjustment = -RWSEM_ACTIVE_READ_BIAS;
234 struct rwsem_waiter waiter;
235 DEFINE_WAKE_Q(wake_q);
236
237 waiter.task = current;
238 waiter.type = RWSEM_WAITING_FOR_READ;
239
240 raw_spin_lock_irq(&sem->wait_lock);
241 if (list_empty(&sem->wait_list)) {
242 /*
243 * In case the wait queue is empty and the lock isn't owned
244 * by a writer, this reader can exit the slowpath and return
245 * immediately as its RWSEM_ACTIVE_READ_BIAS has already
246 * been set in the count.
247 */
248 if (atomic_long_read(&sem->count) >= 0) {
249 raw_spin_unlock_irq(&sem->wait_lock);
250 return sem;
251 }
252 adjustment += RWSEM_WAITING_BIAS;
253 }
254 list_add_tail(&waiter.list, &sem->wait_list);
255
256 /* we're now waiting on the lock, but no longer actively locking */
257 count = atomic_long_add_return(adjustment, &sem->count);
258
259 /*
260 * If there are no active locks, wake the front queued process(es).
261 *
262 * If there are no writers and we are first in the queue,
263 * wake our own waiter to join the existing active readers !
264 */
265 if (count == RWSEM_WAITING_BIAS ||
266 (count > RWSEM_WAITING_BIAS &&
267 adjustment != -RWSEM_ACTIVE_READ_BIAS))
268 __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
269
270 raw_spin_unlock_irq(&sem->wait_lock);
271 wake_up_q(&wake_q);
272
273 /* wait to be given the lock */
274 while (true) {
275 set_current_state(state);
276 if (!waiter.task)
277 break;
278 if (signal_pending_state(state, current)) {
279 raw_spin_lock_irq(&sem->wait_lock);
280 if (waiter.task)
281 goto out_nolock;
282 raw_spin_unlock_irq(&sem->wait_lock);
283 break;
284 }
285 schedule();
286 }
287
288 __set_current_state(TASK_RUNNING);
289 return sem;
290out_nolock:
291 list_del(&waiter.list);
292 if (list_empty(&sem->wait_list))
293 atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count);
294 raw_spin_unlock_irq(&sem->wait_lock);
295 __set_current_state(TASK_RUNNING);
296 return ERR_PTR(-EINTR);
297}
298
299__visible struct rw_semaphore * __sched
300rwsem_down_read_failed(struct rw_semaphore *sem)
301{
302 return __rwsem_down_read_failed_common(sem, TASK_UNINTERRUPTIBLE);
303}
304EXPORT_SYMBOL(rwsem_down_read_failed);
305
306__visible struct rw_semaphore * __sched
307rwsem_down_read_failed_killable(struct rw_semaphore *sem)
308{
309 return __rwsem_down_read_failed_common(sem, TASK_KILLABLE);
310}
311EXPORT_SYMBOL(rwsem_down_read_failed_killable);
312
313/*
314 * This function must be called with the sem->wait_lock held to prevent 229 * This function must be called with the sem->wait_lock held to prevent
315 * race conditions between checking the rwsem wait list and setting the 230 * race conditions between checking the rwsem wait list and setting the
316 * sem->count accordingly. 231 * sem->count accordingly.
@@ -346,21 +261,17 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
346 */ 261 */
347static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) 262static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
348{ 263{
349 long old, count = atomic_long_read(&sem->count); 264 long count = atomic_long_read(&sem->count);
350
351 while (true) {
352 if (!(count == 0 || count == RWSEM_WAITING_BIAS))
353 return false;
354 265
355 old = atomic_long_cmpxchg_acquire(&sem->count, count, 266 while (!count || count == RWSEM_WAITING_BIAS) {
356 count + RWSEM_ACTIVE_WRITE_BIAS); 267 if (atomic_long_try_cmpxchg_acquire(&sem->count, &count,
357 if (old == count) { 268 count + RWSEM_ACTIVE_WRITE_BIAS)) {
358 rwsem_set_owner(sem); 269 rwsem_set_owner(sem);
270 lockevent_inc(rwsem_opt_wlock);
359 return true; 271 return true;
360 } 272 }
361
362 count = old;
363 } 273 }
274 return false;
364} 275}
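The rewritten loop uses the try_cmpxchg idiom: on failure the primitive writes the current value back into the expected variable, so the explicit old/count bookkeeping disappears. A standalone sketch of the same loop shape with C11 atomics (atomic_compare_exchange_strong updates the expected value on failure just like atomic_long_try_cmpxchg_acquire; seq_cst ordering and placeholder bias values are used for brevity):

#include <stdatomic.h>
#include <stdio.h>
#include <stdbool.h>

#define WAITING_BIAS (-65536L)  /* placeholder bias values         */
#define WRITE_BIAS   (-65535L)  /* waiting bias plus one active    */

static atomic_long count;

/* Take the "write lock" only while the count reads as free (0) or as
 * having only waiters (WAITING_BIAS). */
static bool try_write_lock(void)
{
        long c = atomic_load(&count);

        while (!c || c == WAITING_BIAS) {
                /* On failure, c is refreshed with the current value. */
                if (atomic_compare_exchange_strong(&count, &c, c + WRITE_BIAS))
                        return true;
        }
        return false;
}

int main(void)
{
        atomic_init(&count, 0);
        printf("first try:  %s\n", try_write_lock() ? "locked" : "failed");
        printf("second try: %s\n", try_write_lock() ? "locked" : "failed");
        return 0;
}

Built with any C11 compiler, the first call succeeds and the second fails because the count no longer reads as free or waiters-only.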
365 276
366static inline bool owner_on_cpu(struct task_struct *owner) 277static inline bool owner_on_cpu(struct task_struct *owner)
@@ -481,6 +392,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
481 osq_unlock(&sem->osq); 392 osq_unlock(&sem->osq);
482done: 393done:
483 preempt_enable(); 394 preempt_enable();
395 lockevent_cond_inc(rwsem_opt_fail, !taken);
484 return taken; 396 return taken;
485} 397}
486 398
@@ -505,6 +417,97 @@ static inline bool rwsem_has_spinner(struct rw_semaphore *sem)
505#endif 417#endif
506 418
507/* 419/*
420 * Wait for the read lock to be granted
421 */
422static inline struct rw_semaphore __sched *
423__rwsem_down_read_failed_common(struct rw_semaphore *sem, int state)
424{
425 long count, adjustment = -RWSEM_ACTIVE_READ_BIAS;
426 struct rwsem_waiter waiter;
427 DEFINE_WAKE_Q(wake_q);
428
429 waiter.task = current;
430 waiter.type = RWSEM_WAITING_FOR_READ;
431
432 raw_spin_lock_irq(&sem->wait_lock);
433 if (list_empty(&sem->wait_list)) {
434 /*
435 * In case the wait queue is empty and the lock isn't owned
436 * by a writer, this reader can exit the slowpath and return
437 * immediately as its RWSEM_ACTIVE_READ_BIAS has already
438 * been set in the count.
439 */
440 if (atomic_long_read(&sem->count) >= 0) {
441 raw_spin_unlock_irq(&sem->wait_lock);
442 rwsem_set_reader_owned(sem);
443 lockevent_inc(rwsem_rlock_fast);
444 return sem;
445 }
446 adjustment += RWSEM_WAITING_BIAS;
447 }
448 list_add_tail(&waiter.list, &sem->wait_list);
449
450 /* we're now waiting on the lock, but no longer actively locking */
451 count = atomic_long_add_return(adjustment, &sem->count);
452
453 /*
454 * If there are no active locks, wake the front queued process(es).
455 *
456 * If there are no writers and we are first in the queue,
457 * wake our own waiter to join the existing active readers !
458 */
459 if (count == RWSEM_WAITING_BIAS ||
460 (count > RWSEM_WAITING_BIAS &&
461 adjustment != -RWSEM_ACTIVE_READ_BIAS))
462 __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
463
464 raw_spin_unlock_irq(&sem->wait_lock);
465 wake_up_q(&wake_q);
466
467 /* wait to be given the lock */
468 while (true) {
469 set_current_state(state);
470 if (!waiter.task)
471 break;
472 if (signal_pending_state(state, current)) {
473 raw_spin_lock_irq(&sem->wait_lock);
474 if (waiter.task)
475 goto out_nolock;
476 raw_spin_unlock_irq(&sem->wait_lock);
477 break;
478 }
479 schedule();
480 lockevent_inc(rwsem_sleep_reader);
481 }
482
483 __set_current_state(TASK_RUNNING);
484 lockevent_inc(rwsem_rlock);
485 return sem;
486out_nolock:
487 list_del(&waiter.list);
488 if (list_empty(&sem->wait_list))
489 atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count);
490 raw_spin_unlock_irq(&sem->wait_lock);
491 __set_current_state(TASK_RUNNING);
492 lockevent_inc(rwsem_rlock_fail);
493 return ERR_PTR(-EINTR);
494}
495
496__visible struct rw_semaphore * __sched
497rwsem_down_read_failed(struct rw_semaphore *sem)
498{
499 return __rwsem_down_read_failed_common(sem, TASK_UNINTERRUPTIBLE);
500}
501EXPORT_SYMBOL(rwsem_down_read_failed);
502
503__visible struct rw_semaphore * __sched
504rwsem_down_read_failed_killable(struct rw_semaphore *sem)
505{
506 return __rwsem_down_read_failed_common(sem, TASK_KILLABLE);
507}
508EXPORT_SYMBOL(rwsem_down_read_failed_killable);
509
510/*
508 * Wait until we successfully acquire the write lock 511 * Wait until we successfully acquire the write lock
509 */ 512 */
510static inline struct rw_semaphore * 513static inline struct rw_semaphore *
@@ -580,6 +583,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)
580 goto out_nolock; 583 goto out_nolock;
581 584
582 schedule(); 585 schedule();
586 lockevent_inc(rwsem_sleep_writer);
583 set_current_state(state); 587 set_current_state(state);
584 } while ((count = atomic_long_read(&sem->count)) & RWSEM_ACTIVE_MASK); 588 } while ((count = atomic_long_read(&sem->count)) & RWSEM_ACTIVE_MASK);
585 589
@@ -588,6 +592,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)
588 __set_current_state(TASK_RUNNING); 592 __set_current_state(TASK_RUNNING);
589 list_del(&waiter.list); 593 list_del(&waiter.list);
590 raw_spin_unlock_irq(&sem->wait_lock); 594 raw_spin_unlock_irq(&sem->wait_lock);
595 lockevent_inc(rwsem_wlock);
591 596
592 return ret; 597 return ret;
593 598
@@ -601,6 +606,7 @@ out_nolock:
601 __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); 606 __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
602 raw_spin_unlock_irq(&sem->wait_lock); 607 raw_spin_unlock_irq(&sem->wait_lock);
603 wake_up_q(&wake_q); 608 wake_up_q(&wake_q);
609 lockevent_inc(rwsem_wlock_fail);
604 610
605 return ERR_PTR(-EINTR); 611 return ERR_PTR(-EINTR);
606} 612}
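The rwsem-xadd slowpaths above are all bias arithmetic on sem->count: each active reader adds an active bias, a non-empty wait list adds the waiting bias once, and a writer holds the combined write bias. A short worked example with the 32-bit constants; RWSEM_ACTIVE_READ_BIAS and RWSEM_ACTIVE_WRITE_BIAS are not defined in this hunk, so the conventional definitions are assumed here.

#include <stdio.h>

/* 32-bit rwsem bias values; the *_READ/*_WRITE variants are assumed. */
#define ACTIVE_MASK       0x0000ffffL
#define ACTIVE_BIAS       0x00000001L
#define WAITING_BIAS      (-ACTIVE_MASK - 1)            /* -0x10000 */
#define ACTIVE_READ_BIAS  ACTIVE_BIAS
#define ACTIVE_WRITE_BIAS (WAITING_BIAS + ACTIVE_BIAS)  /* -0xffff  */

static void show(const char *what, long count)
{
        printf("%-35s count=%ld, active=%ld%s\n", what, count,
               count & ACTIVE_MASK,
               count < 0 ? ", waiters or writer present" : "");
}

int main(void)
{
        long count = 0;

        count += 2 * ACTIVE_READ_BIAS;  show("two readers", count);
        count += WAITING_BIAS;          show("two readers + queued writer", count);
        count -= 2 * ACTIVE_READ_BIAS;  show("readers gone, writer still queued", count);
        /* count == WAITING_BIAS is exactly what the wakeup check looks for. */
        return 0;
}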
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index e586f0d03ad3..ccbf18f560ff 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -24,7 +24,6 @@ void __sched down_read(struct rw_semaphore *sem)
24 rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); 24 rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
25 25
26 LOCK_CONTENDED(sem, __down_read_trylock, __down_read); 26 LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
27 rwsem_set_reader_owned(sem);
28} 27}
29 28
30EXPORT_SYMBOL(down_read); 29EXPORT_SYMBOL(down_read);
@@ -39,7 +38,6 @@ int __sched down_read_killable(struct rw_semaphore *sem)
39 return -EINTR; 38 return -EINTR;
40 } 39 }
41 40
42 rwsem_set_reader_owned(sem);
43 return 0; 41 return 0;
44} 42}
45 43
@@ -52,10 +50,8 @@ int down_read_trylock(struct rw_semaphore *sem)
52{ 50{
53 int ret = __down_read_trylock(sem); 51 int ret = __down_read_trylock(sem);
54 52
55 if (ret == 1) { 53 if (ret == 1)
56 rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_); 54 rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_);
57 rwsem_set_reader_owned(sem);
58 }
59 return ret; 55 return ret;
60} 56}
61 57
@@ -70,7 +66,6 @@ void __sched down_write(struct rw_semaphore *sem)
70 rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); 66 rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
71 67
72 LOCK_CONTENDED(sem, __down_write_trylock, __down_write); 68 LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
73 rwsem_set_owner(sem);
74} 69}
75 70
76EXPORT_SYMBOL(down_write); 71EXPORT_SYMBOL(down_write);
@@ -88,7 +83,6 @@ int __sched down_write_killable(struct rw_semaphore *sem)
88 return -EINTR; 83 return -EINTR;
89 } 84 }
90 85
91 rwsem_set_owner(sem);
92 return 0; 86 return 0;
93} 87}
94 88
@@ -101,10 +95,8 @@ int down_write_trylock(struct rw_semaphore *sem)
101{ 95{
102 int ret = __down_write_trylock(sem); 96 int ret = __down_write_trylock(sem);
103 97
104 if (ret == 1) { 98 if (ret == 1)
105 rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_); 99 rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_);
106 rwsem_set_owner(sem);
107 }
108 100
109 return ret; 101 return ret;
110} 102}
@@ -117,9 +109,7 @@ EXPORT_SYMBOL(down_write_trylock);
117void up_read(struct rw_semaphore *sem) 109void up_read(struct rw_semaphore *sem)
118{ 110{
119 rwsem_release(&sem->dep_map, 1, _RET_IP_); 111 rwsem_release(&sem->dep_map, 1, _RET_IP_);
120 DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED));
121 112
122 rwsem_clear_reader_owned(sem);
123 __up_read(sem); 113 __up_read(sem);
124} 114}
125 115
@@ -131,9 +121,7 @@ EXPORT_SYMBOL(up_read);
131void up_write(struct rw_semaphore *sem) 121void up_write(struct rw_semaphore *sem)
132{ 122{
133 rwsem_release(&sem->dep_map, 1, _RET_IP_); 123 rwsem_release(&sem->dep_map, 1, _RET_IP_);
134 DEBUG_RWSEMS_WARN_ON(sem->owner != current);
135 124
136 rwsem_clear_owner(sem);
137 __up_write(sem); 125 __up_write(sem);
138} 126}
139 127
@@ -145,9 +133,7 @@ EXPORT_SYMBOL(up_write);
145void downgrade_write(struct rw_semaphore *sem) 133void downgrade_write(struct rw_semaphore *sem)
146{ 134{
147 lock_downgrade(&sem->dep_map, _RET_IP_); 135 lock_downgrade(&sem->dep_map, _RET_IP_);
148 DEBUG_RWSEMS_WARN_ON(sem->owner != current);
149 136
150 rwsem_set_reader_owned(sem);
151 __downgrade_write(sem); 137 __downgrade_write(sem);
152} 138}
153 139
@@ -161,7 +147,6 @@ void down_read_nested(struct rw_semaphore *sem, int subclass)
161 rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); 147 rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_);
162 148
163 LOCK_CONTENDED(sem, __down_read_trylock, __down_read); 149 LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
164 rwsem_set_reader_owned(sem);
165} 150}
166 151
167EXPORT_SYMBOL(down_read_nested); 152EXPORT_SYMBOL(down_read_nested);
@@ -172,7 +157,6 @@ void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest)
172 rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_); 157 rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_);
173 158
174 LOCK_CONTENDED(sem, __down_write_trylock, __down_write); 159 LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
175 rwsem_set_owner(sem);
176} 160}
177 161
178EXPORT_SYMBOL(_down_write_nest_lock); 162EXPORT_SYMBOL(_down_write_nest_lock);
@@ -193,7 +177,6 @@ void down_write_nested(struct rw_semaphore *sem, int subclass)
193 rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); 177 rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
194 178
195 LOCK_CONTENDED(sem, __down_write_trylock, __down_write); 179 LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
196 rwsem_set_owner(sem);
197} 180}
198 181
199EXPORT_SYMBOL(down_write_nested); 182EXPORT_SYMBOL(down_write_nested);
@@ -208,7 +191,6 @@ int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass)
208 return -EINTR; 191 return -EINTR;
209 } 192 }
210 193
211 rwsem_set_owner(sem);
212 return 0; 194 return 0;
213} 195}
214 196
@@ -216,7 +198,8 @@ EXPORT_SYMBOL(down_write_killable_nested);
216 198
217void up_read_non_owner(struct rw_semaphore *sem) 199void up_read_non_owner(struct rw_semaphore *sem)
218{ 200{
219 DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED)); 201 DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED),
202 sem);
220 __up_read(sem); 203 __up_read(sem);
221} 204}
222 205
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
index bad2bca0268b..64877f5294e3 100644
--- a/kernel/locking/rwsem.h
+++ b/kernel/locking/rwsem.h
@@ -23,15 +23,44 @@
23 * is involved. Ideally we would like to track all the readers that own 23 * is involved. Ideally we would like to track all the readers that own
24 * a rwsem, but the overhead is simply too big. 24 * a rwsem, but the overhead is simply too big.
25 */ 25 */
26#include "lock_events.h"
27
26#define RWSEM_READER_OWNED (1UL << 0) 28#define RWSEM_READER_OWNED (1UL << 0)
27#define RWSEM_ANONYMOUSLY_OWNED (1UL << 1) 29#define RWSEM_ANONYMOUSLY_OWNED (1UL << 1)
28 30
29#ifdef CONFIG_DEBUG_RWSEMS 31#ifdef CONFIG_DEBUG_RWSEMS
30# define DEBUG_RWSEMS_WARN_ON(c) DEBUG_LOCKS_WARN_ON(c) 32# define DEBUG_RWSEMS_WARN_ON(c, sem) do { \
33 if (!debug_locks_silent && \
34 WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\
35 #c, atomic_long_read(&(sem)->count), \
36 (long)((sem)->owner), (long)current, \
37 list_empty(&(sem)->wait_list) ? "" : "not ")) \
38 debug_locks_off(); \
39 } while (0)
40#else
41# define DEBUG_RWSEMS_WARN_ON(c, sem)
42#endif
43
44/*
45 * R/W semaphores originally for PPC using the stuff in lib/rwsem.c.
46 * Adapted largely from include/asm-i386/rwsem.h
47 * by Paul Mackerras <paulus@samba.org>.
48 */
49
50/*
51 * the semaphore definition
52 */
53#ifdef CONFIG_64BIT
54# define RWSEM_ACTIVE_MASK 0xffffffffL
31#else 55#else
32# define DEBUG_RWSEMS_WARN_ON(c) 56# define RWSEM_ACTIVE_MASK 0x0000ffffL
33#endif 57#endif
34 58
59#define RWSEM_ACTIVE_BIAS 0x00000001L
60#define RWSEM_WAITING_BIAS (-RWSEM_ACTIVE_MASK-1)
61#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS
62#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
63
35#ifdef CONFIG_RWSEM_SPIN_ON_OWNER 64#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
36/* 65/*
37 * All writes to owner are protected by WRITE_ONCE() to make sure that 66 * All writes to owner are protected by WRITE_ONCE() to make sure that
@@ -132,3 +161,144 @@ static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
132{ 161{
133} 162}
134#endif 163#endif
164
165extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem);
166extern struct rw_semaphore *rwsem_down_read_failed_killable(struct rw_semaphore *sem);
167extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
168extern struct rw_semaphore *rwsem_down_write_failed_killable(struct rw_semaphore *sem);
169extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem);
170extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem);
171
172/*
173 * lock for reading
174 */
175static inline void __down_read(struct rw_semaphore *sem)
176{
177 if (unlikely(atomic_long_inc_return_acquire(&sem->count) <= 0)) {
178 rwsem_down_read_failed(sem);
179 DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner &
180 RWSEM_READER_OWNED), sem);
181 } else {
182 rwsem_set_reader_owned(sem);
183 }
184}
185
186static inline int __down_read_killable(struct rw_semaphore *sem)
187{
188 if (unlikely(atomic_long_inc_return_acquire(&sem->count) <= 0)) {
189 if (IS_ERR(rwsem_down_read_failed_killable(sem)))
190 return -EINTR;
191 DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner &
192 RWSEM_READER_OWNED), sem);
193 } else {
194 rwsem_set_reader_owned(sem);
195 }
196 return 0;
197}
198
199static inline int __down_read_trylock(struct rw_semaphore *sem)
200{
201 /*
202 * Optimize for the case when the rwsem is not locked at all.
203 */
204 long tmp = RWSEM_UNLOCKED_VALUE;
205
206 lockevent_inc(rwsem_rtrylock);
207 do {
208 if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
209 tmp + RWSEM_ACTIVE_READ_BIAS)) {
210 rwsem_set_reader_owned(sem);
211 return 1;
212 }
213 } while (tmp >= 0);
214 return 0;
215}
216
217/*
218 * lock for writing
219 */
220static inline void __down_write(struct rw_semaphore *sem)
221{
222 long tmp;
223
224 tmp = atomic_long_add_return_acquire(RWSEM_ACTIVE_WRITE_BIAS,
225 &sem->count);
226 if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS))
227 rwsem_down_write_failed(sem);
228 rwsem_set_owner(sem);
229}
230
231static inline int __down_write_killable(struct rw_semaphore *sem)
232{
233 long tmp;
234
235 tmp = atomic_long_add_return_acquire(RWSEM_ACTIVE_WRITE_BIAS,
236 &sem->count);
237 if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS))
238 if (IS_ERR(rwsem_down_write_failed_killable(sem)))
239 return -EINTR;
240 rwsem_set_owner(sem);
241 return 0;
242}
243
244static inline int __down_write_trylock(struct rw_semaphore *sem)
245{
246 long tmp;
247
248 lockevent_inc(rwsem_wtrylock);
249 tmp = atomic_long_cmpxchg_acquire(&sem->count, RWSEM_UNLOCKED_VALUE,
250 RWSEM_ACTIVE_WRITE_BIAS);
251 if (tmp == RWSEM_UNLOCKED_VALUE) {
252 rwsem_set_owner(sem);
253 return true;
254 }
255 return false;
256}
257
258/*
259 * unlock after reading
260 */
261static inline void __up_read(struct rw_semaphore *sem)
262{
263 long tmp;
264
265 DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED),
266 sem);
267 rwsem_clear_reader_owned(sem);
268 tmp = atomic_long_dec_return_release(&sem->count);
269 if (unlikely(tmp < -1 && (tmp & RWSEM_ACTIVE_MASK) == 0))
270 rwsem_wake(sem);
271}
272
273/*
274 * unlock after writing
275 */
276static inline void __up_write(struct rw_semaphore *sem)
277{
278 DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem);
279 rwsem_clear_owner(sem);
280 if (unlikely(atomic_long_sub_return_release(RWSEM_ACTIVE_WRITE_BIAS,
281 &sem->count) < 0))
282 rwsem_wake(sem);
283}
284
285/*
286 * downgrade write lock to read lock
287 */
288static inline void __downgrade_write(struct rw_semaphore *sem)
289{
290 long tmp;
291
292 /*
293 * When downgrading from exclusive to shared ownership,
294 * anything inside the write-locked region cannot leak
295 * into the read side. In contrast, anything in the
296 * read-locked region is ok to be re-ordered into the
297 * write side. As such, rely on RELEASE semantics.
298 */
299 DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem);
300 tmp = atomic_long_add_return_release(-RWSEM_WAITING_BIAS, &sem->count);
301 rwsem_set_reader_owned(sem);
302 if (tmp < 0)
303 rwsem_downgrade_wake(sem);
304}
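
The inline fast paths moved into rwsem.h above all work by adding and subtracting the bias constants defined near the top of the header. Below is a minimal stand-alone sketch of that arithmetic (constant names copied from the header, values for the 64-bit case); it is illustrative user-space C, not the kernel's atomic implementation.

/* Illustrative sketch of the rwsem count arithmetic; not kernel code. */
#include <stdio.h>

#define RWSEM_ACTIVE_MASK	0xffffffffL
#define RWSEM_ACTIVE_BIAS	0x00000001L
#define RWSEM_WAITING_BIAS	(-RWSEM_ACTIVE_MASK-1)
#define RWSEM_ACTIVE_READ_BIAS	RWSEM_ACTIVE_BIAS
#define RWSEM_ACTIVE_WRITE_BIAS	(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)

int main(void)
{
	long count = 0L;	/* RWSEM_UNLOCKED_VALUE */

	/* __down_read(): one reader leaves the count positive. */
	count += RWSEM_ACTIVE_READ_BIAS;
	printf("one reader:  count = %ld\n", count);
	count -= RWSEM_ACTIVE_READ_BIAS;

	/* __down_write(): writer bias = waiting bias + one active count. */
	count += RWSEM_ACTIVE_WRITE_BIAS;
	printf("one writer:  count = %ld, active part = %ld\n",
	       count, count & RWSEM_ACTIVE_MASK);

	/* __up_write(): a negative result here would mean waiters exist. */
	count -= RWSEM_ACTIVE_WRITE_BIAS;
	printf("unlocked:    count = %ld\n", count);
	return 0;
}

This is also why __down_read_trylock() above only retries while tmp >= 0: once the count goes negative, a writer is active or waiting and the fast path must give up.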
diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
index 936f3d14dd6b..0ff08380f531 100644
--- a/kernel/locking/spinlock.c
+++ b/kernel/locking/spinlock.c
@@ -22,6 +22,13 @@
22#include <linux/debug_locks.h> 22#include <linux/debug_locks.h>
23#include <linux/export.h> 23#include <linux/export.h>
24 24
25#ifdef CONFIG_MMIOWB
26#ifndef arch_mmiowb_state
27DEFINE_PER_CPU(struct mmiowb_state, __mmiowb_state);
28EXPORT_PER_CPU_SYMBOL(__mmiowb_state);
29#endif
30#endif
31
25/* 32/*
26 * If lockdep is enabled then we use the non-preemption spin-ops 33 * If lockdep is enabled then we use the non-preemption spin-ops
27 * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are 34 * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are
diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c
index 9aa0fccd5d43..399669f7eba8 100644
--- a/kernel/locking/spinlock_debug.c
+++ b/kernel/locking/spinlock_debug.c
@@ -111,6 +111,7 @@ void do_raw_spin_lock(raw_spinlock_t *lock)
111{ 111{
112 debug_spin_lock_before(lock); 112 debug_spin_lock_before(lock);
113 arch_spin_lock(&lock->raw_lock); 113 arch_spin_lock(&lock->raw_lock);
114 mmiowb_spin_lock();
114 debug_spin_lock_after(lock); 115 debug_spin_lock_after(lock);
115} 116}
116 117
@@ -118,8 +119,10 @@ int do_raw_spin_trylock(raw_spinlock_t *lock)
118{ 119{
119 int ret = arch_spin_trylock(&lock->raw_lock); 120 int ret = arch_spin_trylock(&lock->raw_lock);
120 121
121 if (ret) 122 if (ret) {
123 mmiowb_spin_lock();
122 debug_spin_lock_after(lock); 124 debug_spin_lock_after(lock);
125 }
123#ifndef CONFIG_SMP 126#ifndef CONFIG_SMP
124 /* 127 /*
125 * Must not happen on UP: 128 * Must not happen on UP:
@@ -131,6 +134,7 @@ int do_raw_spin_trylock(raw_spinlock_t *lock)
131 134
132void do_raw_spin_unlock(raw_spinlock_t *lock) 135void do_raw_spin_unlock(raw_spinlock_t *lock)
133{ 136{
137 mmiowb_spin_unlock();
134 debug_spin_unlock(lock); 138 debug_spin_unlock(lock);
135 arch_spin_unlock(&lock->raw_lock); 139 arch_spin_unlock(&lock->raw_lock);
136} 140}
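
The mmiowb_spin_lock()/mmiowb_spin_unlock() hooks added to the debug spinlock paths above belong to the generic mmiowb tracking introduced elsewhere in this series. A rough toy model of the bookkeeping they imply is sketched below; every name in the sketch is invented for illustration and is not the kernel's actual mmiowb API.

/* Toy model only: invented names, single-threaded, no per-CPU state. */
struct toy_mmiowb_state {
	unsigned int nesting_count;	/* how many spinlocks are held */
	unsigned int mmiowb_pending;	/* was MMIO written under a lock? */
};

static struct toy_mmiowb_state toy_state;

static void toy_spin_lock_hook(void)
{
	toy_state.nesting_count++;
}

static void toy_io_write_hook(void)
{
	/* An I/O accessor would note that ordering work may be needed. */
	if (toy_state.nesting_count)
		toy_state.mmiowb_pending = 1;
}

static void toy_spin_unlock_hook(void)
{
	if (toy_state.mmiowb_pending) {
		toy_state.mmiowb_pending = 0;
		/* an arch ordering barrier (mmiowb) would be issued here */
	}
	toy_state.nesting_count--;
}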
diff --git a/kernel/module.c b/kernel/module.c
index 0b9aa8ab89f0..a9020bdd4cf6 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -98,6 +98,10 @@ DEFINE_MUTEX(module_mutex);
98EXPORT_SYMBOL_GPL(module_mutex); 98EXPORT_SYMBOL_GPL(module_mutex);
99static LIST_HEAD(modules); 99static LIST_HEAD(modules);
100 100
101/* Work queue for freeing init sections in success case */
102static struct work_struct init_free_wq;
103static struct llist_head init_free_list;
104
101#ifdef CONFIG_MODULES_TREE_LOOKUP 105#ifdef CONFIG_MODULES_TREE_LOOKUP
102 106
103/* 107/*
@@ -1949,9 +1953,16 @@ void module_enable_ro(const struct module *mod, bool after_init)
1949 if (!rodata_enabled) 1953 if (!rodata_enabled)
1950 return; 1954 return;
1951 1955
1956 set_vm_flush_reset_perms(mod->core_layout.base);
1957 set_vm_flush_reset_perms(mod->init_layout.base);
1952 frob_text(&mod->core_layout, set_memory_ro); 1958 frob_text(&mod->core_layout, set_memory_ro);
1959 frob_text(&mod->core_layout, set_memory_x);
1960
1953 frob_rodata(&mod->core_layout, set_memory_ro); 1961 frob_rodata(&mod->core_layout, set_memory_ro);
1962
1954 frob_text(&mod->init_layout, set_memory_ro); 1963 frob_text(&mod->init_layout, set_memory_ro);
1964 frob_text(&mod->init_layout, set_memory_x);
1965
1955 frob_rodata(&mod->init_layout, set_memory_ro); 1966 frob_rodata(&mod->init_layout, set_memory_ro);
1956 1967
1957 if (after_init) 1968 if (after_init)
@@ -1967,15 +1978,6 @@ static void module_enable_nx(const struct module *mod)
1967 frob_writable_data(&mod->init_layout, set_memory_nx); 1978 frob_writable_data(&mod->init_layout, set_memory_nx);
1968} 1979}
1969 1980
1970static void module_disable_nx(const struct module *mod)
1971{
1972 frob_rodata(&mod->core_layout, set_memory_x);
1973 frob_ro_after_init(&mod->core_layout, set_memory_x);
1974 frob_writable_data(&mod->core_layout, set_memory_x);
1975 frob_rodata(&mod->init_layout, set_memory_x);
1976 frob_writable_data(&mod->init_layout, set_memory_x);
1977}
1978
1979/* Iterate through all modules and set each module's text as RW */ 1981/* Iterate through all modules and set each module's text as RW */
1980void set_all_modules_text_rw(void) 1982void set_all_modules_text_rw(void)
1981{ 1983{
@@ -2019,23 +2021,8 @@ void set_all_modules_text_ro(void)
2019 } 2021 }
2020 mutex_unlock(&module_mutex); 2022 mutex_unlock(&module_mutex);
2021} 2023}
2022
2023static void disable_ro_nx(const struct module_layout *layout)
2024{
2025 if (rodata_enabled) {
2026 frob_text(layout, set_memory_rw);
2027 frob_rodata(layout, set_memory_rw);
2028 frob_ro_after_init(layout, set_memory_rw);
2029 }
2030 frob_rodata(layout, set_memory_x);
2031 frob_ro_after_init(layout, set_memory_x);
2032 frob_writable_data(layout, set_memory_x);
2033}
2034
2035#else 2024#else
2036static void disable_ro_nx(const struct module_layout *layout) { }
2037static void module_enable_nx(const struct module *mod) { } 2025static void module_enable_nx(const struct module *mod) { }
2038static void module_disable_nx(const struct module *mod) { }
2039#endif 2026#endif
2040 2027
2041#ifdef CONFIG_LIVEPATCH 2028#ifdef CONFIG_LIVEPATCH
@@ -2115,6 +2102,11 @@ static void free_module_elf(struct module *mod)
2115 2102
2116void __weak module_memfree(void *module_region) 2103void __weak module_memfree(void *module_region)
2117{ 2104{
2105 /*
2106 * This memory may be RO, and freeing RO memory in an interrupt is not
2107 * supported by vmalloc.
2108 */
2109 WARN_ON(in_interrupt());
2118 vfree(module_region); 2110 vfree(module_region);
2119} 2111}
2120 2112
@@ -2166,7 +2158,6 @@ static void free_module(struct module *mod)
2166 mutex_unlock(&module_mutex); 2158 mutex_unlock(&module_mutex);
2167 2159
2168 /* This may be empty, but that's OK */ 2160 /* This may be empty, but that's OK */
2169 disable_ro_nx(&mod->init_layout);
2170 module_arch_freeing_init(mod); 2161 module_arch_freeing_init(mod);
2171 module_memfree(mod->init_layout.base); 2162 module_memfree(mod->init_layout.base);
2172 kfree(mod->args); 2163 kfree(mod->args);
@@ -2176,7 +2167,6 @@ static void free_module(struct module *mod)
2176 lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size); 2167 lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size);
2177 2168
2178 /* Finally, free the core (containing the module structure) */ 2169 /* Finally, free the core (containing the module structure) */
2179 disable_ro_nx(&mod->core_layout);
2180 module_memfree(mod->core_layout.base); 2170 module_memfree(mod->core_layout.base);
2181} 2171}
2182 2172
@@ -3415,17 +3405,34 @@ static void do_mod_ctors(struct module *mod)
3415 3405
3416/* For freeing module_init on success, in case kallsyms traversing */ 3406/* For freeing module_init on success, in case kallsyms traversing */
3417struct mod_initfree { 3407struct mod_initfree {
3418 struct rcu_head rcu; 3408 struct llist_node node;
3419 void *module_init; 3409 void *module_init;
3420}; 3410};
3421 3411
3422static void do_free_init(struct rcu_head *head) 3412static void do_free_init(struct work_struct *w)
3423{ 3413{
3424 struct mod_initfree *m = container_of(head, struct mod_initfree, rcu); 3414 struct llist_node *pos, *n, *list;
3425 module_memfree(m->module_init); 3415 struct mod_initfree *initfree;
3426 kfree(m); 3416
3417 list = llist_del_all(&init_free_list);
3418
3419 synchronize_rcu();
3420
3421 llist_for_each_safe(pos, n, list) {
3422 initfree = container_of(pos, struct mod_initfree, node);
3423 module_memfree(initfree->module_init);
3424 kfree(initfree);
3425 }
3427} 3426}
3428 3427
3428static int __init modules_wq_init(void)
3429{
3430 INIT_WORK(&init_free_wq, do_free_init);
3431 init_llist_head(&init_free_list);
3432 return 0;
3433}
3434module_init(modules_wq_init);
3435
3429/* 3436/*
3430 * This is where the real work happens. 3437 * This is where the real work happens.
3431 * 3438 *
@@ -3502,7 +3509,6 @@ static noinline int do_init_module(struct module *mod)
3502#endif 3509#endif
3503 module_enable_ro(mod, true); 3510 module_enable_ro(mod, true);
3504 mod_tree_remove_init(mod); 3511 mod_tree_remove_init(mod);
3505 disable_ro_nx(&mod->init_layout);
3506 module_arch_freeing_init(mod); 3512 module_arch_freeing_init(mod);
3507 mod->init_layout.base = NULL; 3513 mod->init_layout.base = NULL;
3508 mod->init_layout.size = 0; 3514 mod->init_layout.size = 0;
@@ -3513,14 +3519,18 @@ static noinline int do_init_module(struct module *mod)
3513 * We want to free module_init, but be aware that kallsyms may be 3519 * We want to free module_init, but be aware that kallsyms may be
3514 * walking this with preempt disabled. In all the failure paths, we 3520 * walking this with preempt disabled. In all the failure paths, we
3515 * call synchronize_rcu(), but we don't want to slow down the success 3521 * call synchronize_rcu(), but we don't want to slow down the success
3516 * path, so use actual RCU here. 3522 * path. module_memfree() cannot be called in an interrupt, so do the
3523 * work and call synchronize_rcu() in a work queue.
3524 *
3517 * Note that module_alloc() on most architectures creates W+X page 3525 * Note that module_alloc() on most architectures creates W+X page
3518 * mappings which won't be cleaned up until do_free_init() runs. Any 3526 * mappings which won't be cleaned up until do_free_init() runs. Any
3519 * code such as mark_rodata_ro() which depends on those mappings to 3527 * code such as mark_rodata_ro() which depends on those mappings to
3520 * be cleaned up needs to sync with the queued work - ie 3528 * be cleaned up needs to sync with the queued work - ie
3521 * rcu_barrier() 3529 * rcu_barrier()
3522 */ 3530 */
3523 call_rcu(&freeinit->rcu, do_free_init); 3531 if (llist_add(&freeinit->node, &init_free_list))
3532 schedule_work(&init_free_wq);
3533
3524 mutex_unlock(&module_mutex); 3534 mutex_unlock(&module_mutex);
3525 wake_up_all(&module_wq); 3535 wake_up_all(&module_wq);
3526 3536
@@ -3817,10 +3827,6 @@ static int load_module(struct load_info *info, const char __user *uargs,
3817 module_bug_cleanup(mod); 3827 module_bug_cleanup(mod);
3818 mutex_unlock(&module_mutex); 3828 mutex_unlock(&module_mutex);
3819 3829
3820 /* we can't deallocate the module until we clear memory protection */
3821 module_disable_ro(mod);
3822 module_disable_nx(mod);
3823
3824 ddebug_cleanup: 3830 ddebug_cleanup:
3825 ftrace_release_mod(mod); 3831 ftrace_release_mod(mod);
3826 dynamic_debug_remove(mod, info->debug); 3832 dynamic_debug_remove(mod, info->debug);
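
The module.c changes above replace call_rcu() with a lock-free llist feeding a work item, because module_memfree() may sleep and must not run from RCU callback context. A condensed sketch of that pattern follows; the type and function names are hypothetical, only the llist/workqueue/slab APIs are real.

/* Condensed sketch of the deferred-free pattern used by do_free_init(). */
#include <linux/llist.h>
#include <linux/workqueue.h>
#include <linux/slab.h>

struct deferred_item {
	struct llist_node node;
	void *payload;			/* a kmalloc()'ed object to release late */
};

static struct llist_head pending_list;
static struct work_struct free_work;

static void free_work_fn(struct work_struct *w)
{
	struct llist_node *pos, *n, *list = llist_del_all(&pending_list);
	struct deferred_item *item;

	/* Process context: sleeping (e.g. synchronize_rcu()) is allowed here. */
	llist_for_each_safe(pos, n, list) {
		item = container_of(pos, struct deferred_item, node);
		kfree(item->payload);
		kfree(item);
	}
}

/* Callable from almost any context: llist_add() is lock-free. */
static void queue_deferred_free(struct deferred_item *item)
{
	if (llist_add(&item->node, &pending_list))
		schedule_work(&free_work);	/* first item arms the work */
}

static int deferred_free_setup(void)
{
	init_llist_head(&pending_list);
	INIT_WORK(&free_work, free_work_fn);
	return 0;
}

Note that llist_add() returns true only when the list was previously empty, which is why the hunk above schedules init_free_wq exactly once per batch.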
diff --git a/kernel/padata.c b/kernel/padata.c
index 3e2633ae3bca..2d2fddbb7a4c 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -957,6 +957,7 @@ static struct attribute *padata_default_attrs[] = {
957 &parallel_cpumask_attr.attr, 957 &parallel_cpumask_attr.attr,
958 NULL, 958 NULL,
959}; 959};
960ATTRIBUTE_GROUPS(padata_default);
960 961
961static ssize_t padata_sysfs_show(struct kobject *kobj, 962static ssize_t padata_sysfs_show(struct kobject *kobj,
962 struct attribute *attr, char *buf) 963 struct attribute *attr, char *buf)
@@ -995,7 +996,7 @@ static const struct sysfs_ops padata_sysfs_ops = {
995 996
996static struct kobj_type padata_attr_type = { 997static struct kobj_type padata_attr_type = {
997 .sysfs_ops = &padata_sysfs_ops, 998 .sysfs_ops = &padata_sysfs_ops,
998 .default_attrs = padata_default_attrs, 999 .default_groups = padata_default_groups,
999 .release = padata_sysfs_release, 1000 .release = padata_sysfs_release,
1000}; 1001};
1001 1002
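
The padata hunk above is the standard .default_attrs to .default_groups conversion: ATTRIBUTE_GROUPS(padata_default) generates padata_default_groups[] from the existing padata_default_attrs[] array. A minimal sketch of the same pattern, using a hypothetical attribute:

/* Sketch of the ATTRIBUTE_GROUPS() conversion; names are illustrative. */
#include <linux/sysfs.h>
#include <linux/kobject.h>

static struct kobj_attribute example_attr =
	__ATTR(example, 0444, NULL, NULL);

static struct attribute *example_default_attrs[] = {
	&example_attr.attr,
	NULL,				/* array must be NULL-terminated */
};
/* Generates example_default_groups[] from example_default_attrs[]. */
ATTRIBUTE_GROUPS(example_default);

static struct kobj_type example_ktype = {
	.sysfs_ops	= &kobj_sysfs_ops,
	.default_groups	= example_default_groups,	/* was .default_attrs */
};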
diff --git a/kernel/panic.c b/kernel/panic.c
index 0ae0d7332f12..c1fcaad337b7 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -318,12 +318,7 @@ void panic(const char *fmt, ...)
318 } 318 }
319#endif 319#endif
320#if defined(CONFIG_S390) 320#if defined(CONFIG_S390)
321 { 321 disabled_wait();
322 unsigned long caller;
323
324 caller = (unsigned long)__builtin_return_address(0);
325 disabled_wait(caller);
326 }
327#endif 322#endif
328 pr_emerg("---[ end Kernel panic - not syncing: %s ]---\n", buf); 323 pr_emerg("---[ end Kernel panic - not syncing: %s ]---\n", buf);
329 local_irq_enable(); 324 local_irq_enable();
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index f8fe57d1022e..9bbaaab14b36 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -114,6 +114,15 @@ config PM_SLEEP_SMP
114 depends on PM_SLEEP 114 depends on PM_SLEEP
115 select HOTPLUG_CPU 115 select HOTPLUG_CPU
116 116
117config PM_SLEEP_SMP_NONZERO_CPU
118 def_bool y
119 depends on PM_SLEEP_SMP
120 depends on ARCH_SUSPEND_NONZERO_CPU
121 ---help---
122 If an arch can suspend (for suspend, hibernate, kexec, etc.) on a
123 non-zero numbered CPU, it may define ARCH_SUSPEND_NONZERO_CPU. This
124 will allow the nohz_full mask to include CPU0.
125
117config PM_AUTOSLEEP 126config PM_AUTOSLEEP
118 bool "Opportunistic sleep" 127 bool "Opportunistic sleep"
119 depends on PM_SLEEP 128 depends on PM_SLEEP
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index abef759de7c8..c8c272df7154 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -14,7 +14,6 @@
14 14
15#include <linux/export.h> 15#include <linux/export.h>
16#include <linux/suspend.h> 16#include <linux/suspend.h>
17#include <linux/syscalls.h>
18#include <linux/reboot.h> 17#include <linux/reboot.h>
19#include <linux/string.h> 18#include <linux/string.h>
20#include <linux/device.h> 19#include <linux/device.h>
@@ -281,7 +280,7 @@ static int create_image(int platform_mode)
281 if (error || hibernation_test(TEST_PLATFORM)) 280 if (error || hibernation_test(TEST_PLATFORM))
282 goto Platform_finish; 281 goto Platform_finish;
283 282
284 error = disable_nonboot_cpus(); 283 error = suspend_disable_secondary_cpus();
285 if (error || hibernation_test(TEST_CPUS)) 284 if (error || hibernation_test(TEST_CPUS))
286 goto Enable_cpus; 285 goto Enable_cpus;
287 286
@@ -323,7 +322,7 @@ static int create_image(int platform_mode)
323 local_irq_enable(); 322 local_irq_enable();
324 323
325 Enable_cpus: 324 Enable_cpus:
326 enable_nonboot_cpus(); 325 suspend_enable_secondary_cpus();
327 326
328 Platform_finish: 327 Platform_finish:
329 platform_finish(platform_mode); 328 platform_finish(platform_mode);
@@ -417,7 +416,7 @@ int hibernation_snapshot(int platform_mode)
417 416
418int __weak hibernate_resume_nonboot_cpu_disable(void) 417int __weak hibernate_resume_nonboot_cpu_disable(void)
419{ 418{
420 return disable_nonboot_cpus(); 419 return suspend_disable_secondary_cpus();
421} 420}
422 421
423/** 422/**
@@ -486,7 +485,7 @@ static int resume_target_kernel(bool platform_mode)
486 local_irq_enable(); 485 local_irq_enable();
487 486
488 Enable_cpus: 487 Enable_cpus:
489 enable_nonboot_cpus(); 488 suspend_enable_secondary_cpus();
490 489
491 Cleanup: 490 Cleanup:
492 platform_restore_cleanup(platform_mode); 491 platform_restore_cleanup(platform_mode);
@@ -564,7 +563,7 @@ int hibernation_platform_enter(void)
564 if (error) 563 if (error)
565 goto Platform_finish; 564 goto Platform_finish;
566 565
567 error = disable_nonboot_cpus(); 566 error = suspend_disable_secondary_cpus();
568 if (error) 567 if (error)
569 goto Enable_cpus; 568 goto Enable_cpus;
570 569
@@ -586,7 +585,7 @@ int hibernation_platform_enter(void)
586 local_irq_enable(); 585 local_irq_enable();
587 586
588 Enable_cpus: 587 Enable_cpus:
589 enable_nonboot_cpus(); 588 suspend_enable_secondary_cpus();
590 589
591 Platform_finish: 590 Platform_finish:
592 hibernation_ops->finish(); 591 hibernation_ops->finish();
@@ -709,9 +708,7 @@ int hibernate(void)
709 goto Exit; 708 goto Exit;
710 } 709 }
711 710
712 pr_info("Syncing filesystems ... \n"); 711 ksys_sync_helper();
713 ksys_sync();
714 pr_info("done.\n");
715 712
716 error = freeze_processes(); 713 error = freeze_processes();
717 if (error) 714 if (error)
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 98e76cad128b..4f43e724f6eb 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -16,6 +16,7 @@
16#include <linux/debugfs.h> 16#include <linux/debugfs.h>
17#include <linux/seq_file.h> 17#include <linux/seq_file.h>
18#include <linux/suspend.h> 18#include <linux/suspend.h>
19#include <linux/syscalls.h>
19 20
20#include "power.h" 21#include "power.h"
21 22
@@ -51,6 +52,19 @@ void unlock_system_sleep(void)
51} 52}
52EXPORT_SYMBOL_GPL(unlock_system_sleep); 53EXPORT_SYMBOL_GPL(unlock_system_sleep);
53 54
55void ksys_sync_helper(void)
56{
57 ktime_t start;
58 long elapsed_msecs;
59
60 start = ktime_get();
61 ksys_sync();
62 elapsed_msecs = ktime_to_ms(ktime_sub(ktime_get(), start));
63 pr_info("Filesystems sync: %ld.%03ld seconds\n",
64 elapsed_msecs / MSEC_PER_SEC, elapsed_msecs % MSEC_PER_SEC);
65}
66EXPORT_SYMBOL_GPL(ksys_sync_helper);
67
54/* Routines for PM-transition notifications */ 68/* Routines for PM-transition notifications */
55 69
56static BLOCKING_NOTIFIER_HEAD(pm_chain_head); 70static BLOCKING_NOTIFIER_HEAD(pm_chain_head);
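
ksys_sync_helper() above folds the old "Syncing filesystems ..." messages into a single timed call. The ktime-based measurement it uses generalizes to any slow operation; a small sketch with a hypothetical wrapper:

/* Hypothetical helper illustrating the ktime elapsed-time pattern. */
#include <linux/ktime.h>
#include <linux/printk.h>

static void timed_call(void (*fn)(void), const char *what)
{
	ktime_t start = ktime_get();
	long elapsed_msecs;

	fn();
	elapsed_msecs = ktime_to_ms(ktime_sub(ktime_get(), start));
	pr_info("%s: %ld.%03ld seconds\n", what,
		elapsed_msecs / MSEC_PER_SEC, elapsed_msecs % MSEC_PER_SEC);
}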
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index f08a1e4ee1d4..bc9558ab1e5b 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1342,8 +1342,9 @@ static inline void do_copy_page(long *dst, long *src)
1342 * safe_copy_page - Copy a page in a safe way. 1342 * safe_copy_page - Copy a page in a safe way.
1343 * 1343 *
1344 * Check if the page we are going to copy is marked as present in the kernel 1344 * Check if the page we are going to copy is marked as present in the kernel
1345 * page tables (this always is the case if CONFIG_DEBUG_PAGEALLOC is not set 1345 * page tables. This always is the case if CONFIG_DEBUG_PAGEALLOC or
1346 * and in that case kernel_page_present() always returns 'true'). 1346 * CONFIG_ARCH_HAS_SET_DIRECT_MAP is not set. In that case kernel_page_present()
1347 * always returns 'true'.
1347 */ 1348 */
1348static void safe_copy_page(void *dst, struct page *s_page) 1349static void safe_copy_page(void *dst, struct page *s_page)
1349{ 1350{
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 0bd595a0b610..ef908c134b34 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -17,7 +17,6 @@
17#include <linux/console.h> 17#include <linux/console.h>
18#include <linux/cpu.h> 18#include <linux/cpu.h>
19#include <linux/cpuidle.h> 19#include <linux/cpuidle.h>
20#include <linux/syscalls.h>
21#include <linux/gfp.h> 20#include <linux/gfp.h>
22#include <linux/io.h> 21#include <linux/io.h>
23#include <linux/kernel.h> 22#include <linux/kernel.h>
@@ -428,7 +427,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
428 if (suspend_test(TEST_PLATFORM)) 427 if (suspend_test(TEST_PLATFORM))
429 goto Platform_wake; 428 goto Platform_wake;
430 429
431 error = disable_nonboot_cpus(); 430 error = suspend_disable_secondary_cpus();
432 if (error || suspend_test(TEST_CPUS)) 431 if (error || suspend_test(TEST_CPUS))
433 goto Enable_cpus; 432 goto Enable_cpus;
434 433
@@ -458,7 +457,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
458 BUG_ON(irqs_disabled()); 457 BUG_ON(irqs_disabled());
459 458
460 Enable_cpus: 459 Enable_cpus:
461 enable_nonboot_cpus(); 460 suspend_enable_secondary_cpus();
462 461
463 Platform_wake: 462 Platform_wake:
464 platform_resume_noirq(state); 463 platform_resume_noirq(state);
@@ -568,13 +567,11 @@ static int enter_state(suspend_state_t state)
568 if (state == PM_SUSPEND_TO_IDLE) 567 if (state == PM_SUSPEND_TO_IDLE)
569 s2idle_begin(); 568 s2idle_begin();
570 569
571#ifndef CONFIG_SUSPEND_SKIP_SYNC 570 if (!IS_ENABLED(CONFIG_SUSPEND_SKIP_SYNC)) {
572 trace_suspend_resume(TPS("sync_filesystems"), 0, true); 571 trace_suspend_resume(TPS("sync_filesystems"), 0, true);
573 pr_info("Syncing filesystems ... "); 572 ksys_sync_helper();
574 ksys_sync(); 573 trace_suspend_resume(TPS("sync_filesystems"), 0, false);
575 pr_cont("done.\n"); 574 }
576 trace_suspend_resume(TPS("sync_filesystems"), 0, false);
577#endif
578 575
579 pm_pr_dbg("Preparing system for sleep (%s)\n", mem_sleep_labels[state]); 576 pm_pr_dbg("Preparing system for sleep (%s)\n", mem_sleep_labels[state]);
580 pm_suspend_clear_flags(); 577 pm_suspend_clear_flags();
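
The suspend.c hunk above swaps an #ifndef block for if (!IS_ENABLED(CONFIG_SUSPEND_SKIP_SYNC)), so both branches are always compiled and type-checked while the dead one is discarded by the optimizer. A tiny illustrative sketch of the same idiom (the function itself is hypothetical):

/* Hypothetical sketch of the #ifdef-to-IS_ENABLED() idiom. */
#include <linux/kconfig.h>
#include <linux/printk.h>

static void sync_if_enabled(void)
{
	/* Compiled either way; the branch folds away when the option is set. */
	if (!IS_ENABLED(CONFIG_SUSPEND_SKIP_SYNC))
		pr_info("would sync filesystems here\n");
}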
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 2d8b60a3c86b..cb24e840a3e6 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -10,7 +10,6 @@
10 */ 10 */
11 11
12#include <linux/suspend.h> 12#include <linux/suspend.h>
13#include <linux/syscalls.h>
14#include <linux/reboot.h> 13#include <linux/reboot.h>
15#include <linux/string.h> 14#include <linux/string.h>
16#include <linux/device.h> 15#include <linux/device.h>
@@ -228,9 +227,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
228 if (data->frozen) 227 if (data->frozen)
229 break; 228 break;
230 229
231 printk("Syncing filesystems ... "); 230 ksys_sync_helper();
232 ksys_sync();
233 printk("done.\n");
234 231
235 error = freeze_processes(); 232 error = freeze_processes();
236 if (error) 233 if (error)
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 771e93f9c43f..6f357f4fc859 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -29,6 +29,7 @@
29#include <linux/hw_breakpoint.h> 29#include <linux/hw_breakpoint.h>
30#include <linux/cn_proc.h> 30#include <linux/cn_proc.h>
31#include <linux/compat.h> 31#include <linux/compat.h>
32#include <linux/sched/signal.h>
32 33
33/* 34/*
34 * Access another process' address space via ptrace. 35 * Access another process' address space via ptrace.
@@ -924,18 +925,26 @@ int ptrace_request(struct task_struct *child, long request,
924 ret = ptrace_setsiginfo(child, &siginfo); 925 ret = ptrace_setsiginfo(child, &siginfo);
925 break; 926 break;
926 927
927 case PTRACE_GETSIGMASK: 928 case PTRACE_GETSIGMASK: {
929 sigset_t *mask;
930
928 if (addr != sizeof(sigset_t)) { 931 if (addr != sizeof(sigset_t)) {
929 ret = -EINVAL; 932 ret = -EINVAL;
930 break; 933 break;
931 } 934 }
932 935
933 if (copy_to_user(datavp, &child->blocked, sizeof(sigset_t))) 936 if (test_tsk_restore_sigmask(child))
937 mask = &child->saved_sigmask;
938 else
939 mask = &child->blocked;
940
941 if (copy_to_user(datavp, mask, sizeof(sigset_t)))
934 ret = -EFAULT; 942 ret = -EFAULT;
935 else 943 else
936 ret = 0; 944 ret = 0;
937 945
938 break; 946 break;
947 }
939 948
940 case PTRACE_SETSIGMASK: { 949 case PTRACE_SETSIGMASK: {
941 sigset_t new_set; 950 sigset_t new_set;
@@ -961,6 +970,8 @@ int ptrace_request(struct task_struct *child, long request,
961 child->blocked = new_set; 970 child->blocked = new_set;
962 spin_unlock_irq(&child->sighand->siglock); 971 spin_unlock_irq(&child->sighand->siglock);
963 972
973 clear_tsk_restore_sigmask(child);
974
964 ret = 0; 975 ret = 0;
965 break; 976 break;
966 } 977 }
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index acee72c0b24b..4b58c907b4b7 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -233,6 +233,7 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
233#ifdef CONFIG_RCU_STALL_COMMON 233#ifdef CONFIG_RCU_STALL_COMMON
234 234
235extern int rcu_cpu_stall_suppress; 235extern int rcu_cpu_stall_suppress;
236extern int rcu_cpu_stall_timeout;
236int rcu_jiffies_till_stall_check(void); 237int rcu_jiffies_till_stall_check(void);
237 238
238#define rcu_ftrace_dump_stall_suppress() \ 239#define rcu_ftrace_dump_stall_suppress() \
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index c29761152874..7a6890b23c5f 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -494,6 +494,10 @@ rcu_perf_cleanup(void)
494 494
495 if (torture_cleanup_begin()) 495 if (torture_cleanup_begin())
496 return; 496 return;
497 if (!cur_ops) {
498 torture_cleanup_end();
499 return;
500 }
497 501
498 if (reader_tasks) { 502 if (reader_tasks) {
499 for (i = 0; i < nrealreaders; i++) 503 for (i = 0; i < nrealreaders; i++)
@@ -614,6 +618,7 @@ rcu_perf_init(void)
614 pr_cont("\n"); 618 pr_cont("\n");
615 WARN_ON(!IS_MODULE(CONFIG_RCU_PERF_TEST)); 619 WARN_ON(!IS_MODULE(CONFIG_RCU_PERF_TEST));
616 firsterr = -EINVAL; 620 firsterr = -EINVAL;
621 cur_ops = NULL;
617 goto unwind; 622 goto unwind;
618 } 623 }
619 if (cur_ops->init) 624 if (cur_ops->init)
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index f14d1b18a74f..efaa5b3f4d3f 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -299,7 +299,6 @@ struct rcu_torture_ops {
299 int irq_capable; 299 int irq_capable;
300 int can_boost; 300 int can_boost;
301 int extendables; 301 int extendables;
302 int ext_irq_conflict;
303 const char *name; 302 const char *name;
304}; 303};
305 304
@@ -592,12 +591,7 @@ static void srcu_torture_init(void)
592 591
593static void srcu_torture_cleanup(void) 592static void srcu_torture_cleanup(void)
594{ 593{
595 static DEFINE_TORTURE_RANDOM(rand); 594 cleanup_srcu_struct(&srcu_ctld);
596
597 if (torture_random(&rand) & 0x800)
598 cleanup_srcu_struct(&srcu_ctld);
599 else
600 cleanup_srcu_struct_quiesced(&srcu_ctld);
601 srcu_ctlp = &srcu_ctl; /* In case of a later rcutorture run. */ 595 srcu_ctlp = &srcu_ctl; /* In case of a later rcutorture run. */
602} 596}
603 597
@@ -1160,7 +1154,7 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp)
1160 unsigned long randmask2 = randmask1 >> 3; 1154 unsigned long randmask2 = randmask1 >> 3;
1161 1155
1162 WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT); 1156 WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT);
1163 /* Most of the time lots of bits, half the time only one bit. */ 1157 /* Mostly only one bit (need preemption!), sometimes lots of bits. */
1164 if (!(randmask1 & 0x7)) 1158 if (!(randmask1 & 0x7))
1165 mask = mask & randmask2; 1159 mask = mask & randmask2;
1166 else 1160 else
@@ -1170,10 +1164,6 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp)
1170 ((!(mask & RCUTORTURE_RDR_BH) && (oldmask & RCUTORTURE_RDR_BH)) || 1164 ((!(mask & RCUTORTURE_RDR_BH) && (oldmask & RCUTORTURE_RDR_BH)) ||
1171 (!(mask & RCUTORTURE_RDR_RBH) && (oldmask & RCUTORTURE_RDR_RBH)))) 1165 (!(mask & RCUTORTURE_RDR_RBH) && (oldmask & RCUTORTURE_RDR_RBH))))
1172 mask |= RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH; 1166 mask |= RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH;
1173 if ((mask & RCUTORTURE_RDR_IRQ) &&
1174 !(mask & cur_ops->ext_irq_conflict) &&
1175 (oldmask & cur_ops->ext_irq_conflict))
1176 mask |= cur_ops->ext_irq_conflict; /* Or if readers object. */
1177 return mask ?: RCUTORTURE_RDR_RCU; 1167 return mask ?: RCUTORTURE_RDR_RCU;
1178} 1168}
1179 1169
@@ -1848,7 +1838,7 @@ static int rcutorture_oom_notify(struct notifier_block *self,
1848 WARN(1, "%s invoked upon OOM during forward-progress testing.\n", 1838 WARN(1, "%s invoked upon OOM during forward-progress testing.\n",
1849 __func__); 1839 __func__);
1850 rcu_torture_fwd_cb_hist(); 1840 rcu_torture_fwd_cb_hist();
1851 rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rcu_fwd_startat) / 2)); 1841 rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rcu_fwd_startat)) / 2);
1852 WRITE_ONCE(rcu_fwd_emergency_stop, true); 1842 WRITE_ONCE(rcu_fwd_emergency_stop, true);
1853 smp_mb(); /* Emergency stop before free and wait to avoid hangs. */ 1843 smp_mb(); /* Emergency stop before free and wait to avoid hangs. */
1854 pr_info("%s: Freed %lu RCU callbacks.\n", 1844 pr_info("%s: Freed %lu RCU callbacks.\n",
@@ -2094,6 +2084,10 @@ rcu_torture_cleanup(void)
2094 cur_ops->cb_barrier(); 2084 cur_ops->cb_barrier();
2095 return; 2085 return;
2096 } 2086 }
2087 if (!cur_ops) {
2088 torture_cleanup_end();
2089 return;
2090 }
2097 2091
2098 rcu_torture_barrier_cleanup(); 2092 rcu_torture_barrier_cleanup();
2099 torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task); 2093 torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task);
@@ -2267,6 +2261,7 @@ rcu_torture_init(void)
2267 pr_cont("\n"); 2261 pr_cont("\n");
2268 WARN_ON(!IS_MODULE(CONFIG_RCU_TORTURE_TEST)); 2262 WARN_ON(!IS_MODULE(CONFIG_RCU_TORTURE_TEST));
2269 firsterr = -EINVAL; 2263 firsterr = -EINVAL;
2264 cur_ops = NULL;
2270 goto unwind; 2265 goto unwind;
2271 } 2266 }
2272 if (cur_ops->fqs == NULL && fqs_duration != 0) { 2267 if (cur_ops->fqs == NULL && fqs_duration != 0) {
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index 5d4a39a6505a..44d6606b8325 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -76,19 +76,16 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
76 * Must invoke this after you are finished using a given srcu_struct that 76 * Must invoke this after you are finished using a given srcu_struct that
77 * was initialized via init_srcu_struct(), else you leak memory. 77 * was initialized via init_srcu_struct(), else you leak memory.
78 */ 78 */
79void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced) 79void cleanup_srcu_struct(struct srcu_struct *ssp)
80{ 80{
81 WARN_ON(ssp->srcu_lock_nesting[0] || ssp->srcu_lock_nesting[1]); 81 WARN_ON(ssp->srcu_lock_nesting[0] || ssp->srcu_lock_nesting[1]);
82 if (quiesced) 82 flush_work(&ssp->srcu_work);
83 WARN_ON(work_pending(&ssp->srcu_work));
84 else
85 flush_work(&ssp->srcu_work);
86 WARN_ON(ssp->srcu_gp_running); 83 WARN_ON(ssp->srcu_gp_running);
87 WARN_ON(ssp->srcu_gp_waiting); 84 WARN_ON(ssp->srcu_gp_waiting);
88 WARN_ON(ssp->srcu_cb_head); 85 WARN_ON(ssp->srcu_cb_head);
89 WARN_ON(&ssp->srcu_cb_head != ssp->srcu_cb_tail); 86 WARN_ON(&ssp->srcu_cb_head != ssp->srcu_cb_tail);
90} 87}
91EXPORT_SYMBOL_GPL(_cleanup_srcu_struct); 88EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
92 89
93/* 90/*
94 * Removes the count for the old reader from the appropriate element of 91 * Removes the count for the old reader from the appropriate element of
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index a60b8ba9e1ac..9b761e546de8 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -360,8 +360,14 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp)
360 return SRCU_INTERVAL; 360 return SRCU_INTERVAL;
361} 361}
362 362
363/* Helper for cleanup_srcu_struct() and cleanup_srcu_struct_quiesced(). */ 363/**
364void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced) 364 * cleanup_srcu_struct - deconstruct a sleep-RCU structure
365 * @ssp: structure to clean up.
366 *
367 * Must invoke this after you are finished using a given srcu_struct that
368 * was initialized via init_srcu_struct(), else you leak memory.
369 */
370void cleanup_srcu_struct(struct srcu_struct *ssp)
365{ 371{
366 int cpu; 372 int cpu;
367 373
@@ -369,24 +375,14 @@ void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced)
369 return; /* Just leak it! */ 375 return; /* Just leak it! */
370 if (WARN_ON(srcu_readers_active(ssp))) 376 if (WARN_ON(srcu_readers_active(ssp)))
371 return; /* Just leak it! */ 377 return; /* Just leak it! */
372 if (quiesced) { 378 flush_delayed_work(&ssp->work);
373 if (WARN_ON(delayed_work_pending(&ssp->work)))
374 return; /* Just leak it! */
375 } else {
376 flush_delayed_work(&ssp->work);
377 }
378 for_each_possible_cpu(cpu) { 379 for_each_possible_cpu(cpu) {
379 struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); 380 struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu);
380 381
381 if (quiesced) { 382 del_timer_sync(&sdp->delay_work);
382 if (WARN_ON(timer_pending(&sdp->delay_work))) 383 flush_work(&sdp->work);
383 return; /* Just leak it! */ 384 if (WARN_ON(rcu_segcblist_n_cbs(&sdp->srcu_cblist)))
384 if (WARN_ON(work_pending(&sdp->work))) 385 return; /* Forgot srcu_barrier(), so just leak it! */
385 return; /* Just leak it! */
386 } else {
387 del_timer_sync(&sdp->delay_work);
388 flush_work(&sdp->work);
389 }
390 } 386 }
391 if (WARN_ON(rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) != SRCU_STATE_IDLE) || 387 if (WARN_ON(rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) != SRCU_STATE_IDLE) ||
392 WARN_ON(srcu_readers_active(ssp))) { 388 WARN_ON(srcu_readers_active(ssp))) {
@@ -397,7 +393,7 @@ void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced)
397 free_percpu(ssp->sda); 393 free_percpu(ssp->sda);
398 ssp->sda = NULL; 394 ssp->sda = NULL;
399} 395}
400EXPORT_SYMBOL_GPL(_cleanup_srcu_struct); 396EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
401 397
402/* 398/*
403 * Counts the new reader in the appropriate per-CPU element of the 399 * Counts the new reader in the appropriate per-CPU element of the
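
With the quiesced variant gone, cleanup_srcu_struct() above now always flushes pending work itself and only warns if callbacks remain, i.e. if the caller forgot srcu_barrier(). A sketch of the teardown ordering a dynamically initialized srcu_struct user is expected to follow (the surrounding names are hypothetical):

#include <linux/srcu.h>

static struct srcu_struct example_srcu;		/* hypothetical user */

static int example_setup(void)
{
	return init_srcu_struct(&example_srcu);
}

static void example_teardown(void)
{
	/* Stop queueing new call_srcu() callbacks before this point. */
	srcu_barrier(&example_srcu);		/* wait for queued callbacks */
	cleanup_srcu_struct(&example_srcu);	/* flush work, free per-CPU data */
}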
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 911bd9076d43..477b4eb44af5 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -52,7 +52,7 @@ void rcu_qs(void)
52 local_irq_save(flags); 52 local_irq_save(flags);
53 if (rcu_ctrlblk.donetail != rcu_ctrlblk.curtail) { 53 if (rcu_ctrlblk.donetail != rcu_ctrlblk.curtail) {
54 rcu_ctrlblk.donetail = rcu_ctrlblk.curtail; 54 rcu_ctrlblk.donetail = rcu_ctrlblk.curtail;
55 raise_softirq(RCU_SOFTIRQ); 55 raise_softirq_irqoff(RCU_SOFTIRQ);
56 } 56 }
57 local_irq_restore(flags); 57 local_irq_restore(flags);
58} 58}
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index acd6ccf56faf..b4d88a594785 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -102,11 +102,6 @@ int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
102/* Number of rcu_nodes at specified level. */ 102/* Number of rcu_nodes at specified level. */
103int num_rcu_lvl[] = NUM_RCU_LVL_INIT; 103int num_rcu_lvl[] = NUM_RCU_LVL_INIT;
104int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ 104int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
105/* panic() on RCU Stall sysctl. */
106int sysctl_panic_on_rcu_stall __read_mostly;
107/* Commandeer a sysrq key to dump RCU's tree. */
108static bool sysrq_rcu;
109module_param(sysrq_rcu, bool, 0444);
110 105
111/* 106/*
112 * The rcu_scheduler_active variable is initialized to the value 107 * The rcu_scheduler_active variable is initialized to the value
@@ -149,7 +144,7 @@ static void sync_sched_exp_online_cleanup(int cpu);
149 144
150/* rcuc/rcub kthread realtime priority */ 145/* rcuc/rcub kthread realtime priority */
151static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0; 146static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0;
152module_param(kthread_prio, int, 0644); 147module_param(kthread_prio, int, 0444);
153 148
154/* Delay in jiffies for grace-period initialization delays, debug only. */ 149/* Delay in jiffies for grace-period initialization delays, debug only. */
155 150
@@ -406,7 +401,7 @@ static bool rcu_kick_kthreads;
406 */ 401 */
407static ulong jiffies_till_sched_qs = ULONG_MAX; 402static ulong jiffies_till_sched_qs = ULONG_MAX;
408module_param(jiffies_till_sched_qs, ulong, 0444); 403module_param(jiffies_till_sched_qs, ulong, 0444);
409static ulong jiffies_to_sched_qs; /* Adjusted version of above if not default */ 404static ulong jiffies_to_sched_qs; /* See adjust_jiffies_till_sched_qs(). */
410module_param(jiffies_to_sched_qs, ulong, 0444); /* Display only! */ 405module_param(jiffies_to_sched_qs, ulong, 0444); /* Display only! */
411 406
412/* 407/*
@@ -424,6 +419,7 @@ static void adjust_jiffies_till_sched_qs(void)
424 WRITE_ONCE(jiffies_to_sched_qs, jiffies_till_sched_qs); 419 WRITE_ONCE(jiffies_to_sched_qs, jiffies_till_sched_qs);
425 return; 420 return;
426 } 421 }
422 /* Otherwise, set to third fqs scan, but bound below on large systems. */
427 j = READ_ONCE(jiffies_till_first_fqs) + 423 j = READ_ONCE(jiffies_till_first_fqs) +
428 2 * READ_ONCE(jiffies_till_next_fqs); 424 2 * READ_ONCE(jiffies_till_next_fqs);
429 if (j < HZ / 10 + nr_cpu_ids / RCU_JIFFIES_FQS_DIV) 425 if (j < HZ / 10 + nr_cpu_ids / RCU_JIFFIES_FQS_DIV)
@@ -513,74 +509,6 @@ static const char *gp_state_getname(short gs)
513} 509}
514 510
515/* 511/*
516 * Show the state of the grace-period kthreads.
517 */
518void show_rcu_gp_kthreads(void)
519{
520 int cpu;
521 unsigned long j;
522 unsigned long ja;
523 unsigned long jr;
524 unsigned long jw;
525 struct rcu_data *rdp;
526 struct rcu_node *rnp;
527
528 j = jiffies;
529 ja = j - READ_ONCE(rcu_state.gp_activity);
530 jr = j - READ_ONCE(rcu_state.gp_req_activity);
531 jw = j - READ_ONCE(rcu_state.gp_wake_time);
532 pr_info("%s: wait state: %s(%d) ->state: %#lx delta ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_flags %#x\n",
533 rcu_state.name, gp_state_getname(rcu_state.gp_state),
534 rcu_state.gp_state,
535 rcu_state.gp_kthread ? rcu_state.gp_kthread->state : 0x1ffffL,
536 ja, jr, jw, (long)READ_ONCE(rcu_state.gp_wake_seq),
537 (long)READ_ONCE(rcu_state.gp_seq),
538 (long)READ_ONCE(rcu_get_root()->gp_seq_needed),
539 READ_ONCE(rcu_state.gp_flags));
540 rcu_for_each_node_breadth_first(rnp) {
541 if (ULONG_CMP_GE(rcu_state.gp_seq, rnp->gp_seq_needed))
542 continue;
543 pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld\n",
544 rnp->grplo, rnp->grphi, (long)rnp->gp_seq,
545 (long)rnp->gp_seq_needed);
546 if (!rcu_is_leaf_node(rnp))
547 continue;
548 for_each_leaf_node_possible_cpu(rnp, cpu) {
549 rdp = per_cpu_ptr(&rcu_data, cpu);
550 if (rdp->gpwrap ||
551 ULONG_CMP_GE(rcu_state.gp_seq,
552 rdp->gp_seq_needed))
553 continue;
554 pr_info("\tcpu %d ->gp_seq_needed %ld\n",
555 cpu, (long)rdp->gp_seq_needed);
556 }
557 }
558 /* sched_show_task(rcu_state.gp_kthread); */
559}
560EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads);
561
562/* Dump grace-period-request information due to commandeered sysrq. */
563static void sysrq_show_rcu(int key)
564{
565 show_rcu_gp_kthreads();
566}
567
568static struct sysrq_key_op sysrq_rcudump_op = {
569 .handler = sysrq_show_rcu,
570 .help_msg = "show-rcu(y)",
571 .action_msg = "Show RCU tree",
572 .enable_mask = SYSRQ_ENABLE_DUMP,
573};
574
575static int __init rcu_sysrq_init(void)
576{
577 if (sysrq_rcu)
578 return register_sysrq_key('y', &sysrq_rcudump_op);
579 return 0;
580}
581early_initcall(rcu_sysrq_init);
582
583/*
584 * Send along grace-period-related data for rcutorture diagnostics. 512 * Send along grace-period-related data for rcutorture diagnostics.
585 */ 513 */
586void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, 514void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
@@ -1034,27 +962,6 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp)
1034} 962}
1035 963
1036/* 964/*
1037 * Handler for the irq_work request posted when a grace period has
1038 * gone on for too long, but not yet long enough for an RCU CPU
1039 * stall warning. Set state appropriately, but just complain if
1040 * there is unexpected state on entry.
1041 */
1042static void rcu_iw_handler(struct irq_work *iwp)
1043{
1044 struct rcu_data *rdp;
1045 struct rcu_node *rnp;
1046
1047 rdp = container_of(iwp, struct rcu_data, rcu_iw);
1048 rnp = rdp->mynode;
1049 raw_spin_lock_rcu_node(rnp);
1050 if (!WARN_ON_ONCE(!rdp->rcu_iw_pending)) {
1051 rdp->rcu_iw_gp_seq = rnp->gp_seq;
1052 rdp->rcu_iw_pending = false;
1053 }
1054 raw_spin_unlock_rcu_node(rnp);
1055}
1056
1057/*
1058 * Return true if the specified CPU has passed through a quiescent 965 * Return true if the specified CPU has passed through a quiescent
1059 * state by virtue of being in or having passed through an dynticks 966 * state by virtue of being in or having passed through an dynticks
1060 * idle state since the last call to dyntick_save_progress_counter() 967 * idle state since the last call to dyntick_save_progress_counter()
@@ -1167,295 +1074,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
1167 return 0; 1074 return 0;
1168} 1075}
1169 1076
1170static void record_gp_stall_check_time(void)
1171{
1172 unsigned long j = jiffies;
1173 unsigned long j1;
1174
1175 rcu_state.gp_start = j;
1176 j1 = rcu_jiffies_till_stall_check();
1177 /* Record ->gp_start before ->jiffies_stall. */
1178 smp_store_release(&rcu_state.jiffies_stall, j + j1); /* ^^^ */
1179 rcu_state.jiffies_resched = j + j1 / 2;
1180 rcu_state.n_force_qs_gpstart = READ_ONCE(rcu_state.n_force_qs);
1181}
1182
1183/*
1184 * Complain about starvation of grace-period kthread.
1185 */
1186static void rcu_check_gp_kthread_starvation(void)
1187{
1188 struct task_struct *gpk = rcu_state.gp_kthread;
1189 unsigned long j;
1190
1191 j = jiffies - READ_ONCE(rcu_state.gp_activity);
1192 if (j > 2 * HZ) {
1193 pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n",
1194 rcu_state.name, j,
1195 (long)rcu_seq_current(&rcu_state.gp_seq),
1196 READ_ONCE(rcu_state.gp_flags),
1197 gp_state_getname(rcu_state.gp_state), rcu_state.gp_state,
1198 gpk ? gpk->state : ~0, gpk ? task_cpu(gpk) : -1);
1199 if (gpk) {
1200 pr_err("RCU grace-period kthread stack dump:\n");
1201 sched_show_task(gpk);
1202 wake_up_process(gpk);
1203 }
1204 }
1205}
1206
1207/*
1208 * Dump stacks of all tasks running on stalled CPUs. First try using
1209 * NMIs, but fall back to manual remote stack tracing on architectures
1210 * that don't support NMI-based stack dumps. The NMI-triggered stack
1211 * traces are more accurate because they are printed by the target CPU.
1212 */
1213static void rcu_dump_cpu_stacks(void)
1214{
1215 int cpu;
1216 unsigned long flags;
1217 struct rcu_node *rnp;
1218
1219 rcu_for_each_leaf_node(rnp) {
1220 raw_spin_lock_irqsave_rcu_node(rnp, flags);
1221 for_each_leaf_node_possible_cpu(rnp, cpu)
1222 if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu))
1223 if (!trigger_single_cpu_backtrace(cpu))
1224 dump_cpu_task(cpu);
1225 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1226 }
1227}
1228
1229/*
1230 * If too much time has passed in the current grace period, and if
1231 * so configured, go kick the relevant kthreads.
1232 */
1233static void rcu_stall_kick_kthreads(void)
1234{
1235 unsigned long j;
1236
1237 if (!rcu_kick_kthreads)
1238 return;
1239 j = READ_ONCE(rcu_state.jiffies_kick_kthreads);
1240 if (time_after(jiffies, j) && rcu_state.gp_kthread &&
1241 (rcu_gp_in_progress() || READ_ONCE(rcu_state.gp_flags))) {
1242 WARN_ONCE(1, "Kicking %s grace-period kthread\n",
1243 rcu_state.name);
1244 rcu_ftrace_dump(DUMP_ALL);
1245 wake_up_process(rcu_state.gp_kthread);
1246 WRITE_ONCE(rcu_state.jiffies_kick_kthreads, j + HZ);
1247 }
1248}
1249
1250static void panic_on_rcu_stall(void)
1251{
1252 if (sysctl_panic_on_rcu_stall)
1253 panic("RCU Stall\n");
1254}
1255
1256static void print_other_cpu_stall(unsigned long gp_seq)
1257{
1258 int cpu;
1259 unsigned long flags;
1260 unsigned long gpa;
1261 unsigned long j;
1262 int ndetected = 0;
1263 struct rcu_node *rnp = rcu_get_root();
1264 long totqlen = 0;
1265
1266 /* Kick and suppress, if so configured. */
1267 rcu_stall_kick_kthreads();
1268 if (rcu_cpu_stall_suppress)
1269 return;
1270
1271 /*
1272 * OK, time to rat on our buddy...
1273 * See Documentation/RCU/stallwarn.txt for info on how to debug
1274 * RCU CPU stall warnings.
1275 */
1276 pr_err("INFO: %s detected stalls on CPUs/tasks:", rcu_state.name);
1277 print_cpu_stall_info_begin();
1278 rcu_for_each_leaf_node(rnp) {
1279 raw_spin_lock_irqsave_rcu_node(rnp, flags);
1280 ndetected += rcu_print_task_stall(rnp);
1281 if (rnp->qsmask != 0) {
1282 for_each_leaf_node_possible_cpu(rnp, cpu)
1283 if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) {
1284 print_cpu_stall_info(cpu);
1285 ndetected++;
1286 }
1287 }
1288 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1289 }
1290
1291 print_cpu_stall_info_end();
1292 for_each_possible_cpu(cpu)
1293 totqlen += rcu_get_n_cbs_cpu(cpu);
1294 pr_cont("(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n",
1295 smp_processor_id(), (long)(jiffies - rcu_state.gp_start),
1296 (long)rcu_seq_current(&rcu_state.gp_seq), totqlen);
1297 if (ndetected) {
1298 rcu_dump_cpu_stacks();
1299
1300 /* Complain about tasks blocking the grace period. */
1301 rcu_print_detail_task_stall();
1302 } else {
1303 if (rcu_seq_current(&rcu_state.gp_seq) != gp_seq) {
1304 pr_err("INFO: Stall ended before state dump start\n");
1305 } else {
1306 j = jiffies;
1307 gpa = READ_ONCE(rcu_state.gp_activity);
1308 pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n",
1309 rcu_state.name, j - gpa, j, gpa,
1310 READ_ONCE(jiffies_till_next_fqs),
1311 rcu_get_root()->qsmask);
1312 /* In this case, the current CPU might be at fault. */
1313 sched_show_task(current);
1314 }
1315 }
1316 /* Rewrite if needed in case of slow consoles. */
1317 if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall)))
1318 WRITE_ONCE(rcu_state.jiffies_stall,
1319 jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
1320
1321 rcu_check_gp_kthread_starvation();
1322
1323 panic_on_rcu_stall();
1324
1325 rcu_force_quiescent_state(); /* Kick them all. */
1326}
1327
1328static void print_cpu_stall(void)
1329{
1330 int cpu;
1331 unsigned long flags;
1332 struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
1333 struct rcu_node *rnp = rcu_get_root();
1334 long totqlen = 0;
1335
1336 /* Kick and suppress, if so configured. */
1337 rcu_stall_kick_kthreads();
1338 if (rcu_cpu_stall_suppress)
1339 return;
1340
1341 /*
1342 * OK, time to rat on ourselves...
1343 * See Documentation/RCU/stallwarn.txt for info on how to debug
1344 * RCU CPU stall warnings.
1345 */
1346 pr_err("INFO: %s self-detected stall on CPU", rcu_state.name);
1347 print_cpu_stall_info_begin();
1348 raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags);
1349 print_cpu_stall_info(smp_processor_id());
1350 raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags);
1351 print_cpu_stall_info_end();
1352 for_each_possible_cpu(cpu)
1353 totqlen += rcu_get_n_cbs_cpu(cpu);
1354 pr_cont(" (t=%lu jiffies g=%ld q=%lu)\n",
1355 jiffies - rcu_state.gp_start,
1356 (long)rcu_seq_current(&rcu_state.gp_seq), totqlen);
1357
1358 rcu_check_gp_kthread_starvation();
1359
1360 rcu_dump_cpu_stacks();
1361
1362 raw_spin_lock_irqsave_rcu_node(rnp, flags);
1363 /* Rewrite if needed in case of slow consoles. */
1364 if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall)))
1365 WRITE_ONCE(rcu_state.jiffies_stall,
1366 jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
1367 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1368
1369 panic_on_rcu_stall();
1370
1371 /*
1372 * Attempt to revive the RCU machinery by forcing a context switch.
1373 *
1374 * A context switch would normally allow the RCU state machine to make
1375 * progress and it could be we're stuck in kernel space without context
1376 * switches for an entirely unreasonable amount of time.
1377 */
1378 set_tsk_need_resched(current);
1379 set_preempt_need_resched();
1380}
1381
1382static void check_cpu_stall(struct rcu_data *rdp)
1383{
1384 unsigned long gs1;
1385 unsigned long gs2;
1386 unsigned long gps;
1387 unsigned long j;
1388 unsigned long jn;
1389 unsigned long js;
1390 struct rcu_node *rnp;
1391
1392 if ((rcu_cpu_stall_suppress && !rcu_kick_kthreads) ||
1393 !rcu_gp_in_progress())
1394 return;
1395 rcu_stall_kick_kthreads();
1396 j = jiffies;
1397
1398 /*
1399 * Lots of memory barriers to reject false positives.
1400 *
1401 * The idea is to pick up rcu_state.gp_seq, then
1402 * rcu_state.jiffies_stall, then rcu_state.gp_start, and finally
1403 * another copy of rcu_state.gp_seq. These values are updated in
1404 * the opposite order with memory barriers (or equivalent) during
1405 * grace-period initialization and cleanup. Now, a false positive
1406 * can occur if we get a new value of rcu_state.gp_start and an old
1407 * value of rcu_state.jiffies_stall. But given the memory barriers,
1408 * the only way that this can happen is if one grace period ends
1409 * and another starts between these two fetches. This is detected
1410 * by comparing the second fetch of rcu_state.gp_seq with the
1411 * previous fetch from rcu_state.gp_seq.
1412 *
1413 * Given this check, comparisons of jiffies, rcu_state.jiffies_stall,
1414 * and rcu_state.gp_start suffice to forestall false positives.
1415 */
1416 gs1 = READ_ONCE(rcu_state.gp_seq);
1417 smp_rmb(); /* Pick up ->gp_seq first... */
1418 js = READ_ONCE(rcu_state.jiffies_stall);
1419 smp_rmb(); /* ...then ->jiffies_stall before the rest... */
1420 gps = READ_ONCE(rcu_state.gp_start);
1421 smp_rmb(); /* ...and finally ->gp_start before ->gp_seq again. */
1422 gs2 = READ_ONCE(rcu_state.gp_seq);
1423 if (gs1 != gs2 ||
1424 ULONG_CMP_LT(j, js) ||
1425 ULONG_CMP_GE(gps, js))
1426 return; /* No stall or GP completed since entering function. */
1427 rnp = rdp->mynode;
1428 jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
1429 if (rcu_gp_in_progress() &&
1430 (READ_ONCE(rnp->qsmask) & rdp->grpmask) &&
1431 cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) {
1432
1433 /* We haven't checked in, so go dump stack. */
1434 print_cpu_stall();
1435
1436 } else if (rcu_gp_in_progress() &&
1437 ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) &&
1438 cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) {
1439
1440 /* They had a few time units to dump stack, so complain. */
1441 print_other_cpu_stall(gs2);
1442 }
1443}
1444
1445/**
1446 * rcu_cpu_stall_reset - prevent further stall warnings in current grace period
1447 *
1448 * Set the stall-warning timeout way off into the future, thus preventing
1449 * any RCU CPU stall-warning messages from appearing in the current set of
1450 * RCU grace periods.
1451 *
1452 * The caller must disable hard irqs.
1453 */
1454void rcu_cpu_stall_reset(void)
1455{
1456 WRITE_ONCE(rcu_state.jiffies_stall, jiffies + ULONG_MAX / 2);
1457}
1458
1459/* Trace-event wrapper function for trace_rcu_future_grace_period. */ 1077/* Trace-event wrapper function for trace_rcu_future_grace_period. */
1460static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, 1078static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp,
1461 unsigned long gp_seq_req, const char *s) 1079 unsigned long gp_seq_req, const char *s)
@@ -1585,7 +1203,7 @@ static bool rcu_future_gp_cleanup(struct rcu_node *rnp)
1585static void rcu_gp_kthread_wake(void) 1203static void rcu_gp_kthread_wake(void)
1586{ 1204{
1587 if ((current == rcu_state.gp_kthread && 1205 if ((current == rcu_state.gp_kthread &&
1588 !in_interrupt() && !in_serving_softirq()) || 1206 !in_irq() && !in_serving_softirq()) ||
1589 !READ_ONCE(rcu_state.gp_flags) || 1207 !READ_ONCE(rcu_state.gp_flags) ||
1590 !rcu_state.gp_kthread) 1208 !rcu_state.gp_kthread)
1591 return; 1209 return;
@@ -2295,11 +1913,10 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp)
2295 return; 1913 return;
2296 } 1914 }
2297 mask = rdp->grpmask; 1915 mask = rdp->grpmask;
1916 rdp->core_needs_qs = false;
2298 if ((rnp->qsmask & mask) == 0) { 1917 if ((rnp->qsmask & mask) == 0) {
2299 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 1918 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
2300 } else { 1919 } else {
2301 rdp->core_needs_qs = false;
2302
2303 /* 1920 /*
2304 * This GP can't end until cpu checks in, so all of our 1921 * This GP can't end until cpu checks in, so all of our
2305 * callbacks can be processed during the next GP. 1922 * callbacks can be processed during the next GP.
@@ -2548,11 +2165,11 @@ void rcu_sched_clock_irq(int user)
2548} 2165}
2549 2166
2550/* 2167/*
2551 * Scan the leaf rcu_node structures, processing dyntick state for any that 2168 * Scan the leaf rcu_node structures. For each structure on which all
2552 * have not yet encountered a quiescent state, using the function specified. 2169 * CPUs have reported a quiescent state and on which there are tasks
2553 * Also initiate boosting for any threads blocked on the root rcu_node. 2170 * blocking the current grace period, initiate RCU priority boosting.
2554 * 2171 * Otherwise, invoke the specified function to check dyntick state for
2555 * The caller must have suppressed start of new grace periods. 2172 * each CPU that has not yet reported a quiescent state.
2556 */ 2173 */
2557static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) 2174static void force_qs_rnp(int (*f)(struct rcu_data *rdp))
2558{ 2175{
@@ -2635,101 +2252,6 @@ void rcu_force_quiescent_state(void)
2635} 2252}
2636EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); 2253EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
2637 2254
2638/*
2639 * This function checks for grace-period requests that fail to motivate
2640 * RCU to come out of its idle mode.
2641 */
2642void
2643rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp,
2644 const unsigned long gpssdelay)
2645{
2646 unsigned long flags;
2647 unsigned long j;
2648 struct rcu_node *rnp_root = rcu_get_root();
2649 static atomic_t warned = ATOMIC_INIT(0);
2650
2651 if (!IS_ENABLED(CONFIG_PROVE_RCU) || rcu_gp_in_progress() ||
2652 ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed))
2653 return;
2654 j = jiffies; /* Expensive access, and in common case don't get here. */
2655 if (time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) ||
2656 time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) ||
2657 atomic_read(&warned))
2658 return;
2659
2660 raw_spin_lock_irqsave_rcu_node(rnp, flags);
2661 j = jiffies;
2662 if (rcu_gp_in_progress() ||
2663 ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) ||
2664 time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) ||
2665 time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) ||
2666 atomic_read(&warned)) {
2667 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
2668 return;
2669 }
2670 /* Hold onto the leaf lock to make others see warned==1. */
2671
2672 if (rnp_root != rnp)
2673 raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */
2674 j = jiffies;
2675 if (rcu_gp_in_progress() ||
2676 ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) ||
2677 time_before(j, rcu_state.gp_req_activity + gpssdelay) ||
2678 time_before(j, rcu_state.gp_activity + gpssdelay) ||
2679 atomic_xchg(&warned, 1)) {
2680 raw_spin_unlock_rcu_node(rnp_root); /* irqs remain disabled. */
2681 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
2682 return;
2683 }
2684 WARN_ON(1);
2685 if (rnp_root != rnp)
2686 raw_spin_unlock_rcu_node(rnp_root);
2687 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
2688 show_rcu_gp_kthreads();
2689}
2690
2691/*
2692 * Do a forward-progress check for rcutorture. This is normally invoked
2693 * due to an OOM event. The argument "j" gives the time period during
2694 * which rcutorture would like progress to have been made.
2695 */
2696void rcu_fwd_progress_check(unsigned long j)
2697{
2698 unsigned long cbs;
2699 int cpu;
2700 unsigned long max_cbs = 0;
2701 int max_cpu = -1;
2702 struct rcu_data *rdp;
2703
2704 if (rcu_gp_in_progress()) {
2705 pr_info("%s: GP age %lu jiffies\n",
2706 __func__, jiffies - rcu_state.gp_start);
2707 show_rcu_gp_kthreads();
2708 } else {
2709 pr_info("%s: Last GP end %lu jiffies ago\n",
2710 __func__, jiffies - rcu_state.gp_end);
2711 preempt_disable();
2712 rdp = this_cpu_ptr(&rcu_data);
2713 rcu_check_gp_start_stall(rdp->mynode, rdp, j);
2714 preempt_enable();
2715 }
2716 for_each_possible_cpu(cpu) {
2717 cbs = rcu_get_n_cbs_cpu(cpu);
2718 if (!cbs)
2719 continue;
2720 if (max_cpu < 0)
2721 pr_info("%s: callbacks", __func__);
2722 pr_cont(" %d: %lu", cpu, cbs);
2723 if (cbs <= max_cbs)
2724 continue;
2725 max_cbs = cbs;
2726 max_cpu = cpu;
2727 }
2728 if (max_cpu >= 0)
2729 pr_cont("\n");
2730}
2731EXPORT_SYMBOL_GPL(rcu_fwd_progress_check);
2732
2733/* Perform RCU core processing work for the current CPU. */ 2255/* Perform RCU core processing work for the current CPU. */
2734static __latent_entropy void rcu_core(struct softirq_action *unused) 2256static __latent_entropy void rcu_core(struct softirq_action *unused)
2735{ 2257{
@@ -2870,7 +2392,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy)
2870 * Use rcu:rcu_callback trace event to find the previous 2392 * Use rcu:rcu_callback trace event to find the previous
2871 * time callback was passed to __call_rcu(). 2393 * time callback was passed to __call_rcu().
2872 */ 2394 */
2873 WARN_ONCE(1, "__call_rcu(): Double-freed CB %p->%pF()!!!\n", 2395 WARN_ONCE(1, "__call_rcu(): Double-freed CB %p->%pS()!!!\n",
2874 head, head->func); 2396 head, head->func);
2875 WRITE_ONCE(head->func, rcu_leak_callback); 2397 WRITE_ONCE(head->func, rcu_leak_callback);
2876 return; 2398 return;
@@ -3559,13 +3081,11 @@ static int rcu_pm_notify(struct notifier_block *self,
3559 switch (action) { 3081 switch (action) {
3560 case PM_HIBERNATION_PREPARE: 3082 case PM_HIBERNATION_PREPARE:
3561 case PM_SUSPEND_PREPARE: 3083 case PM_SUSPEND_PREPARE:
3562 if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */ 3084 rcu_expedite_gp();
3563 rcu_expedite_gp();
3564 break; 3085 break;
3565 case PM_POST_HIBERNATION: 3086 case PM_POST_HIBERNATION:
3566 case PM_POST_SUSPEND: 3087 case PM_POST_SUSPEND:
3567 if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */ 3088 rcu_unexpedite_gp();
3568 rcu_unexpedite_gp();
3569 break; 3089 break;
3570 default: 3090 default:
3571 break; 3091 break;
@@ -3742,8 +3262,7 @@ static void __init rcu_init_geometry(void)
3742 jiffies_till_first_fqs = d; 3262 jiffies_till_first_fqs = d;
3743 if (jiffies_till_next_fqs == ULONG_MAX) 3263 if (jiffies_till_next_fqs == ULONG_MAX)
3744 jiffies_till_next_fqs = d; 3264 jiffies_till_next_fqs = d;
3745 if (jiffies_till_sched_qs == ULONG_MAX) 3265 adjust_jiffies_till_sched_qs();
3746 adjust_jiffies_till_sched_qs();
3747 3266
3748 /* If the compile-time values are accurate, just leave. */ 3267 /* If the compile-time values are accurate, just leave. */
3749 if (rcu_fanout_leaf == RCU_FANOUT_LEAF && 3268 if (rcu_fanout_leaf == RCU_FANOUT_LEAF &&
@@ -3858,5 +3377,6 @@ void __init rcu_init(void)
3858 srcu_init(); 3377 srcu_init();
3859} 3378}
3860 3379
3380#include "tree_stall.h"
3861#include "tree_exp.h" 3381#include "tree_exp.h"
3862#include "tree_plugin.h" 3382#include "tree_plugin.h"
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index bb4f995f2d3f..e253d11af3c4 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -393,15 +393,13 @@ static const char *tp_rcu_varname __used __tracepoint_string = rcu_name;
393 393
394int rcu_dynticks_snap(struct rcu_data *rdp); 394int rcu_dynticks_snap(struct rcu_data *rdp);
395 395
396/* Forward declarations for rcutree_plugin.h */ 396/* Forward declarations for tree_plugin.h */
397static void rcu_bootup_announce(void); 397static void rcu_bootup_announce(void);
398static void rcu_qs(void); 398static void rcu_qs(void);
399static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); 399static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
400#ifdef CONFIG_HOTPLUG_CPU 400#ifdef CONFIG_HOTPLUG_CPU
401static bool rcu_preempt_has_tasks(struct rcu_node *rnp); 401static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
402#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 402#endif /* #ifdef CONFIG_HOTPLUG_CPU */
403static void rcu_print_detail_task_stall(void);
404static int rcu_print_task_stall(struct rcu_node *rnp);
405static int rcu_print_task_exp_stall(struct rcu_node *rnp); 403static int rcu_print_task_exp_stall(struct rcu_node *rnp);
406static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); 404static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
407static void rcu_flavor_sched_clock_irq(int user); 405static void rcu_flavor_sched_clock_irq(int user);
@@ -418,9 +416,6 @@ static void rcu_prepare_for_idle(void);
418static bool rcu_preempt_has_tasks(struct rcu_node *rnp); 416static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
419static bool rcu_preempt_need_deferred_qs(struct task_struct *t); 417static bool rcu_preempt_need_deferred_qs(struct task_struct *t);
420static void rcu_preempt_deferred_qs(struct task_struct *t); 418static void rcu_preempt_deferred_qs(struct task_struct *t);
421static void print_cpu_stall_info_begin(void);
422static void print_cpu_stall_info(int cpu);
423static void print_cpu_stall_info_end(void);
424static void zero_cpu_stall_ticks(struct rcu_data *rdp); 419static void zero_cpu_stall_ticks(struct rcu_data *rdp);
425static bool rcu_nocb_cpu_needs_barrier(int cpu); 420static bool rcu_nocb_cpu_needs_barrier(int cpu);
426static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp); 421static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp);
@@ -445,3 +440,10 @@ static void rcu_bind_gp_kthread(void);
445static bool rcu_nohz_full_cpu(void); 440static bool rcu_nohz_full_cpu(void);
446static void rcu_dynticks_task_enter(void); 441static void rcu_dynticks_task_enter(void);
447static void rcu_dynticks_task_exit(void); 442static void rcu_dynticks_task_exit(void);
443
444/* Forward declarations for tree_stall.h */
445static void record_gp_stall_check_time(void);
446static void rcu_iw_handler(struct irq_work *iwp);
447static void check_cpu_stall(struct rcu_data *rdp);
448static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp,
449 const unsigned long gpssdelay);
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 4c2a0189e748..9c990df880d1 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -10,6 +10,7 @@
10#include <linux/lockdep.h> 10#include <linux/lockdep.h>
11 11
12static void rcu_exp_handler(void *unused); 12static void rcu_exp_handler(void *unused);
13static int rcu_print_task_exp_stall(struct rcu_node *rnp);
13 14
14/* 15/*
15 * Record the start of an expedited grace period. 16 * Record the start of an expedited grace period.
@@ -633,7 +634,7 @@ static void rcu_exp_handler(void *unused)
633 raw_spin_lock_irqsave_rcu_node(rnp, flags); 634 raw_spin_lock_irqsave_rcu_node(rnp, flags);
634 if (rnp->expmask & rdp->grpmask) { 635 if (rnp->expmask & rdp->grpmask) {
635 rdp->deferred_qs = true; 636 rdp->deferred_qs = true;
636 WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, true); 637 t->rcu_read_unlock_special.b.exp_hint = true;
637 } 638 }
638 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 639 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
639 return; 640 return;
@@ -648,7 +649,7 @@ static void rcu_exp_handler(void *unused)
648 * 649 *
649 * If the CPU is fully enabled (or if some buggy RCU-preempt 650 * If the CPU is fully enabled (or if some buggy RCU-preempt
650 * read-side critical section is being used from idle), just 651 * read-side critical section is being used from idle), just
651 * invoke rcu_preempt_defer_qs() to immediately report the 652 * invoke rcu_preempt_deferred_qs() to immediately report the
652 * quiescent state. We cannot use rcu_read_unlock_special() 653 * quiescent state. We cannot use rcu_read_unlock_special()
653 * because we are in an interrupt handler, which will cause that 654 * because we are in an interrupt handler, which will cause that
654 * function to take an early exit without doing anything. 655 * function to take an early exit without doing anything.
@@ -670,6 +671,27 @@ static void sync_sched_exp_online_cleanup(int cpu)
670{ 671{
671} 672}
672 673
674/*
675 * Scan the current list of tasks blocked within RCU read-side critical
676 * sections, printing out the tid of each that is blocking the current
677 * expedited grace period.
678 */
679static int rcu_print_task_exp_stall(struct rcu_node *rnp)
680{
681 struct task_struct *t;
682 int ndetected = 0;
683
684 if (!rnp->exp_tasks)
685 return 0;
686 t = list_entry(rnp->exp_tasks->prev,
687 struct task_struct, rcu_node_entry);
688 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
689 pr_cont(" P%d", t->pid);
690 ndetected++;
691 }
692 return ndetected;
693}
694
673#else /* #ifdef CONFIG_PREEMPT_RCU */ 695#else /* #ifdef CONFIG_PREEMPT_RCU */
674 696
675/* Invoked on each online non-idle CPU for expedited quiescent state. */ 697/* Invoked on each online non-idle CPU for expedited quiescent state. */
@@ -709,6 +731,16 @@ static void sync_sched_exp_online_cleanup(int cpu)
709 WARN_ON_ONCE(ret); 731 WARN_ON_ONCE(ret);
710} 732}
711 733
734/*
735 * Because preemptible RCU does not exist, we never have to check for
736 * tasks blocked within RCU read-side critical sections that are
737 * blocking the current expedited grace period.
738 */
739static int rcu_print_task_exp_stall(struct rcu_node *rnp)
740{
741 return 0;
742}
743
712#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ 744#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
713 745
714/** 746/**
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 97dba50f6fb2..1102765f91fd 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -285,7 +285,7 @@ static void rcu_qs(void)
285 TPS("cpuqs")); 285 TPS("cpuqs"));
286 __this_cpu_write(rcu_data.cpu_no_qs.b.norm, false); 286 __this_cpu_write(rcu_data.cpu_no_qs.b.norm, false);
287 barrier(); /* Coordinate with rcu_flavor_sched_clock_irq(). */ 287 barrier(); /* Coordinate with rcu_flavor_sched_clock_irq(). */
288 current->rcu_read_unlock_special.b.need_qs = false; 288 WRITE_ONCE(current->rcu_read_unlock_special.b.need_qs, false);
289 } 289 }
290} 290}
291 291
@@ -643,100 +643,6 @@ static void rcu_read_unlock_special(struct task_struct *t)
643} 643}
644 644
645/* 645/*
646 * Dump detailed information for all tasks blocking the current RCU
647 * grace period on the specified rcu_node structure.
648 */
649static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
650{
651 unsigned long flags;
652 struct task_struct *t;
653
654 raw_spin_lock_irqsave_rcu_node(rnp, flags);
655 if (!rcu_preempt_blocked_readers_cgp(rnp)) {
656 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
657 return;
658 }
659 t = list_entry(rnp->gp_tasks->prev,
660 struct task_struct, rcu_node_entry);
661 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
662 /*
663 * We could be printing a lot while holding a spinlock.
664 * Avoid triggering hard lockup.
665 */
666 touch_nmi_watchdog();
667 sched_show_task(t);
668 }
669 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
670}
671
672/*
673 * Dump detailed information for all tasks blocking the current RCU
674 * grace period.
675 */
676static void rcu_print_detail_task_stall(void)
677{
678 struct rcu_node *rnp = rcu_get_root();
679
680 rcu_print_detail_task_stall_rnp(rnp);
681 rcu_for_each_leaf_node(rnp)
682 rcu_print_detail_task_stall_rnp(rnp);
683}
684
685static void rcu_print_task_stall_begin(struct rcu_node *rnp)
686{
687 pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):",
688 rnp->level, rnp->grplo, rnp->grphi);
689}
690
691static void rcu_print_task_stall_end(void)
692{
693 pr_cont("\n");
694}
695
696/*
697 * Scan the current list of tasks blocked within RCU read-side critical
698 * sections, printing out the tid of each.
699 */
700static int rcu_print_task_stall(struct rcu_node *rnp)
701{
702 struct task_struct *t;
703 int ndetected = 0;
704
705 if (!rcu_preempt_blocked_readers_cgp(rnp))
706 return 0;
707 rcu_print_task_stall_begin(rnp);
708 t = list_entry(rnp->gp_tasks->prev,
709 struct task_struct, rcu_node_entry);
710 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
711 pr_cont(" P%d", t->pid);
712 ndetected++;
713 }
714 rcu_print_task_stall_end();
715 return ndetected;
716}
717
718/*
719 * Scan the current list of tasks blocked within RCU read-side critical
720 * sections, printing out the tid of each that is blocking the current
721 * expedited grace period.
722 */
723static int rcu_print_task_exp_stall(struct rcu_node *rnp)
724{
725 struct task_struct *t;
726 int ndetected = 0;
727
728 if (!rnp->exp_tasks)
729 return 0;
730 t = list_entry(rnp->exp_tasks->prev,
731 struct task_struct, rcu_node_entry);
732 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
733 pr_cont(" P%d", t->pid);
734 ndetected++;
735 }
736 return ndetected;
737}
738
739/*
740 * Check that the list of blocked tasks for the newly completed grace 646 * Check that the list of blocked tasks for the newly completed grace
741 * period is in fact empty. It is a serious bug to complete a grace 647 * period is in fact empty. It is a serious bug to complete a grace
742 * period that still has RCU readers blocked! This function must be 648 * period that still has RCU readers blocked! This function must be
@@ -804,19 +710,25 @@ static void rcu_flavor_sched_clock_irq(int user)
804 710
805/* 711/*
806 * Check for a task exiting while in a preemptible-RCU read-side 712 * Check for a task exiting while in a preemptible-RCU read-side
807 * critical section, clean up if so. No need to issue warnings, 713 * critical section, clean up if so. No need to issue warnings, as
808 * as debug_check_no_locks_held() already does this if lockdep 714 * debug_check_no_locks_held() already does this if lockdep is enabled.
809 * is enabled. 715 * Besides, if this function does anything other than just immediately
716 * return, there was a bug of some sort. Spewing warnings from this
717 * function is like as not to simply obscure important prior warnings.
810 */ 718 */
811void exit_rcu(void) 719void exit_rcu(void)
812{ 720{
813 struct task_struct *t = current; 721 struct task_struct *t = current;
814 722
815 if (likely(list_empty(&current->rcu_node_entry))) 723 if (unlikely(!list_empty(&current->rcu_node_entry))) {
724 t->rcu_read_lock_nesting = 1;
725 barrier();
726 WRITE_ONCE(t->rcu_read_unlock_special.b.blocked, true);
727 } else if (unlikely(t->rcu_read_lock_nesting)) {
728 t->rcu_read_lock_nesting = 1;
729 } else {
816 return; 730 return;
817 t->rcu_read_lock_nesting = 1; 731 }
818 barrier();
819 t->rcu_read_unlock_special.b.blocked = true;
820 __rcu_read_unlock(); 732 __rcu_read_unlock();
821 rcu_preempt_deferred_qs(current); 733 rcu_preempt_deferred_qs(current);
822} 734}
@@ -980,33 +892,6 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
980static void rcu_preempt_deferred_qs(struct task_struct *t) { } 892static void rcu_preempt_deferred_qs(struct task_struct *t) { }
981 893
982/* 894/*
983 * Because preemptible RCU does not exist, we never have to check for
984 * tasks blocked within RCU read-side critical sections.
985 */
986static void rcu_print_detail_task_stall(void)
987{
988}
989
990/*
991 * Because preemptible RCU does not exist, we never have to check for
992 * tasks blocked within RCU read-side critical sections.
993 */
994static int rcu_print_task_stall(struct rcu_node *rnp)
995{
996 return 0;
997}
998
999/*
1000 * Because preemptible RCU does not exist, we never have to check for
1001 * tasks blocked within RCU read-side critical sections that are
1002 * blocking the current expedited grace period.
1003 */
1004static int rcu_print_task_exp_stall(struct rcu_node *rnp)
1005{
1006 return 0;
1007}
1008
1009/*
1010 * Because there is no preemptible RCU, there can be no readers blocked, 895 * Because there is no preemptible RCU, there can be no readers blocked,
1011 * so there is no need to check for blocked tasks. So check only for 896 * so there is no need to check for blocked tasks. So check only for
1012 * bogus qsmask values. 897 * bogus qsmask values.
@@ -1185,8 +1070,6 @@ static int rcu_boost_kthread(void *arg)
1185static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) 1070static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1186 __releases(rnp->lock) 1071 __releases(rnp->lock)
1187{ 1072{
1188 struct task_struct *t;
1189
1190 raw_lockdep_assert_held_rcu_node(rnp); 1073 raw_lockdep_assert_held_rcu_node(rnp);
1191 if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { 1074 if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) {
1192 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 1075 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
@@ -1200,9 +1083,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1200 if (rnp->exp_tasks == NULL) 1083 if (rnp->exp_tasks == NULL)
1201 rnp->boost_tasks = rnp->gp_tasks; 1084 rnp->boost_tasks = rnp->gp_tasks;
1202 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 1085 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1203 t = rnp->boost_kthread_task; 1086 rcu_wake_cond(rnp->boost_kthread_task,
1204 if (t) 1087 rnp->boost_kthread_status);
1205 rcu_wake_cond(t, rnp->boost_kthread_status);
1206 } else { 1088 } else {
1207 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 1089 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1208 } 1090 }
@@ -1649,98 +1531,6 @@ static void rcu_cleanup_after_idle(void)
1649 1531
1650#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ 1532#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
1651 1533
1652#ifdef CONFIG_RCU_FAST_NO_HZ
1653
1654static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
1655{
1656 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
1657
1658 sprintf(cp, "last_accelerate: %04lx/%04lx, Nonlazy posted: %c%c%c",
1659 rdp->last_accelerate & 0xffff, jiffies & 0xffff,
1660 ".l"[rdp->all_lazy],
1661 ".L"[!rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)],
1662 ".D"[!rdp->tick_nohz_enabled_snap]);
1663}
1664
1665#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
1666
1667static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
1668{
1669 *cp = '\0';
1670}
1671
1672#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */
1673
1674/* Initiate the stall-info list. */
1675static void print_cpu_stall_info_begin(void)
1676{
1677 pr_cont("\n");
1678}
1679
1680/*
1681 * Print out diagnostic information for the specified stalled CPU.
1682 *
1683 * If the specified CPU is aware of the current RCU grace period, then
1684 * print the number of scheduling clock interrupts the CPU has taken
1685 * during the time that it has been aware. Otherwise, print the number
1686 * of RCU grace periods that this CPU is ignorant of, for example, "1"
1687 * if the CPU was aware of the previous grace period.
1688 *
1689 * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info.
1690 */
1691static void print_cpu_stall_info(int cpu)
1692{
1693 unsigned long delta;
1694 char fast_no_hz[72];
1695 struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
1696 char *ticks_title;
1697 unsigned long ticks_value;
1698
1699 /*
1700 * We could be printing a lot while holding a spinlock. Avoid
1701 * triggering hard lockup.
1702 */
1703 touch_nmi_watchdog();
1704
1705 ticks_value = rcu_seq_ctr(rcu_state.gp_seq - rdp->gp_seq);
1706 if (ticks_value) {
1707 ticks_title = "GPs behind";
1708 } else {
1709 ticks_title = "ticks this GP";
1710 ticks_value = rdp->ticks_this_gp;
1711 }
1712 print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
1713 delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq);
1714 pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s\n",
1715 cpu,
1716 "O."[!!cpu_online(cpu)],
1717 "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)],
1718 "N."[!!(rdp->grpmask & rdp->mynode->qsmaskinitnext)],
1719 !IS_ENABLED(CONFIG_IRQ_WORK) ? '?' :
1720 rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' :
1721 "!."[!delta],
1722 ticks_value, ticks_title,
1723 rcu_dynticks_snap(rdp) & 0xfff,
1724 rdp->dynticks_nesting, rdp->dynticks_nmi_nesting,
1725 rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
1726 READ_ONCE(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart,
1727 fast_no_hz);
1728}
1729
1730/* Terminate the stall-info list. */
1731static void print_cpu_stall_info_end(void)
1732{
1733 pr_err("\t");
1734}
1735
1736/* Zero ->ticks_this_gp and snapshot the number of RCU softirq handlers. */
1737static void zero_cpu_stall_ticks(struct rcu_data *rdp)
1738{
1739 rdp->ticks_this_gp = 0;
1740 rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id());
1741 WRITE_ONCE(rdp->last_fqs_resched, jiffies);
1742}
1743
1744#ifdef CONFIG_RCU_NOCB_CPU 1534#ifdef CONFIG_RCU_NOCB_CPU
1745 1535
1746/* 1536/*
@@ -1766,11 +1556,22 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp)
1766 */ 1556 */
1767 1557
1768 1558
1769/* Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. */ 1559/*
1560 * Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters.
1561 * The string after the "rcu_nocbs=" is either "all" for all CPUs, or a
1562 * comma-separated list of CPUs and/or CPU ranges. If an invalid list is
1563 * given, a warning is emitted and all CPUs are offloaded.
1564 */
1770static int __init rcu_nocb_setup(char *str) 1565static int __init rcu_nocb_setup(char *str)
1771{ 1566{
1772 alloc_bootmem_cpumask_var(&rcu_nocb_mask); 1567 alloc_bootmem_cpumask_var(&rcu_nocb_mask);
1773 cpulist_parse(str, rcu_nocb_mask); 1568 if (!strcasecmp(str, "all"))
1569 cpumask_setall(rcu_nocb_mask);
1570 else
1571 if (cpulist_parse(str, rcu_nocb_mask)) {
1572 pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n");
1573 cpumask_setall(rcu_nocb_mask);
1574 }
1774 return 1; 1575 return 1;
1775} 1576}
1776__setup("rcu_nocbs=", rcu_nocb_setup); 1577__setup("rcu_nocbs=", rcu_nocb_setup);
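For reference, the parameter now accepts either the literal string "all" or a standard cpulist, and an unparsable list triggers a warning and offloads every CPU instead of being silently ignored. Illustrative command-line fragments (cpulist syntax as documented in kernel-parameters.txt; the CPU numbers are arbitrary examples):

	rcu_nocbs=all		offload callbacks from every CPU
	rcu_nocbs=1-3,5		offload CPUs 1, 2, 3 and 5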
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
new file mode 100644
index 000000000000..f65a73a97323
--- /dev/null
+++ b/kernel/rcu/tree_stall.h
@@ -0,0 +1,709 @@
1// SPDX-License-Identifier: GPL-2.0+
2/*
3 * RCU CPU stall warnings for normal RCU grace periods
4 *
5 * Copyright IBM Corporation, 2019
6 *
7 * Author: Paul E. McKenney <paulmck@linux.ibm.com>
8 */
9
10//////////////////////////////////////////////////////////////////////////////
11//
12// Controlling CPU stall warnings, including delay calculation.
13
14/* panic() on RCU Stall sysctl. */
15int sysctl_panic_on_rcu_stall __read_mostly;
16
17#ifdef CONFIG_PROVE_RCU
18#define RCU_STALL_DELAY_DELTA (5 * HZ)
19#else
20#define RCU_STALL_DELAY_DELTA 0
21#endif
22
23/* Limit-check stall timeouts specified at boottime and runtime. */
24int rcu_jiffies_till_stall_check(void)
25{
26 int till_stall_check = READ_ONCE(rcu_cpu_stall_timeout);
27
28 /*
29 * Limit check must be consistent with the Kconfig limits
30 * for CONFIG_RCU_CPU_STALL_TIMEOUT.
31 */
32 if (till_stall_check < 3) {
33 WRITE_ONCE(rcu_cpu_stall_timeout, 3);
34 till_stall_check = 3;
35 } else if (till_stall_check > 300) {
36 WRITE_ONCE(rcu_cpu_stall_timeout, 300);
37 till_stall_check = 300;
38 }
39 return till_stall_check * HZ + RCU_STALL_DELAY_DELTA;
40}
41EXPORT_SYMBOL_GPL(rcu_jiffies_till_stall_check);
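The clamp keeps the timeout within the Kconfig range of 3 to 300 seconds before converting to jiffies. As a worked example (assuming the common default CONFIG_RCU_CPU_STALL_TIMEOUT=21, HZ=250, and CONFIG_PROVE_RCU=n, so RCU_STALL_DELAY_DELTA is 0):

	till_stall_check = 21;	/* already within [3, 300], left unchanged */
	return 21 * 250 + 0;	/* 5250 jiffies, i.e. a 21-second stall timeout */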
42
43/* Don't do RCU CPU stall warnings during long sysrq printouts. */
44void rcu_sysrq_start(void)
45{
46 if (!rcu_cpu_stall_suppress)
47 rcu_cpu_stall_suppress = 2;
48}
49
50void rcu_sysrq_end(void)
51{
52 if (rcu_cpu_stall_suppress == 2)
53 rcu_cpu_stall_suppress = 0;
54}
55
56/* Don't print RCU CPU stall warnings during a kernel panic. */
57static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
58{
59 rcu_cpu_stall_suppress = 1;
60 return NOTIFY_DONE;
61}
62
63static struct notifier_block rcu_panic_block = {
64 .notifier_call = rcu_panic,
65};
66
67static int __init check_cpu_stall_init(void)
68{
69 atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
70 return 0;
71}
72early_initcall(check_cpu_stall_init);
73
74/* If so specified via sysctl, panic, yielding cleaner stall-warning output. */
75static void panic_on_rcu_stall(void)
76{
77 if (sysctl_panic_on_rcu_stall)
78 panic("RCU Stall\n");
79}
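The decision is driven by the kernel.panic_on_rcu_stall sysctl (declared here as sysctl_panic_on_rcu_stall and wired up in kernel/sysctl.c). A typical way to enable it for debugging, assuming the standard procfs sysctl interface:

	sysctl -w kernel.panic_on_rcu_stall=1
	# or, equivalently:
	echo 1 > /proc/sys/kernel/panic_on_rcu_stall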
80
81/**
82 * rcu_cpu_stall_reset - prevent further stall warnings in current grace period
83 *
84 * Set the stall-warning timeout way off into the future, thus preventing
85 * any RCU CPU stall-warning messages from appearing in the current set of
86 * RCU grace periods.
87 *
88 * The caller must disable hard irqs.
89 */
90void rcu_cpu_stall_reset(void)
91{
92 WRITE_ONCE(rcu_state.jiffies_stall, jiffies + ULONG_MAX / 2);
93}
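A caller that is about to keep a CPU busy with interrupts off for a long time (a kernel debugger, for instance) would use it roughly as in this sketch; the surrounding operation is a placeholder, only the irq discipline matters:

	unsigned long flags;

	local_irq_save(flags);
	rcu_cpu_stall_reset();	/* push ->jiffies_stall far into the future */
	/* ... long-running work with hard irqs disabled ... */
	local_irq_restore(flags);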
94
95//////////////////////////////////////////////////////////////////////////////
96//
97// Interaction with RCU grace periods
98
99/* Start of new grace period, so record stall time (and forcing times). */
100static void record_gp_stall_check_time(void)
101{
102 unsigned long j = jiffies;
103 unsigned long j1;
104
105 rcu_state.gp_start = j;
106 j1 = rcu_jiffies_till_stall_check();
107 /* Record ->gp_start before ->jiffies_stall. */
108 smp_store_release(&rcu_state.jiffies_stall, j + j1); /* ^^^ */
109 rcu_state.jiffies_resched = j + j1 / 2;
110 rcu_state.n_force_qs_gpstart = READ_ONCE(rcu_state.n_force_qs);
111}
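The smp_store_release() above is the writer half of the ordering that check_cpu_stall() later depends on: ->gp_start is published before ->jiffies_stall, while the reader fetches ->jiffies_stall before ->gp_start with smp_rmb() in between. A simplified view of the pairing (the surrounding ->gp_seq updates and re-checks, which catch a full grace-period transition between the fetches, are omitted):

	/*
	 * writer (grace-period start)              reader (check_cpu_stall())
	 * ---------------------------              --------------------------
	 * rcu_state.gp_start = j;                  js  = READ_ONCE(rcu_state.jiffies_stall);
	 * smp_store_release(                       smp_rmb();
	 *     &rcu_state.jiffies_stall, j + j1);   gps = READ_ONCE(rcu_state.gp_start);
	 *
	 * A reader that sees the newly released ->jiffies_stall is therefore
	 * guaranteed to also see the matching ->gp_start, which is what lets
	 * check_cpu_stall() compare the two without holding any lock.
	 */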
112
113/* Zero ->ticks_this_gp and snapshot the number of RCU softirq handlers. */
114static void zero_cpu_stall_ticks(struct rcu_data *rdp)
115{
116 rdp->ticks_this_gp = 0;
117 rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id());
118 WRITE_ONCE(rdp->last_fqs_resched, jiffies);
119}
120
121/*
122 * If too much time has passed in the current grace period, and if
123 * so configured, go kick the relevant kthreads.
124 */
125static void rcu_stall_kick_kthreads(void)
126{
127 unsigned long j;
128
129 if (!rcu_kick_kthreads)
130 return;
131 j = READ_ONCE(rcu_state.jiffies_kick_kthreads);
132 if (time_after(jiffies, j) && rcu_state.gp_kthread &&
133 (rcu_gp_in_progress() || READ_ONCE(rcu_state.gp_flags))) {
134 WARN_ONCE(1, "Kicking %s grace-period kthread\n",
135 rcu_state.name);
136 rcu_ftrace_dump(DUMP_ALL);
137 wake_up_process(rcu_state.gp_kthread);
138 WRITE_ONCE(rcu_state.jiffies_kick_kthreads, j + HZ);
139 }
140}
141
142/*
143 * Handler for the irq_work request posted about halfway into the RCU CPU
144 * stall timeout, and used to detect excessive irq disabling. Set state
145 * appropriately, but just complain if there is unexpected state on entry.
146 */
147static void rcu_iw_handler(struct irq_work *iwp)
148{
149 struct rcu_data *rdp;
150 struct rcu_node *rnp;
151
152 rdp = container_of(iwp, struct rcu_data, rcu_iw);
153 rnp = rdp->mynode;
154 raw_spin_lock_rcu_node(rnp);
155 if (!WARN_ON_ONCE(!rdp->rcu_iw_pending)) {
156 rdp->rcu_iw_gp_seq = rnp->gp_seq;
157 rdp->rcu_iw_pending = false;
158 }
159 raw_spin_unlock_rcu_node(rnp);
160}
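This handler is the completion side of an irq_work that tree.c posts about halfway into the stall timeout when a CPU appears to be running with irqs disabled; if the target CPU ever re-enables interrupts, the handler runs, records the grace-period number, and clears the pending flag, and print_cpu_stall_info() below folds that state into its per-CPU summary line. The posting side is not part of this file; it follows the usual irq_work pattern, approximately (field names match this file, the condition is simplified):

	/* Posting side, sketch only -- the real check lives in tree.c: */
	if (!rdp->rcu_iw_pending && rdp->rcu_iw_gp_seq != rnp->gp_seq) {
		rdp->rcu_iw_pending = true;
		init_irq_work(&rdp->rcu_iw, rcu_iw_handler);
		irq_work_queue_on(&rdp->rcu_iw, rdp->cpu);
	}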
161
162//////////////////////////////////////////////////////////////////////////////
163//
164// Printing RCU CPU stall warnings
165
166#ifdef CONFIG_PREEMPT
167
168/*
169 * Dump detailed information for all tasks blocking the current RCU
170 * grace period on the specified rcu_node structure.
171 */
172static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
173{
174 unsigned long flags;
175 struct task_struct *t;
176
177 raw_spin_lock_irqsave_rcu_node(rnp, flags);
178 if (!rcu_preempt_blocked_readers_cgp(rnp)) {
179 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
180 return;
181 }
182 t = list_entry(rnp->gp_tasks->prev,
183 struct task_struct, rcu_node_entry);
184 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
185 /*
186 * We could be printing a lot while holding a spinlock.
187 * Avoid triggering hard lockup.
188 */
189 touch_nmi_watchdog();
190 sched_show_task(t);
191 }
192 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
193}
194
195/*
196 * Scan the current list of tasks blocked within RCU read-side critical
197 * sections, printing out the tid of each.
198 */
199static int rcu_print_task_stall(struct rcu_node *rnp)
200{
201 struct task_struct *t;
202 int ndetected = 0;
203
204 if (!rcu_preempt_blocked_readers_cgp(rnp))
205 return 0;
206 pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):",
207 rnp->level, rnp->grplo, rnp->grphi);
208 t = list_entry(rnp->gp_tasks->prev,
209 struct task_struct, rcu_node_entry);
210 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
211 pr_cont(" P%d", t->pid);
212 ndetected++;
213 }
214 pr_cont("\n");
215 return ndetected;
216}
217
218#else /* #ifdef CONFIG_PREEMPT */
219
220/*
221 * Because preemptible RCU does not exist, we never have to check for
222 * tasks blocked within RCU read-side critical sections.
223 */
224static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
225{
226}
227
228/*
229 * Because preemptible RCU does not exist, we never have to check for
230 * tasks blocked within RCU read-side critical sections.
231 */
232static int rcu_print_task_stall(struct rcu_node *rnp)
233{
234 return 0;
235}
236#endif /* #else #ifdef CONFIG_PREEMPT */
237
238/*
239 * Dump stacks of all tasks running on stalled CPUs. First try using
240 * NMIs, but fall back to manual remote stack tracing on architectures
241 * that don't support NMI-based stack dumps. The NMI-triggered stack
242 * traces are more accurate because they are printed by the target CPU.
243 */
244static void rcu_dump_cpu_stacks(void)
245{
246 int cpu;
247 unsigned long flags;
248 struct rcu_node *rnp;
249
250 rcu_for_each_leaf_node(rnp) {
251 raw_spin_lock_irqsave_rcu_node(rnp, flags);
252 for_each_leaf_node_possible_cpu(rnp, cpu)
253 if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu))
254 if (!trigger_single_cpu_backtrace(cpu))
255 dump_cpu_task(cpu);
256 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
257 }
258}
259
260#ifdef CONFIG_RCU_FAST_NO_HZ
261
262static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
263{
264 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
265
266 sprintf(cp, "last_accelerate: %04lx/%04lx, Nonlazy posted: %c%c%c",
267 rdp->last_accelerate & 0xffff, jiffies & 0xffff,
268 ".l"[rdp->all_lazy],
269 ".L"[!rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)],
270 ".D"[!!rdp->tick_nohz_enabled_snap]);
271}
272
273#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
274
275static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
276{
277 *cp = '\0';
278}
279
280#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */
281
282/*
283 * Print out diagnostic information for the specified stalled CPU.
284 *
285 * If the specified CPU is aware of the current RCU grace period, then
286 * print the number of scheduling clock interrupts the CPU has taken
287 * during the time that it has been aware. Otherwise, print the number
288 * of RCU grace periods that this CPU is ignorant of, for example, "1"
289 * if the CPU was aware of the previous grace period.
290 *
291 * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info.
292 */
293static void print_cpu_stall_info(int cpu)
294{
295 unsigned long delta;
296 char fast_no_hz[72];
297 struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
298 char *ticks_title;
299 unsigned long ticks_value;
300
301 /*
302 * We could be printing a lot while holding a spinlock. Avoid
303 * triggering hard lockup.
304 */
305 touch_nmi_watchdog();
306
307 ticks_value = rcu_seq_ctr(rcu_state.gp_seq - rdp->gp_seq);
308 if (ticks_value) {
309 ticks_title = "GPs behind";
310 } else {
311 ticks_title = "ticks this GP";
312 ticks_value = rdp->ticks_this_gp;
313 }
314 print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
315 delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq);
316 pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s\n",
317 cpu,
318 "O."[!!cpu_online(cpu)],
319 "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)],
320 "N."[!!(rdp->grpmask & rdp->mynode->qsmaskinitnext)],
321 !IS_ENABLED(CONFIG_IRQ_WORK) ? '?' :
322 rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' :
323 "!."[!delta],
324 ticks_value, ticks_title,
325 rcu_dynticks_snap(rdp) & 0xfff,
326 rdp->dynticks_nesting, rdp->dynticks_nmi_nesting,
327 rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
328 READ_ONCE(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart,
329 fast_no_hz);
330}
331
332/* Complain about starvation of grace-period kthread. */
333static void rcu_check_gp_kthread_starvation(void)
334{
335 struct task_struct *gpk = rcu_state.gp_kthread;
336 unsigned long j;
337
338 j = jiffies - READ_ONCE(rcu_state.gp_activity);
339 if (j > 2 * HZ) {
340 pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n",
341 rcu_state.name, j,
342 (long)rcu_seq_current(&rcu_state.gp_seq),
343 READ_ONCE(rcu_state.gp_flags),
344 gp_state_getname(rcu_state.gp_state), rcu_state.gp_state,
345 gpk ? gpk->state : ~0, gpk ? task_cpu(gpk) : -1);
346 if (gpk) {
347 pr_err("RCU grace-period kthread stack dump:\n");
348 sched_show_task(gpk);
349 wake_up_process(gpk);
350 }
351 }
352}
353
354static void print_other_cpu_stall(unsigned long gp_seq)
355{
356 int cpu;
357 unsigned long flags;
358 unsigned long gpa;
359 unsigned long j;
360 int ndetected = 0;
361 struct rcu_node *rnp;
362 long totqlen = 0;
363
364 /* Kick and suppress, if so configured. */
365 rcu_stall_kick_kthreads();
366 if (rcu_cpu_stall_suppress)
367 return;
368
369 /*
370 * OK, time to rat on our buddy...
371 * See Documentation/RCU/stallwarn.txt for info on how to debug
372 * RCU CPU stall warnings.
373 */
374 pr_err("INFO: %s detected stalls on CPUs/tasks:\n", rcu_state.name);
375 rcu_for_each_leaf_node(rnp) {
376 raw_spin_lock_irqsave_rcu_node(rnp, flags);
377 ndetected += rcu_print_task_stall(rnp);
378 if (rnp->qsmask != 0) {
379 for_each_leaf_node_possible_cpu(rnp, cpu)
380 if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) {
381 print_cpu_stall_info(cpu);
382 ndetected++;
383 }
384 }
385 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
386 }
387
388 for_each_possible_cpu(cpu)
389 totqlen += rcu_get_n_cbs_cpu(cpu);
390 pr_cont("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n",
391 smp_processor_id(), (long)(jiffies - rcu_state.gp_start),
392 (long)rcu_seq_current(&rcu_state.gp_seq), totqlen);
393 if (ndetected) {
394 rcu_dump_cpu_stacks();
395
396 /* Complain about tasks blocking the grace period. */
397 rcu_for_each_leaf_node(rnp)
398 rcu_print_detail_task_stall_rnp(rnp);
399 } else {
400 if (rcu_seq_current(&rcu_state.gp_seq) != gp_seq) {
401 pr_err("INFO: Stall ended before state dump start\n");
402 } else {
403 j = jiffies;
404 gpa = READ_ONCE(rcu_state.gp_activity);
405 pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n",
406 rcu_state.name, j - gpa, j, gpa,
407 READ_ONCE(jiffies_till_next_fqs),
408 rcu_get_root()->qsmask);
409 /* In this case, the current CPU might be at fault. */
410 sched_show_task(current);
411 }
412 }
413 /* Rewrite if needed in case of slow consoles. */
414 if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall)))
415 WRITE_ONCE(rcu_state.jiffies_stall,
416 jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
417
418 rcu_check_gp_kthread_starvation();
419
420 panic_on_rcu_stall();
421
422 rcu_force_quiescent_state(); /* Kick them all. */
423}
424
425static void print_cpu_stall(void)
426{
427 int cpu;
428 unsigned long flags;
429 struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
430 struct rcu_node *rnp = rcu_get_root();
431 long totqlen = 0;
432
433 /* Kick and suppress, if so configured. */
434 rcu_stall_kick_kthreads();
435 if (rcu_cpu_stall_suppress)
436 return;
437
438 /*
439 * OK, time to rat on ourselves...
440 * See Documentation/RCU/stallwarn.txt for info on how to debug
441 * RCU CPU stall warnings.
442 */
443 pr_err("INFO: %s self-detected stall on CPU\n", rcu_state.name);
444 raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags);
445 print_cpu_stall_info(smp_processor_id());
446 raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags);
447 for_each_possible_cpu(cpu)
448 totqlen += rcu_get_n_cbs_cpu(cpu);
449 pr_cont("\t(t=%lu jiffies g=%ld q=%lu)\n",
450 jiffies - rcu_state.gp_start,
451 (long)rcu_seq_current(&rcu_state.gp_seq), totqlen);
452
453 rcu_check_gp_kthread_starvation();
454
455 rcu_dump_cpu_stacks();
456
457 raw_spin_lock_irqsave_rcu_node(rnp, flags);
458 /* Rewrite if needed in case of slow consoles. */
459 if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall)))
460 WRITE_ONCE(rcu_state.jiffies_stall,
461 jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
462 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
463
464 panic_on_rcu_stall();
465
466 /*
467 * Attempt to revive the RCU machinery by forcing a context switch.
468 *
469 * A context switch would normally allow the RCU state machine to make
470 * progress and it could be we're stuck in kernel space without context
471 * switches for an entirely unreasonable amount of time.
472 */
473 set_tsk_need_resched(current);
474 set_preempt_need_resched();
475}
476
477static void check_cpu_stall(struct rcu_data *rdp)
478{
479 unsigned long gs1;
480 unsigned long gs2;
481 unsigned long gps;
482 unsigned long j;
483 unsigned long jn;
484 unsigned long js;
485 struct rcu_node *rnp;
486
487 if ((rcu_cpu_stall_suppress && !rcu_kick_kthreads) ||
488 !rcu_gp_in_progress())
489 return;
490 rcu_stall_kick_kthreads();
491 j = jiffies;
492
493 /*
494 * Lots of memory barriers to reject false positives.
495 *
496 * The idea is to pick up rcu_state.gp_seq, then
497 * rcu_state.jiffies_stall, then rcu_state.gp_start, and finally
498 * another copy of rcu_state.gp_seq. These values are updated in
499 * the opposite order with memory barriers (or equivalent) during
500 * grace-period initialization and cleanup. Now, a false positive
501 * can occur if we get a new value of rcu_state.gp_start and an old
502 * value of rcu_state.jiffies_stall. But given the memory barriers,
503 * the only way that this can happen is if one grace period ends
504 * and another starts between these two fetches. This is detected
505 * by comparing the second fetch of rcu_state.gp_seq with the
506 * previous fetch from rcu_state.gp_seq.
507 *
508 * Given this check, comparisons of jiffies, rcu_state.jiffies_stall,
509 * and rcu_state.gp_start suffice to forestall false positives.
510 */
511 gs1 = READ_ONCE(rcu_state.gp_seq);
512 smp_rmb(); /* Pick up ->gp_seq first... */
513 js = READ_ONCE(rcu_state.jiffies_stall);
514 smp_rmb(); /* ...then ->jiffies_stall before the rest... */
515 gps = READ_ONCE(rcu_state.gp_start);
516 smp_rmb(); /* ...and finally ->gp_start before ->gp_seq again. */
517 gs2 = READ_ONCE(rcu_state.gp_seq);
518 if (gs1 != gs2 ||
519 ULONG_CMP_LT(j, js) ||
520 ULONG_CMP_GE(gps, js))
521 return; /* No stall or GP completed since entering function. */
522 rnp = rdp->mynode;
523 jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
524 if (rcu_gp_in_progress() &&
525 (READ_ONCE(rnp->qsmask) & rdp->grpmask) &&
526 cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) {
527
528 /* We haven't checked in, so go dump stack. */
529 print_cpu_stall();
530
531 } else if (rcu_gp_in_progress() &&
532 ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) &&
533 cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) {
534
535 /* They had a few time units to dump stack, so complain. */
536 print_other_cpu_stall(gs2);
537 }
538}
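The cmpxchg() on ->jiffies_stall is what keeps each stall report single-shot: every CPU that notices the overdue grace period races to advance the timestamp from js to jn, but only the CPU whose cmpxchg() returns the old value js wins and gets to print. Reduced to a hypothetical helper (not code from this patch, same idea):

	/* Returns true for exactly one caller per stall interval. */
	static bool rcu_stall_report_ticket(unsigned long js, unsigned long jn)
	{
		return cmpxchg(&rcu_state.jiffies_stall, js, jn) == js;
	}

Every loser observes jn (or an even newer value) in ->jiffies_stall and quietly backs off until the next interval expires.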
539
540//////////////////////////////////////////////////////////////////////////////
541//
542// RCU forward-progress mechanisms, including that of callback invocation.
543
544
545/*
546 * Show the state of the grace-period kthreads.
547 */
548void show_rcu_gp_kthreads(void)
549{
550 int cpu;
551 unsigned long j;
552 unsigned long ja;
553 unsigned long jr;
554 unsigned long jw;
555 struct rcu_data *rdp;
556 struct rcu_node *rnp;
557
558 j = jiffies;
559 ja = j - READ_ONCE(rcu_state.gp_activity);
560 jr = j - READ_ONCE(rcu_state.gp_req_activity);
561 jw = j - READ_ONCE(rcu_state.gp_wake_time);
562 pr_info("%s: wait state: %s(%d) ->state: %#lx delta ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_flags %#x\n",
563 rcu_state.name, gp_state_getname(rcu_state.gp_state),
564 rcu_state.gp_state,
565 rcu_state.gp_kthread ? rcu_state.gp_kthread->state : 0x1ffffL,
566 ja, jr, jw, (long)READ_ONCE(rcu_state.gp_wake_seq),
567 (long)READ_ONCE(rcu_state.gp_seq),
568 (long)READ_ONCE(rcu_get_root()->gp_seq_needed),
569 READ_ONCE(rcu_state.gp_flags));
570 rcu_for_each_node_breadth_first(rnp) {
571 if (ULONG_CMP_GE(rcu_state.gp_seq, rnp->gp_seq_needed))
572 continue;
573 pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld\n",
574 rnp->grplo, rnp->grphi, (long)rnp->gp_seq,
575 (long)rnp->gp_seq_needed);
576 if (!rcu_is_leaf_node(rnp))
577 continue;
578 for_each_leaf_node_possible_cpu(rnp, cpu) {
579 rdp = per_cpu_ptr(&rcu_data, cpu);
580 if (rdp->gpwrap ||
581 ULONG_CMP_GE(rcu_state.gp_seq,
582 rdp->gp_seq_needed))
583 continue;
584 pr_info("\tcpu %d ->gp_seq_needed %ld\n",
585 cpu, (long)rdp->gp_seq_needed);
586 }
587 }
588 /* sched_show_task(rcu_state.gp_kthread); */
589}
590EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads);
591
592/*
593 * This function checks for grace-period requests that fail to motivate
594 * RCU to come out of its idle mode.
595 */
596static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp,
597 const unsigned long gpssdelay)
598{
599 unsigned long flags;
600 unsigned long j;
601 struct rcu_node *rnp_root = rcu_get_root();
602 static atomic_t warned = ATOMIC_INIT(0);
603
604 if (!IS_ENABLED(CONFIG_PROVE_RCU) || rcu_gp_in_progress() ||
605 ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed))
606 return;
607 j = jiffies; /* Expensive access, and in common case don't get here. */
608 if (time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) ||
609 time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) ||
610 atomic_read(&warned))
611 return;
612
613 raw_spin_lock_irqsave_rcu_node(rnp, flags);
614 j = jiffies;
615 if (rcu_gp_in_progress() ||
616 ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) ||
617 time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) ||
618 time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) ||
619 atomic_read(&warned)) {
620 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
621 return;
622 }
623 /* Hold onto the leaf lock to make others see warned==1. */
624
625 if (rnp_root != rnp)
626 raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */
627 j = jiffies;
628 if (rcu_gp_in_progress() ||
629 ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) ||
630 time_before(j, rcu_state.gp_req_activity + gpssdelay) ||
631 time_before(j, rcu_state.gp_activity + gpssdelay) ||
632 atomic_xchg(&warned, 1)) {
633 raw_spin_unlock_rcu_node(rnp_root); /* irqs remain disabled. */
634 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
635 return;
636 }
637 WARN_ON(1);
638 if (rnp_root != rnp)
639 raw_spin_unlock_rcu_node(rnp_root);
640 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
641 show_rcu_gp_kthreads();
642}
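The structure above is a warn-at-most-once pattern: the lockless checks filter the common case cheaply, the leaf and root rcu_node locks serialize the slow path, and atomic_xchg() ensures that even if several CPUs reach the final check concurrently, only the first one trips the WARN_ON(). Its essence, as a stand-alone sketch with a hypothetical wrapper name:

	static void warn_once_racy(void)
	{
		static atomic_t warned = ATOMIC_INIT(0);

		if (atomic_read(&warned))	/* cheap, unordered fast path */
			return;
		if (atomic_xchg(&warned, 1))	/* exactly one caller sees 0 here */
			return;
		WARN_ON(1);			/* the single winner complains */
	}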
643
644/*
645 * Do a forward-progress check for rcutorture. This is normally invoked
646 * due to an OOM event. The argument "j" gives the time period during
647 * which rcutorture would like progress to have been made.
648 */
649void rcu_fwd_progress_check(unsigned long j)
650{
651 unsigned long cbs;
652 int cpu;
653 unsigned long max_cbs = 0;
654 int max_cpu = -1;
655 struct rcu_data *rdp;
656
657 if (rcu_gp_in_progress()) {
658 pr_info("%s: GP age %lu jiffies\n",
659 __func__, jiffies - rcu_state.gp_start);
660 show_rcu_gp_kthreads();
661 } else {
662 pr_info("%s: Last GP end %lu jiffies ago\n",
663 __func__, jiffies - rcu_state.gp_end);
664 preempt_disable();
665 rdp = this_cpu_ptr(&rcu_data);
666 rcu_check_gp_start_stall(rdp->mynode, rdp, j);
667 preempt_enable();
668 }
669 for_each_possible_cpu(cpu) {
670 cbs = rcu_get_n_cbs_cpu(cpu);
671 if (!cbs)
672 continue;
673 if (max_cpu < 0)
674 pr_info("%s: callbacks", __func__);
675 pr_cont(" %d: %lu", cpu, cbs);
676 if (cbs <= max_cbs)
677 continue;
678 max_cbs = cbs;
679 max_cpu = cpu;
680 }
681 if (max_cpu >= 0)
682 pr_cont("\n");
683}
684EXPORT_SYMBOL_GPL(rcu_fwd_progress_check);
685
686/* Commandeer a sysrq key to dump RCU's tree. */
687static bool sysrq_rcu;
688module_param(sysrq_rcu, bool, 0444);
689
690/* Dump grace-period-request information due to commandeered sysrq. */
691static void sysrq_show_rcu(int key)
692{
693 show_rcu_gp_kthreads();
694}
695
696static struct sysrq_key_op sysrq_rcudump_op = {
697 .handler = sysrq_show_rcu,
698 .help_msg = "show-rcu(y)",
699 .action_msg = "Show RCU tree",
700 .enable_mask = SYSRQ_ENABLE_DUMP,
701};
702
703static int __init rcu_sysrq_init(void)
704{
705 if (sysrq_rcu)
706 return register_sysrq_key('y', &sysrq_rcudump_op);
707 return 0;
708}
709early_initcall(rcu_sysrq_init);
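Because this file is #included into tree.c, the boot parameter presumably takes the same rcutree. prefix as the other parameters defined there (an inference, not something this diff spells out). With it enabled, the dump is requested through the standard sysrq mechanism, for example:

	rcutree.sysrq_rcu=1		(kernel command line, registers the 'y' key)
	echo y > /proc/sysrq-trigger	(at run time, prints the RCU tree via show_rcu_gp_kthreads())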
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index cbaa976c5945..c3bf44ba42e5 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -424,68 +424,11 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read);
424#endif 424#endif
425 425
426#ifdef CONFIG_RCU_STALL_COMMON 426#ifdef CONFIG_RCU_STALL_COMMON
427
428#ifdef CONFIG_PROVE_RCU
429#define RCU_STALL_DELAY_DELTA (5 * HZ)
430#else
431#define RCU_STALL_DELAY_DELTA 0
432#endif
433
434int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ 427int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
435EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress); 428EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress);
436static int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
437
438module_param(rcu_cpu_stall_suppress, int, 0644); 429module_param(rcu_cpu_stall_suppress, int, 0644);
430int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
439module_param(rcu_cpu_stall_timeout, int, 0644); 431module_param(rcu_cpu_stall_timeout, int, 0644);
440
441int rcu_jiffies_till_stall_check(void)
442{
443 int till_stall_check = READ_ONCE(rcu_cpu_stall_timeout);
444
445 /*
446 * Limit check must be consistent with the Kconfig limits
447 * for CONFIG_RCU_CPU_STALL_TIMEOUT.
448 */
449 if (till_stall_check < 3) {
450 WRITE_ONCE(rcu_cpu_stall_timeout, 3);
451 till_stall_check = 3;
452 } else if (till_stall_check > 300) {
453 WRITE_ONCE(rcu_cpu_stall_timeout, 300);
454 till_stall_check = 300;
455 }
456 return till_stall_check * HZ + RCU_STALL_DELAY_DELTA;
457}
458EXPORT_SYMBOL_GPL(rcu_jiffies_till_stall_check);
459
460void rcu_sysrq_start(void)
461{
462 if (!rcu_cpu_stall_suppress)
463 rcu_cpu_stall_suppress = 2;
464}
465
466void rcu_sysrq_end(void)
467{
468 if (rcu_cpu_stall_suppress == 2)
469 rcu_cpu_stall_suppress = 0;
470}
471
472static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
473{
474 rcu_cpu_stall_suppress = 1;
475 return NOTIFY_DONE;
476}
477
478static struct notifier_block rcu_panic_block = {
479 .notifier_call = rcu_panic,
480};
481
482static int __init check_cpu_stall_init(void)
483{
484 atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
485 return 0;
486}
487early_initcall(check_cpu_stall_init);
488
489#endif /* #ifdef CONFIG_RCU_STALL_COMMON */ 432#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
490 433
491#ifdef CONFIG_TASKS_RCU 434#ifdef CONFIG_TASKS_RCU
diff --git a/kernel/resource.c b/kernel/resource.c
index 92190f62ebc5..8c15f846e8ef 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -520,21 +520,20 @@ EXPORT_SYMBOL_GPL(page_is_ram);
520int region_intersects(resource_size_t start, size_t size, unsigned long flags, 520int region_intersects(resource_size_t start, size_t size, unsigned long flags,
521 unsigned long desc) 521 unsigned long desc)
522{ 522{
523 resource_size_t end = start + size - 1; 523 struct resource res;
524 int type = 0; int other = 0; 524 int type = 0; int other = 0;
525 struct resource *p; 525 struct resource *p;
526 526
527 res.start = start;
528 res.end = start + size - 1;
529
527 read_lock(&resource_lock); 530 read_lock(&resource_lock);
528 for (p = iomem_resource.child; p ; p = p->sibling) { 531 for (p = iomem_resource.child; p ; p = p->sibling) {
529 bool is_type = (((p->flags & flags) == flags) && 532 bool is_type = (((p->flags & flags) == flags) &&
530 ((desc == IORES_DESC_NONE) || 533 ((desc == IORES_DESC_NONE) ||
531 (desc == p->desc))); 534 (desc == p->desc)));
532 535
533 if (start >= p->start && start <= p->end) 536 if (resource_overlaps(p, &res))
534 is_type ? type++ : other++;
535 if (end >= p->start && end <= p->end)
536 is_type ? type++ : other++;
537 if (p->start >= start && p->end <= end)
538 is_type ? type++ : other++; 537 is_type ? type++ : other++;
539 } 538 }
540 read_unlock(&resource_lock); 539 read_unlock(&resource_lock);
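resource_overlaps() replaces the three open-coded range tests with the standard closed-interval overlap predicate from include/linux/ioport.h, which is essentially:

	/* Two closed ranges [r1->start, r1->end] and [r2->start, r2->end] overlap iff: */
	static inline bool resource_overlaps(struct resource *r1, struct resource *r2)
	{
		return r1->start <= r2->end && r1->end >= r2->start;
	}

so each child resource is now counted at most once per walk instead of once per matching sub-condition.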
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 25e9a7b60eba..9424ee90589e 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -254,8 +254,7 @@ static int rseq_ip_fixup(struct pt_regs *regs)
254 * - signal delivery, 254 * - signal delivery,
255 * and return to user-space. 255 * and return to user-space.
256 * 256 *
257 * This is how we can ensure that the entire rseq critical section, 257 * This is how we can ensure that the entire rseq critical section
258 * consisting of both the C part and the assembly instruction sequence,
259 * will issue the commit instruction only if executed atomically with 258 * will issue the commit instruction only if executed atomically with
260 * respect to other threads scheduled on the same CPU, and with respect 259 * respect to other threads scheduled on the same CPU, and with respect
261 * to signal handlers. 260 * to signal handlers.
@@ -314,7 +313,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
314 /* Unregister rseq for current thread. */ 313 /* Unregister rseq for current thread. */
315 if (current->rseq != rseq || !current->rseq) 314 if (current->rseq != rseq || !current->rseq)
316 return -EINVAL; 315 return -EINVAL;
317 if (current->rseq_len != rseq_len) 316 if (rseq_len != sizeof(*rseq))
318 return -EINVAL; 317 return -EINVAL;
319 if (current->rseq_sig != sig) 318 if (current->rseq_sig != sig)
320 return -EPERM; 319 return -EPERM;
@@ -322,7 +321,6 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
322 if (ret) 321 if (ret)
323 return ret; 322 return ret;
324 current->rseq = NULL; 323 current->rseq = NULL;
325 current->rseq_len = 0;
326 current->rseq_sig = 0; 324 current->rseq_sig = 0;
327 return 0; 325 return 0;
328 } 326 }
@@ -336,7 +334,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
336 * the provided address differs from the prior 334 * the provided address differs from the prior
337 * one. 335 * one.
338 */ 336 */
339 if (current->rseq != rseq || current->rseq_len != rseq_len) 337 if (current->rseq != rseq || rseq_len != sizeof(*rseq))
340 return -EINVAL; 338 return -EINVAL;
341 if (current->rseq_sig != sig) 339 if (current->rseq_sig != sig)
342 return -EPERM; 340 return -EPERM;
@@ -354,7 +352,6 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
354 if (!access_ok(rseq, rseq_len)) 352 if (!access_ok(rseq, rseq_len))
355 return -EFAULT; 353 return -EFAULT;
356 current->rseq = rseq; 354 current->rseq = rseq;
357 current->rseq_len = rseq_len;
358 current->rseq_sig = sig; 355 current->rseq_sig = sig;
359 /* 356 /*
360 * If rseq was previously inactive, and has just been 357 * If rseq was previously inactive, and has just been
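With rseq_len now required to equal sizeof(*rseq), user space must pass exactly the size of the shared struct rseq at both registration and unregistration. A minimal user-space sketch of registration (the signature constant and thread-local name are placeholders; a libc that registers rseq on its own would make this return -1 with errno set to EBUSY):

	#include <linux/rseq.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static __thread struct rseq my_rseq;	/* uapi struct is already 32-byte aligned */
	#define MY_RSEQ_SIG	0x53053053	/* placeholder signature */

	static int my_rseq_register(void)
	{
		/* Any length other than sizeof(struct rseq) now gets -EINVAL. */
		return syscall(__NR_rseq, &my_rseq, sizeof(my_rseq), 0, MY_RSEQ_SIG);
	}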
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ead464a0f2e5..102dfcf0a29a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -792,10 +792,14 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags)
792 rq->nr_uninterruptible--; 792 rq->nr_uninterruptible--;
793 793
794 enqueue_task(rq, p, flags); 794 enqueue_task(rq, p, flags);
795
796 p->on_rq = TASK_ON_RQ_QUEUED;
795} 797}
796 798
797void deactivate_task(struct rq *rq, struct task_struct *p, int flags) 799void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
798{ 800{
801 p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING;
802
799 if (task_contributes_to_load(p)) 803 if (task_contributes_to_load(p))
800 rq->nr_uninterruptible++; 804 rq->nr_uninterruptible++;
801 805
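Editorial aside on the hunk above: the TASK_ON_RQ_QUEUED / TASK_ON_RQ_MIGRATING bookkeeping moves into activate_task() and deactivate_task() themselves, which is why the later hunks in this file and in fair.c drop the open-coded p->on_rq assignments at every call site. A simplified standalone model of the refactor, with values chosen to match the kernel's definitions but otherwise self-contained:

#include <stdio.h>

#define TASK_ON_RQ_QUEUED     1
#define TASK_ON_RQ_MIGRATING  2
#define DEQUEUE_SLEEP         0x01

struct task { int on_rq; };

static void activate_task(struct task *p)
{
        /* ...enqueue into the runqueue... */
        p->on_rq = TASK_ON_RQ_QUEUED;            /* previously done by every caller */
}

static void deactivate_task(struct task *p, int flags)
{
        /* A sleeping task is simply off the rq; otherwise it is mid-migration. */
        p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING;
        /* ...dequeue from the runqueue... */
}

int main(void)
{
        struct task p = { 0 };

        activate_task(&p);
        printf("after activate:    on_rq=%d\n", p.on_rq);   /* 1: queued    */
        deactivate_task(&p, 0);
        printf("after migrate-off: on_rq=%d\n", p.on_rq);   /* 2: migrating */
        deactivate_task(&p, DEQUEUE_SLEEP);
        printf("after sleep:       on_rq=%d\n", p.on_rq);   /* 0: off rq    */
        return 0;
}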
@@ -920,7 +924,7 @@ static inline bool is_per_cpu_kthread(struct task_struct *p)
920} 924}
921 925
922/* 926/*
923 * Per-CPU kthreads are allowed to run on !actie && online CPUs, see 927 * Per-CPU kthreads are allowed to run on !active && online CPUs, see
924 * __set_cpus_allowed_ptr() and select_fallback_rq(). 928 * __set_cpus_allowed_ptr() and select_fallback_rq().
925 */ 929 */
926static inline bool is_cpu_allowed(struct task_struct *p, int cpu) 930static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
@@ -1151,7 +1155,6 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
1151 /* Need help from migration thread: drop lock and wait. */ 1155 /* Need help from migration thread: drop lock and wait. */
1152 task_rq_unlock(rq, p, &rf); 1156 task_rq_unlock(rq, p, &rf);
1153 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 1157 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
1154 tlb_migrate_finish(p->mm);
1155 return 0; 1158 return 0;
1156 } else if (task_on_rq_queued(p)) { 1159 } else if (task_on_rq_queued(p)) {
1157 /* 1160 /*
@@ -1237,11 +1240,9 @@ static void __migrate_swap_task(struct task_struct *p, int cpu)
1237 rq_pin_lock(src_rq, &srf); 1240 rq_pin_lock(src_rq, &srf);
1238 rq_pin_lock(dst_rq, &drf); 1241 rq_pin_lock(dst_rq, &drf);
1239 1242
1240 p->on_rq = TASK_ON_RQ_MIGRATING;
1241 deactivate_task(src_rq, p, 0); 1243 deactivate_task(src_rq, p, 0);
1242 set_task_cpu(p, cpu); 1244 set_task_cpu(p, cpu);
1243 activate_task(dst_rq, p, 0); 1245 activate_task(dst_rq, p, 0);
1244 p->on_rq = TASK_ON_RQ_QUEUED;
1245 check_preempt_curr(dst_rq, p, 0); 1246 check_preempt_curr(dst_rq, p, 0);
1246 1247
1247 rq_unpin_lock(dst_rq, &drf); 1248 rq_unpin_lock(dst_rq, &drf);
@@ -1681,16 +1682,6 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
1681 __schedstat_inc(p->se.statistics.nr_wakeups_sync); 1682 __schedstat_inc(p->se.statistics.nr_wakeups_sync);
1682} 1683}
1683 1684
1684static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
1685{
1686 activate_task(rq, p, en_flags);
1687 p->on_rq = TASK_ON_RQ_QUEUED;
1688
1689 /* If a worker is waking up, notify the workqueue: */
1690 if (p->flags & PF_WQ_WORKER)
1691 wq_worker_waking_up(p, cpu_of(rq));
1692}
1693
1694/* 1685/*
1695 * Mark the task runnable and perform wakeup-preemption. 1686 * Mark the task runnable and perform wakeup-preemption.
1696 */ 1687 */
@@ -1742,7 +1733,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
1742 en_flags |= ENQUEUE_MIGRATED; 1733 en_flags |= ENQUEUE_MIGRATED;
1743#endif 1734#endif
1744 1735
1745 ttwu_activate(rq, p, en_flags); 1736 activate_task(rq, p, en_flags);
1746 ttwu_do_wakeup(rq, p, wake_flags, rf); 1737 ttwu_do_wakeup(rq, p, wake_flags, rf);
1747} 1738}
1748 1739
@@ -2107,56 +2098,6 @@ out:
2107} 2098}
2108 2099
2109/** 2100/**
2110 * try_to_wake_up_local - try to wake up a local task with rq lock held
2111 * @p: the thread to be awakened
2112 * @rf: request-queue flags for pinning
2113 *
2114 * Put @p on the run-queue if it's not already there. The caller must
2115 * ensure that this_rq() is locked, @p is bound to this_rq() and not
2116 * the current task.
2117 */
2118static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
2119{
2120 struct rq *rq = task_rq(p);
2121
2122 if (WARN_ON_ONCE(rq != this_rq()) ||
2123 WARN_ON_ONCE(p == current))
2124 return;
2125
2126 lockdep_assert_held(&rq->lock);
2127
2128 if (!raw_spin_trylock(&p->pi_lock)) {
2129 /*
2130 * This is OK, because current is on_cpu, which avoids it being
2131 * picked for load-balance and preemption/IRQs are still
2132 * disabled avoiding further scheduler activity on it and we've
2133 * not yet picked a replacement task.
2134 */
2135 rq_unlock(rq, rf);
2136 raw_spin_lock(&p->pi_lock);
2137 rq_relock(rq, rf);
2138 }
2139
2140 if (!(p->state & TASK_NORMAL))
2141 goto out;
2142
2143 trace_sched_waking(p);
2144
2145 if (!task_on_rq_queued(p)) {
2146 if (p->in_iowait) {
2147 delayacct_blkio_end(p);
2148 atomic_dec(&rq->nr_iowait);
2149 }
2150 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK);
2151 }
2152
2153 ttwu_do_wakeup(rq, p, 0, rf);
2154 ttwu_stat(p, smp_processor_id(), 0);
2155out:
2156 raw_spin_unlock(&p->pi_lock);
2157}
2158
2159/**
2160 * wake_up_process - Wake up a specific process 2101 * wake_up_process - Wake up a specific process
2161 * @p: The process to be woken up. 2102 * @p: The process to be woken up.
2162 * 2103 *
@@ -2467,7 +2408,6 @@ void wake_up_new_task(struct task_struct *p)
2467 post_init_entity_util_avg(p); 2408 post_init_entity_util_avg(p);
2468 2409
2469 activate_task(rq, p, ENQUEUE_NOCLOCK); 2410 activate_task(rq, p, ENQUEUE_NOCLOCK);
2470 p->on_rq = TASK_ON_RQ_QUEUED;
2471 trace_sched_wakeup_new(p); 2411 trace_sched_wakeup_new(p);
2472 check_preempt_curr(rq, p, WF_FORK); 2412 check_preempt_curr(rq, p, WF_FORK);
2473#ifdef CONFIG_SMP 2413#ifdef CONFIG_SMP
@@ -3466,25 +3406,11 @@ static void __sched notrace __schedule(bool preempt)
3466 prev->state = TASK_RUNNING; 3406 prev->state = TASK_RUNNING;
3467 } else { 3407 } else {
3468 deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK); 3408 deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
3469 prev->on_rq = 0;
3470 3409
3471 if (prev->in_iowait) { 3410 if (prev->in_iowait) {
3472 atomic_inc(&rq->nr_iowait); 3411 atomic_inc(&rq->nr_iowait);
3473 delayacct_blkio_start(); 3412 delayacct_blkio_start();
3474 } 3413 }
3475
3476 /*
3477 * If a worker went to sleep, notify and ask workqueue
3478 * whether it wants to wake up a task to maintain
3479 * concurrency.
3480 */
3481 if (prev->flags & PF_WQ_WORKER) {
3482 struct task_struct *to_wakeup;
3483
3484 to_wakeup = wq_worker_sleeping(prev);
3485 if (to_wakeup)
3486 try_to_wake_up_local(to_wakeup, &rf);
3487 }
3488 } 3414 }
3489 switch_count = &prev->nvcsw; 3415 switch_count = &prev->nvcsw;
3490 } 3416 }
@@ -3544,6 +3470,20 @@ static inline void sched_submit_work(struct task_struct *tsk)
3544{ 3470{
3545 if (!tsk->state || tsk_is_pi_blocked(tsk)) 3471 if (!tsk->state || tsk_is_pi_blocked(tsk))
3546 return; 3472 return;
3473
3474 /*
3475 * If a worker went to sleep, notify and ask workqueue whether
3476 * it wants to wake up a task to maintain concurrency.
3477 * As this function is called inside the schedule() context,
3478 * we disable preemption to avoid it calling schedule() again
3479 * in the possible wakeup of a kworker.
3480 */
3481 if (tsk->flags & PF_WQ_WORKER) {
3482 preempt_disable();
3483 wq_worker_sleeping(tsk);
3484 preempt_enable_no_resched();
3485 }
3486
3547 /* 3487 /*
3548 * If we are going to sleep and we have plugged IO queued, 3488 * If we are going to sleep and we have plugged IO queued,
3549 * make sure to submit it to avoid deadlocks. 3489 * make sure to submit it to avoid deadlocks.
@@ -3552,6 +3492,12 @@ static inline void sched_submit_work(struct task_struct *tsk)
3552 blk_schedule_flush_plug(tsk); 3492 blk_schedule_flush_plug(tsk);
3553} 3493}
3554 3494
3495static void sched_update_worker(struct task_struct *tsk)
3496{
3497 if (tsk->flags & PF_WQ_WORKER)
3498 wq_worker_running(tsk);
3499}
3500
3555asmlinkage __visible void __sched schedule(void) 3501asmlinkage __visible void __sched schedule(void)
3556{ 3502{
3557 struct task_struct *tsk = current; 3503 struct task_struct *tsk = current;
@@ -3562,6 +3508,7 @@ asmlinkage __visible void __sched schedule(void)
3562 __schedule(false); 3508 __schedule(false);
3563 sched_preempt_enable_no_resched(); 3509 sched_preempt_enable_no_resched();
3564 } while (need_resched()); 3510 } while (need_resched());
3511 sched_update_worker(tsk);
3565} 3512}
3566EXPORT_SYMBOL(schedule); 3513EXPORT_SYMBOL(schedule);
3567 3514
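Editorial aside on the schedule() hunks above: the workqueue notification is pulled out of __schedule(). sched_submit_work() now calls wq_worker_sleeping() before the task blocks (with preemption disabled so a possible kworker wakeup cannot recurse into schedule()), and the new sched_update_worker() calls wq_worker_running() once the task runs again. A simplified standalone model of that bracket; PF_WQ_WORKER and the wq_worker_*() hooks are stubbed stand-ins for the kernel symbols of the same names, and the preemption detail is omitted.

#include <stdio.h>

#define PF_WQ_WORKER 0x20   /* stand-in for the kernel flag of the same name */

struct task { unsigned int flags; };

static void wq_worker_sleeping(struct task *t) { puts("workqueue: worker blocked");  }
static void wq_worker_running(struct task *t)  { puts("workqueue: worker runnable"); }
static void __schedule_stub(void)              { puts("schedule: switched away and back"); }

static void schedule(struct task *tsk)
{
        if (tsk->flags & PF_WQ_WORKER)
                wq_worker_sleeping(tsk);   /* may wake another worker for concurrency */

        __schedule_stub();

        if (tsk->flags & PF_WQ_WORKER)
                wq_worker_running(tsk);    /* re-account this worker as running */
}

int main(void)
{
        struct task kworker = { .flags = PF_WQ_WORKER };

        schedule(&kworker);
        return 0;
}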
@@ -5918,7 +5865,7 @@ void __init sched_init_smp(void)
5918 5865
5919static int __init migration_init(void) 5866static int __init migration_init(void)
5920{ 5867{
5921 sched_rq_cpu_starting(smp_processor_id()); 5868 sched_cpu_starting(smp_processor_id());
5922 return 0; 5869 return 0;
5923} 5870}
5924early_initcall(migration_init); 5871early_initcall(migration_init);
@@ -6559,6 +6506,8 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset)
6559static int cpu_shares_write_u64(struct cgroup_subsys_state *css, 6506static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
6560 struct cftype *cftype, u64 shareval) 6507 struct cftype *cftype, u64 shareval)
6561{ 6508{
6509 if (shareval > scale_load_down(ULONG_MAX))
6510 shareval = MAX_SHARES;
6562 return sched_group_set_shares(css_tg(css), scale_load(shareval)); 6511 return sched_group_set_shares(css_tg(css), scale_load(shareval));
6563} 6512}
6564 6513
@@ -6574,7 +6523,7 @@ static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
6574static DEFINE_MUTEX(cfs_constraints_mutex); 6523static DEFINE_MUTEX(cfs_constraints_mutex);
6575 6524
6576const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */ 6525const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
6577const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ 6526static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
6578 6527
6579static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); 6528static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
6580 6529
@@ -6654,20 +6603,22 @@ out_unlock:
6654 return ret; 6603 return ret;
6655} 6604}
6656 6605
6657int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) 6606static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
6658{ 6607{
6659 u64 quota, period; 6608 u64 quota, period;
6660 6609
6661 period = ktime_to_ns(tg->cfs_bandwidth.period); 6610 period = ktime_to_ns(tg->cfs_bandwidth.period);
6662 if (cfs_quota_us < 0) 6611 if (cfs_quota_us < 0)
6663 quota = RUNTIME_INF; 6612 quota = RUNTIME_INF;
6664 else 6613 else if ((u64)cfs_quota_us <= U64_MAX / NSEC_PER_USEC)
6665 quota = (u64)cfs_quota_us * NSEC_PER_USEC; 6614 quota = (u64)cfs_quota_us * NSEC_PER_USEC;
6615 else
6616 return -EINVAL;
6666 6617
6667 return tg_set_cfs_bandwidth(tg, period, quota); 6618 return tg_set_cfs_bandwidth(tg, period, quota);
6668} 6619}
6669 6620
6670long tg_get_cfs_quota(struct task_group *tg) 6621static long tg_get_cfs_quota(struct task_group *tg)
6671{ 6622{
6672 u64 quota_us; 6623 u64 quota_us;
6673 6624
@@ -6680,17 +6631,20 @@ long tg_get_cfs_quota(struct task_group *tg)
6680 return quota_us; 6631 return quota_us;
6681} 6632}
6682 6633
6683int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) 6634static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
6684{ 6635{
6685 u64 quota, period; 6636 u64 quota, period;
6686 6637
6638 if ((u64)cfs_period_us > U64_MAX / NSEC_PER_USEC)
6639 return -EINVAL;
6640
6687 period = (u64)cfs_period_us * NSEC_PER_USEC; 6641 period = (u64)cfs_period_us * NSEC_PER_USEC;
6688 quota = tg->cfs_bandwidth.quota; 6642 quota = tg->cfs_bandwidth.quota;
6689 6643
6690 return tg_set_cfs_bandwidth(tg, period, quota); 6644 return tg_set_cfs_bandwidth(tg, period, quota);
6691} 6645}
6692 6646
6693long tg_get_cfs_period(struct task_group *tg) 6647static long tg_get_cfs_period(struct task_group *tg)
6694{ 6648{
6695 u64 cfs_period_us; 6649 u64 cfs_period_us;
6696 6650
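Editorial aside on the two hunks above: tg_set_cfs_quota() and tg_set_cfs_period() (and the rt.c setters later in this section) now reject microsecond values whose conversion to nanoseconds would wrap a u64, checking by dividing U64_MAX rather than by multiplying. A standalone sketch of that guard:

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_USEC 1000ULL

/* Returns 0 and stores the product, or -1 if us * NSEC_PER_USEC would overflow. */
static int us_to_ns_checked(uint64_t us, uint64_t *ns)
{
        if (us > UINT64_MAX / NSEC_PER_USEC)
                return -1;                      /* the -EINVAL case above */
        *ns = us * NSEC_PER_USEC;
        return 0;
}

int main(void)
{
        uint64_t ns;

        if (us_to_ns_checked(100000, &ns) == 0)           /* 100 ms */
                printf("100000 us = %llu ns\n", (unsigned long long)ns);

        if (us_to_ns_checked(UINT64_MAX / 2, &ns) != 0)   /* clearly too large */
                puts("rejected: conversion would overflow");
        return 0;
}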
@@ -6998,7 +6952,7 @@ static int __maybe_unused cpu_period_quota_parse(char *buf,
6998{ 6952{
6999 char tok[21]; /* U64_MAX */ 6953 char tok[21]; /* U64_MAX */
7000 6954
7001 if (!sscanf(buf, "%s %llu", tok, periodp)) 6955 if (sscanf(buf, "%20s %llu", tok, periodp) < 1)
7002 return -EINVAL; 6956 return -EINVAL;
7003 6957
7004 *periodp *= NSEC_PER_USEC; 6958 *periodp *= NSEC_PER_USEC;
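Editorial aside on the cpu_period_quota_parse() fix above: the %s conversion is bounded to the 20 characters that the tok[21] buffer can hold, and a return value below 1 (no conversions, including EOF on empty input) is treated as failure. A standalone sketch of the same parsing pattern; the function name and error convention here are illustrative only.

#include <stdio.h>

static int parse_period_quota(const char *buf, unsigned long long *periodp,
                              char *tok, unsigned long toklen)
{
        /* tok must hold at least 21 bytes, matching "%20s" below. */
        if (toklen < 21)
                return -1;
        if (sscanf(buf, "%20s %llu", tok, periodp) < 1)
                return -1;      /* neither field could be converted */
        return 0;
}

int main(void)
{
        char tok[21];
        unsigned long long period = 0;

        if (parse_period_quota("max 100000", &period, tok, sizeof(tok)) == 0)
                printf("tok=\"%s\" period=%llu\n", tok, period);

        if (parse_period_quota("", &period, tok, sizeof(tok)) != 0)
                puts("empty input rejected");
        return 0;
}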
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c
index 835671f0f917..b5dcd1d83c7f 100644
--- a/kernel/sched/cpufreq.c
+++ b/kernel/sched/cpufreq.c
@@ -7,7 +7,7 @@
7 */ 7 */
8#include "sched.h" 8#include "sched.h"
9 9
10DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); 10DEFINE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data);
11 11
12/** 12/**
13 * cpufreq_add_update_util_hook - Populate the CPU's update_util_data pointer. 13 * cpufreq_add_update_util_hook - Populate the CPU's update_util_data pointer.
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 2efe629425be..962cf343f798 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -13,6 +13,8 @@
13#include <linux/sched/cpufreq.h> 13#include <linux/sched/cpufreq.h>
14#include <trace/events/power.h> 14#include <trace/events/power.h>
15 15
16#define IOWAIT_BOOST_MIN (SCHED_CAPACITY_SCALE / 8)
17
16struct sugov_tunables { 18struct sugov_tunables {
17 struct gov_attr_set attr_set; 19 struct gov_attr_set attr_set;
18 unsigned int rate_limit_us; 20 unsigned int rate_limit_us;
@@ -48,7 +50,6 @@ struct sugov_cpu {
48 50
49 bool iowait_boost_pending; 51 bool iowait_boost_pending;
50 unsigned int iowait_boost; 52 unsigned int iowait_boost;
51 unsigned int iowait_boost_max;
52 u64 last_update; 53 u64 last_update;
53 54
54 unsigned long bw_dl; 55 unsigned long bw_dl;
@@ -291,8 +292,8 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu)
291 * 292 *
292 * The IO wait boost of a task is disabled after a tick since the last update 293 * The IO wait boost of a task is disabled after a tick since the last update
293 * of a CPU. If a new IO wait boost is requested after more than a tick, then 294 * of a CPU. If a new IO wait boost is requested after more than a tick, then
294 * we enable the boost starting from the minimum frequency, which improves 295 * we enable the boost starting from IOWAIT_BOOST_MIN, which improves energy
295 * energy efficiency by ignoring sporadic wakeups from IO. 296 * efficiency by ignoring sporadic wakeups from IO.
296 */ 297 */
297static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time, 298static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time,
298 bool set_iowait_boost) 299 bool set_iowait_boost)
@@ -303,8 +304,7 @@ static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time,
303 if (delta_ns <= TICK_NSEC) 304 if (delta_ns <= TICK_NSEC)
304 return false; 305 return false;
305 306
306 sg_cpu->iowait_boost = set_iowait_boost 307 sg_cpu->iowait_boost = set_iowait_boost ? IOWAIT_BOOST_MIN : 0;
307 ? sg_cpu->sg_policy->policy->min : 0;
308 sg_cpu->iowait_boost_pending = set_iowait_boost; 308 sg_cpu->iowait_boost_pending = set_iowait_boost;
309 309
310 return true; 310 return true;
@@ -318,8 +318,9 @@ static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time,
318 * 318 *
319 * Each time a task wakes up after an IO operation, the CPU utilization can be 319 * Each time a task wakes up after an IO operation, the CPU utilization can be
320 * boosted to a certain utilization which doubles at each "frequent and 320 * boosted to a certain utilization which doubles at each "frequent and
321 * successive" wakeup from IO, ranging from the utilization of the minimum 321 * successive" wakeup from IO, ranging from IOWAIT_BOOST_MIN to the utilization
322 * OPP to the utilization of the maximum OPP. 322 * of the maximum OPP.
323 *
323 * To keep doubling, an IO boost has to be requested at least once per tick, 324 * To keep doubling, an IO boost has to be requested at least once per tick,
324 * otherwise we restart from the utilization of the minimum OPP. 325 * otherwise we restart from the utilization of the minimum OPP.
325 */ 326 */
@@ -344,14 +345,13 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
344 345
345 /* Double the boost at each request */ 346 /* Double the boost at each request */
346 if (sg_cpu->iowait_boost) { 347 if (sg_cpu->iowait_boost) {
347 sg_cpu->iowait_boost <<= 1; 348 sg_cpu->iowait_boost =
348 if (sg_cpu->iowait_boost > sg_cpu->iowait_boost_max) 349 min_t(unsigned int, sg_cpu->iowait_boost << 1, SCHED_CAPACITY_SCALE);
349 sg_cpu->iowait_boost = sg_cpu->iowait_boost_max;
350 return; 350 return;
351 } 351 }
352 352
353 /* First wakeup after IO: start with minimum boost */ 353 /* First wakeup after IO: start with minimum boost */
354 sg_cpu->iowait_boost = sg_cpu->sg_policy->policy->min; 354 sg_cpu->iowait_boost = IOWAIT_BOOST_MIN;
355} 355}
356 356
357/** 357/**
@@ -373,47 +373,38 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
373 * This mechanism is designed to boost tasks that frequently wait on IO, while 373 * This mechanism is designed to boost tasks that frequently wait on IO, while
374 * being more conservative on tasks which do sporadic IO operations. 374 * being more conservative on tasks which do sporadic IO operations.
375 */ 375 */
376static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, 376static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
377 unsigned long *util, unsigned long *max) 377 unsigned long util, unsigned long max)
378{ 378{
379 unsigned int boost_util, boost_max; 379 unsigned long boost;
380 380
381 /* No boost currently required */ 381 /* No boost currently required */
382 if (!sg_cpu->iowait_boost) 382 if (!sg_cpu->iowait_boost)
383 return; 383 return util;
384 384
385 /* Reset boost if the CPU appears to have been idle enough */ 385 /* Reset boost if the CPU appears to have been idle enough */
386 if (sugov_iowait_reset(sg_cpu, time, false)) 386 if (sugov_iowait_reset(sg_cpu, time, false))
387 return; 387 return util;
388 388
389 /* 389 if (!sg_cpu->iowait_boost_pending) {
390 * An IO waiting task has just woken up:
391 * allow to further double the boost value
392 */
393 if (sg_cpu->iowait_boost_pending) {
394 sg_cpu->iowait_boost_pending = false;
395 } else {
396 /* 390 /*
397 * Otherwise: reduce the boost value and disable it when we 391 * No boost pending; reduce the boost value.
398 * reach the minimum.
399 */ 392 */
400 sg_cpu->iowait_boost >>= 1; 393 sg_cpu->iowait_boost >>= 1;
401 if (sg_cpu->iowait_boost < sg_cpu->sg_policy->policy->min) { 394 if (sg_cpu->iowait_boost < IOWAIT_BOOST_MIN) {
402 sg_cpu->iowait_boost = 0; 395 sg_cpu->iowait_boost = 0;
403 return; 396 return util;
404 } 397 }
405 } 398 }
406 399
400 sg_cpu->iowait_boost_pending = false;
401
407 /* 402 /*
408 * Apply the current boost value: a CPU is boosted only if its current 403 * @util is already in capacity scale; convert iowait_boost
409 * utilization is smaller then the current IO boost level. 404 * into the same scale so we can compare.
410 */ 405 */
411 boost_util = sg_cpu->iowait_boost; 406 boost = (sg_cpu->iowait_boost * max) >> SCHED_CAPACITY_SHIFT;
412 boost_max = sg_cpu->iowait_boost_max; 407 return max(boost, util);
413 if (*util * boost_max < *max * boost_util) {
414 *util = boost_util;
415 *max = boost_max;
416 }
417} 408}
418 409
419#ifdef CONFIG_NO_HZ_COMMON 410#ifdef CONFIG_NO_HZ_COMMON
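Editorial aside on the schedutil hunks above: the iowait boost is decoupled from policy->min and iowait_boost_max and now lives in capacity units, doubling from IOWAIT_BOOST_MIN up to SCHED_CAPACITY_SCALE and decaying by halves, then being applied as a floor under the utilization after scaling by the CPU's max capacity. A simplified standalone model follows; the pending-wakeup bookkeeping and locking are omitted, and the helper names are illustrative.

#include <stdio.h>

#define SCHED_CAPACITY_SHIFT  10
#define SCHED_CAPACITY_SCALE  (1UL << SCHED_CAPACITY_SHIFT)   /* 1024 */
#define IOWAIT_BOOST_MIN      (SCHED_CAPACITY_SCALE / 8)      /* 128  */

static unsigned long boost;

static void io_wakeup(void)
{
        /* Double on each "frequent and successive" IO wakeup, capped at full scale. */
        if (boost)
                boost = boost * 2 > SCHED_CAPACITY_SCALE ? SCHED_CAPACITY_SCALE : boost * 2;
        else
                boost = IOWAIT_BOOST_MIN;
}

static unsigned long apply_boost(unsigned long util, unsigned long max_cap)
{
        unsigned long scaled;

        if (!boost)
                return util;

        /* No fresh wakeup pending in this model: decay towards zero. */
        boost >>= 1;
        if (boost < IOWAIT_BOOST_MIN) {
                boost = 0;
                return util;
        }

        /* Convert the boost into the same capacity scale as util, use as a floor. */
        scaled = (boost * max_cap) >> SCHED_CAPACITY_SHIFT;
        return scaled > util ? scaled : util;
}

int main(void)
{
        unsigned long max_cap = 1024;                    /* a full-capacity CPU */

        io_wakeup(); io_wakeup(); io_wakeup();           /* 128 -> 256 -> 512   */
        printf("boosted: %lu\n", apply_boost(100, max_cap));   /* 256 */
        printf("decayed: %lu\n", apply_boost(100, max_cap));   /* 128 */
        printf("gone:    %lu\n", apply_boost(100, max_cap));   /* 100 */
        return 0;
}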
@@ -460,7 +451,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
460 451
461 util = sugov_get_util(sg_cpu); 452 util = sugov_get_util(sg_cpu);
462 max = sg_cpu->max; 453 max = sg_cpu->max;
463 sugov_iowait_apply(sg_cpu, time, &util, &max); 454 util = sugov_iowait_apply(sg_cpu, time, util, max);
464 next_f = get_next_freq(sg_policy, util, max); 455 next_f = get_next_freq(sg_policy, util, max);
465 /* 456 /*
466 * Do not reduce the frequency if the CPU has not been idle 457 * Do not reduce the frequency if the CPU has not been idle
@@ -500,7 +491,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
500 491
501 j_util = sugov_get_util(j_sg_cpu); 492 j_util = sugov_get_util(j_sg_cpu);
502 j_max = j_sg_cpu->max; 493 j_max = j_sg_cpu->max;
503 sugov_iowait_apply(j_sg_cpu, time, &j_util, &j_max); 494 j_util = sugov_iowait_apply(j_sg_cpu, time, j_util, j_max);
504 495
505 if (j_util * max > j_max * util) { 496 if (j_util * max > j_max * util) {
506 util = j_util; 497 util = j_util;
@@ -609,13 +600,14 @@ rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, size_t count
609 600
610static struct governor_attr rate_limit_us = __ATTR_RW(rate_limit_us); 601static struct governor_attr rate_limit_us = __ATTR_RW(rate_limit_us);
611 602
612static struct attribute *sugov_attributes[] = { 603static struct attribute *sugov_attrs[] = {
613 &rate_limit_us.attr, 604 &rate_limit_us.attr,
614 NULL 605 NULL
615}; 606};
607ATTRIBUTE_GROUPS(sugov);
616 608
617static struct kobj_type sugov_tunables_ktype = { 609static struct kobj_type sugov_tunables_ktype = {
618 .default_attrs = sugov_attributes, 610 .default_groups = sugov_groups,
619 .sysfs_ops = &governor_sysfs_ops, 611 .sysfs_ops = &governor_sysfs_ops,
620}; 612};
621 613
@@ -782,6 +774,7 @@ out:
782 return 0; 774 return 0;
783 775
784fail: 776fail:
777 kobject_put(&tunables->attr_set.kobj);
785 policy->governor_data = NULL; 778 policy->governor_data = NULL;
786 sugov_tunables_free(tunables); 779 sugov_tunables_free(tunables);
787 780
@@ -837,7 +830,6 @@ static int sugov_start(struct cpufreq_policy *policy)
837 memset(sg_cpu, 0, sizeof(*sg_cpu)); 830 memset(sg_cpu, 0, sizeof(*sg_cpu));
838 sg_cpu->cpu = cpu; 831 sg_cpu->cpu = cpu;
839 sg_cpu->sg_policy = sg_policy; 832 sg_cpu->sg_policy = sg_policy;
840 sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
841 } 833 }
842 834
843 for_each_cpu(cpu, policy->cpus) { 835 for_each_cpu(cpu, policy->cpus) {
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 6a73e41a2016..43901fa3f269 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -252,7 +252,6 @@ static void task_non_contending(struct task_struct *p)
252 if (dl_entity_is_special(dl_se)) 252 if (dl_entity_is_special(dl_se))
253 return; 253 return;
254 254
255 WARN_ON(hrtimer_active(&dl_se->inactive_timer));
256 WARN_ON(dl_se->dl_non_contending); 255 WARN_ON(dl_se->dl_non_contending);
257 256
258 zerolag_time = dl_se->deadline - 257 zerolag_time = dl_se->deadline -
@@ -269,7 +268,7 @@ static void task_non_contending(struct task_struct *p)
269 * If the "0-lag time" already passed, decrease the active 268 * If the "0-lag time" already passed, decrease the active
270 * utilization now, instead of starting a timer 269 * utilization now, instead of starting a timer
271 */ 270 */
272 if (zerolag_time < 0) { 271 if ((zerolag_time < 0) || hrtimer_active(&dl_se->inactive_timer)) {
273 if (dl_task(p)) 272 if (dl_task(p))
274 sub_running_bw(dl_se, dl_rq); 273 sub_running_bw(dl_se, dl_rq);
275 if (!dl_task(p) || p->state == TASK_DEAD) { 274 if (!dl_task(p) || p->state == TASK_DEAD) {
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 8039d62ae36e..678bfb9bd87f 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -702,7 +702,7 @@ do { \
702 702
703static const char *sched_tunable_scaling_names[] = { 703static const char *sched_tunable_scaling_names[] = {
704 "none", 704 "none",
705 "logaritmic", 705 "logarithmic",
706 "linear" 706 "linear"
707}; 707};
708 708
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ea74d43924b2..f35930f5e528 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2007,6 +2007,10 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
2007 if (p->last_task_numa_placement) { 2007 if (p->last_task_numa_placement) {
2008 delta = runtime - p->last_sum_exec_runtime; 2008 delta = runtime - p->last_sum_exec_runtime;
2009 *period = now - p->last_task_numa_placement; 2009 *period = now - p->last_task_numa_placement;
2010
2011 /* Avoid time going backwards, prevent potential divide error: */
2012 if (unlikely((s64)*period < 0))
2013 *period = 0;
2010 } else { 2014 } else {
2011 delta = p->se.avg.load_sum; 2015 delta = p->se.avg.load_sum;
2012 *period = LOAD_AVG_MAX; 2016 *period = LOAD_AVG_MAX;
@@ -2593,7 +2597,7 @@ out:
2593/* 2597/*
2594 * Drive the periodic memory faults.. 2598 * Drive the periodic memory faults..
2595 */ 2599 */
2596void task_tick_numa(struct rq *rq, struct task_struct *curr) 2600static void task_tick_numa(struct rq *rq, struct task_struct *curr)
2597{ 2601{
2598 struct callback_head *work = &curr->numa_work; 2602 struct callback_head *work = &curr->numa_work;
2599 u64 period, now; 2603 u64 period, now;
@@ -3567,7 +3571,7 @@ static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
3567 * Synchronize entity load avg of dequeued entity without locking 3571 * Synchronize entity load avg of dequeued entity without locking
3568 * the previous rq. 3572 * the previous rq.
3569 */ 3573 */
3570void sync_entity_load_avg(struct sched_entity *se) 3574static void sync_entity_load_avg(struct sched_entity *se)
3571{ 3575{
3572 struct cfs_rq *cfs_rq = cfs_rq_of(se); 3576 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3573 u64 last_update_time; 3577 u64 last_update_time;
@@ -3580,7 +3584,7 @@ void sync_entity_load_avg(struct sched_entity *se)
3580 * Task first catches up with cfs_rq, and then subtract 3584 * Task first catches up with cfs_rq, and then subtract
3581 * itself from the cfs_rq (task must be off the queue now). 3585 * itself from the cfs_rq (task must be off the queue now).
3582 */ 3586 */
3583void remove_entity_load_avg(struct sched_entity *se) 3587static void remove_entity_load_avg(struct sched_entity *se)
3584{ 3588{
3585 struct cfs_rq *cfs_rq = cfs_rq_of(se); 3589 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3586 unsigned long flags; 3590 unsigned long flags;
@@ -4885,6 +4889,8 @@ static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
4885 return HRTIMER_NORESTART; 4889 return HRTIMER_NORESTART;
4886} 4890}
4887 4891
4892extern const u64 max_cfs_quota_period;
4893
4888static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) 4894static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
4889{ 4895{
4890 struct cfs_bandwidth *cfs_b = 4896 struct cfs_bandwidth *cfs_b =
@@ -4892,6 +4898,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
4892 unsigned long flags; 4898 unsigned long flags;
4893 int overrun; 4899 int overrun;
4894 int idle = 0; 4900 int idle = 0;
4901 int count = 0;
4895 4902
4896 raw_spin_lock_irqsave(&cfs_b->lock, flags); 4903 raw_spin_lock_irqsave(&cfs_b->lock, flags);
4897 for (;;) { 4904 for (;;) {
@@ -4899,6 +4906,28 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
4899 if (!overrun) 4906 if (!overrun)
4900 break; 4907 break;
4901 4908
4909 if (++count > 3) {
4910 u64 new, old = ktime_to_ns(cfs_b->period);
4911
4912 new = (old * 147) / 128; /* ~115% */
4913 new = min(new, max_cfs_quota_period);
4914
4915 cfs_b->period = ns_to_ktime(new);
4916
4917 /* since max is 1s, this is limited to 1e9^2, which fits in u64 */
4918 cfs_b->quota *= new;
4919 cfs_b->quota = div64_u64(cfs_b->quota, old);
4920
4921 pr_warn_ratelimited(
4922 "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us %lld, cfs_quota_us = %lld)\n",
4923 smp_processor_id(),
4924 div_u64(new, NSEC_PER_USEC),
4925 div_u64(cfs_b->quota, NSEC_PER_USEC));
4926
4927 /* reset count so we don't come right back in here */
4928 count = 0;
4929 }
4930
4902 idle = do_sched_cfs_period_timer(cfs_b, overrun, flags); 4931 idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
4903 } 4932 }
4904 if (idle) 4933 if (idle)
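Editorial aside on the cfs period timer hunk above: after more than three consecutive overruns, the period is scaled up by 147/128 (about +15%) and the quota is rescaled by the same ratio, so the effective quota/period bandwidth is unchanged while the timer fires less often; the period is capped at max_cfs_quota_period (1 s). A worked standalone example of that arithmetic:

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_USEC         1000ULL
#define MAX_CFS_QUOTA_PERIOD  1000000000ULL   /* 1 s, mirroring max_cfs_quota_period */

int main(void)
{
        uint64_t period = 100ULL * NSEC_PER_USEC;   /* an absurdly short 100 us period */
        uint64_t quota  = 50ULL * NSEC_PER_USEC;    /* 50% of one CPU */

        for (int step = 0; step < 5; step++) {
                uint64_t new_period = (period * 147) / 128;

                if (new_period > MAX_CFS_QUOTA_PERIOD)
                        new_period = MAX_CFS_QUOTA_PERIOD;

                quota  = quota * new_period / period;   /* keep quota/period constant */
                period = new_period;

                printf("step %d: period=%llu us quota=%llu us (ratio %.3f)\n",
                       step,
                       (unsigned long long)(period / NSEC_PER_USEC),
                       (unsigned long long)(quota / NSEC_PER_USEC),
                       (double)quota / period);
        }
        return 0;
}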
@@ -5116,7 +5145,6 @@ static inline void hrtick_update(struct rq *rq)
5116 5145
5117#ifdef CONFIG_SMP 5146#ifdef CONFIG_SMP
5118static inline unsigned long cpu_util(int cpu); 5147static inline unsigned long cpu_util(int cpu);
5119static unsigned long capacity_of(int cpu);
5120 5148
5121static inline bool cpu_overutilized(int cpu) 5149static inline bool cpu_overutilized(int cpu)
5122{ 5150{
@@ -7492,7 +7520,6 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
7492{ 7520{
7493 lockdep_assert_held(&env->src_rq->lock); 7521 lockdep_assert_held(&env->src_rq->lock);
7494 7522
7495 p->on_rq = TASK_ON_RQ_MIGRATING;
7496 deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK); 7523 deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
7497 set_task_cpu(p, env->dst_cpu); 7524 set_task_cpu(p, env->dst_cpu);
7498} 7525}
@@ -7628,7 +7655,6 @@ static void attach_task(struct rq *rq, struct task_struct *p)
7628 7655
7629 BUG_ON(task_rq(p) != rq); 7656 BUG_ON(task_rq(p) != rq);
7630 activate_task(rq, p, ENQUEUE_NOCLOCK); 7657 activate_task(rq, p, ENQUEUE_NOCLOCK);
7631 p->on_rq = TASK_ON_RQ_QUEUED;
7632 check_preempt_curr(rq, p, 0); 7658 check_preempt_curr(rq, p, 0);
7633} 7659}
7634 7660
@@ -7784,10 +7810,10 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
7784 if (cfs_rq->last_h_load_update == now) 7810 if (cfs_rq->last_h_load_update == now)
7785 return; 7811 return;
7786 7812
7787 cfs_rq->h_load_next = NULL; 7813 WRITE_ONCE(cfs_rq->h_load_next, NULL);
7788 for_each_sched_entity(se) { 7814 for_each_sched_entity(se) {
7789 cfs_rq = cfs_rq_of(se); 7815 cfs_rq = cfs_rq_of(se);
7790 cfs_rq->h_load_next = se; 7816 WRITE_ONCE(cfs_rq->h_load_next, se);
7791 if (cfs_rq->last_h_load_update == now) 7817 if (cfs_rq->last_h_load_update == now)
7792 break; 7818 break;
7793 } 7819 }
@@ -7797,7 +7823,7 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
7797 cfs_rq->last_h_load_update = now; 7823 cfs_rq->last_h_load_update = now;
7798 } 7824 }
7799 7825
7800 while ((se = cfs_rq->h_load_next) != NULL) { 7826 while ((se = READ_ONCE(cfs_rq->h_load_next)) != NULL) {
7801 load = cfs_rq->h_load; 7827 load = cfs_rq->h_load;
7802 load = div64_ul(load * se->avg.load_avg, 7828 load = div64_ul(load * se->avg.load_avg,
7803 cfs_rq_load_avg(cfs_rq) + 1); 7829 cfs_rq_load_avg(cfs_rq) + 1);
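Editorial aside on the h_load_next hunks above: the pointer is written on one CPU while another may be walking the chain, so the plain accesses become WRITE_ONCE()/READ_ONCE() to force single, non-refetched loads and stores. A minimal userspace sketch of that annotation pattern; these macros are stand-ins for the kernel's, relying on volatile accesses in the same way.

#include <stdio.h>

#define WRITE_ONCE(x, val)  (*(volatile __typeof__(x) *)&(x) = (val))
#define READ_ONCE(x)        (*(volatile __typeof__(x) *)&(x))

struct node { struct node *next; int id; };

int main(void)
{
        struct node b = { .next = NULL, .id = 2 };
        struct node a = { .next = NULL, .id = 1 };
        struct node *head = NULL;

        /* Publisher side: each pointer is stored exactly once. */
        WRITE_ONCE(a.next, &b);
        WRITE_ONCE(head, &a);

        /* Reader side: each pointer is loaded exactly once per iteration. */
        for (struct node *n = READ_ONCE(head); n; n = READ_ONCE(n->next))
                printf("node %d\n", n->id);
        return 0;
}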
@@ -8060,6 +8086,18 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
8060} 8086}
8061 8087
8062/* 8088/*
8089 * Check whether a rq has a misfit task and if it looks like we can actually
8090 * help that task: we can migrate the task to a CPU of higher capacity, or
8091 * the task's current CPU is heavily pressured.
8092 */
8093static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
8094{
8095 return rq->misfit_task_load &&
8096 (rq->cpu_capacity_orig < rq->rd->max_cpu_capacity ||
8097 check_cpu_capacity(rq, sd));
8098}
8099
8100/*
8063 * Group imbalance indicates (and tries to solve) the problem where balancing 8101 * Group imbalance indicates (and tries to solve) the problem where balancing
8064 * groups is inadequate due to ->cpus_allowed constraints. 8102 * groups is inadequate due to ->cpus_allowed constraints.
8065 * 8103 *
@@ -9510,22 +9548,26 @@ static inline int on_null_domain(struct rq *rq)
9510 * - When one of the busy CPUs notice that there may be an idle rebalancing 9548 * - When one of the busy CPUs notice that there may be an idle rebalancing
9511 * needed, they will kick the idle load balancer, which then does idle 9549 * needed, they will kick the idle load balancer, which then does idle
9512 * load balancing for all the idle CPUs. 9550 * load balancing for all the idle CPUs.
9551 * - HK_FLAG_MISC CPUs are used for this task, because HK_FLAG_SCHED not set
9552 * anywhere yet.
9513 */ 9553 */
9514 9554
9515static inline int find_new_ilb(void) 9555static inline int find_new_ilb(void)
9516{ 9556{
9517 int ilb = cpumask_first(nohz.idle_cpus_mask); 9557 int ilb;
9518 9558
9519 if (ilb < nr_cpu_ids && idle_cpu(ilb)) 9559 for_each_cpu_and(ilb, nohz.idle_cpus_mask,
9520 return ilb; 9560 housekeeping_cpumask(HK_FLAG_MISC)) {
9561 if (idle_cpu(ilb))
9562 return ilb;
9563 }
9521 9564
9522 return nr_cpu_ids; 9565 return nr_cpu_ids;
9523} 9566}
9524 9567
9525/* 9568/*
9526 * Kick a CPU to do the nohz balancing, if it is time for it. We pick the 9569 * Kick a CPU to do the nohz balancing, if it is time for it. We pick any
9527 * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle 9570 * idle CPU in the HK_FLAG_MISC housekeeping set (if there is one).
9528 * CPU (if there is one).
9529 */ 9571 */
9530static void kick_ilb(unsigned int flags) 9572static void kick_ilb(unsigned int flags)
9531{ 9573{
@@ -9586,35 +9628,21 @@ static void nohz_balancer_kick(struct rq *rq)
9586 if (time_before(now, nohz.next_balance)) 9628 if (time_before(now, nohz.next_balance))
9587 goto out; 9629 goto out;
9588 9630
9589 if (rq->nr_running >= 2 || rq->misfit_task_load) { 9631 if (rq->nr_running >= 2) {
9590 flags = NOHZ_KICK_MASK; 9632 flags = NOHZ_KICK_MASK;
9591 goto out; 9633 goto out;
9592 } 9634 }
9593 9635
9594 rcu_read_lock(); 9636 rcu_read_lock();
9595 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
9596 if (sds) {
9597 /*
9598 * If there is an imbalance between LLC domains (IOW we could
9599 * increase the overall cache use), we need some less-loaded LLC
9600 * domain to pull some load. Likewise, we may need to spread
9601 * load within the current LLC domain (e.g. packed SMT cores but
9602 * other CPUs are idle). We can't really know from here how busy
9603 * the others are - so just get a nohz balance going if it looks
9604 * like this LLC domain has tasks we could move.
9605 */
9606 nr_busy = atomic_read(&sds->nr_busy_cpus);
9607 if (nr_busy > 1) {
9608 flags = NOHZ_KICK_MASK;
9609 goto unlock;
9610 }
9611
9612 }
9613 9637
9614 sd = rcu_dereference(rq->sd); 9638 sd = rcu_dereference(rq->sd);
9615 if (sd) { 9639 if (sd) {
9616 if ((rq->cfs.h_nr_running >= 1) && 9640 /*
9617 check_cpu_capacity(rq, sd)) { 9641 * If there's a CFS task and the current CPU has reduced
9642 * capacity; kick the ILB to see if there's a better CPU to run
9643 * on.
9644 */
9645 if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
9618 flags = NOHZ_KICK_MASK; 9646 flags = NOHZ_KICK_MASK;
9619 goto unlock; 9647 goto unlock;
9620 } 9648 }
@@ -9622,6 +9650,11 @@ static void nohz_balancer_kick(struct rq *rq)
9622 9650
9623 sd = rcu_dereference(per_cpu(sd_asym_packing, cpu)); 9651 sd = rcu_dereference(per_cpu(sd_asym_packing, cpu));
9624 if (sd) { 9652 if (sd) {
9653 /*
9654 * When ASYM_PACKING; see if there's a more preferred CPU
9655 * currently idle; in which case, kick the ILB to move tasks
9656 * around.
9657 */
9625 for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) { 9658 for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
9626 if (sched_asym_prefer(i, cpu)) { 9659 if (sched_asym_prefer(i, cpu)) {
9627 flags = NOHZ_KICK_MASK; 9660 flags = NOHZ_KICK_MASK;
@@ -9629,6 +9662,45 @@ static void nohz_balancer_kick(struct rq *rq)
9629 } 9662 }
9630 } 9663 }
9631 } 9664 }
9665
9666 sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu));
9667 if (sd) {
9668 /*
9669 * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU
9670 * to run the misfit task on.
9671 */
9672 if (check_misfit_status(rq, sd)) {
9673 flags = NOHZ_KICK_MASK;
9674 goto unlock;
9675 }
9676
9677 /*
9678 * For asymmetric systems, we do not want to nicely balance
9679 * cache use, instead we want to embrace asymmetry and only
9680 * ensure tasks have enough CPU capacity.
9681 *
9682 * Skip the LLC logic because it's not relevant in that case.
9683 */
9684 goto unlock;
9685 }
9686
9687 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
9688 if (sds) {
9689 /*
9690 * If there is an imbalance between LLC domains (IOW we could
9691 * increase the overall cache use), we need some less-loaded LLC
9692 * domain to pull some load. Likewise, we may need to spread
9693 * load within the current LLC domain (e.g. packed SMT cores but
9694 * other CPUs are idle). We can't really know from here how busy
9695 * the others are - so just get a nohz balance going if it looks
9696 * like this LLC domain has tasks we could move.
9697 */
9698 nr_busy = atomic_read(&sds->nr_busy_cpus);
9699 if (nr_busy > 1) {
9700 flags = NOHZ_KICK_MASK;
9701 goto unlock;
9702 }
9703 }
9632unlock: 9704unlock:
9633 rcu_read_unlock(); 9705 rcu_read_unlock();
9634out: 9706out:
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index b02d148e7672..687302051a27 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -65,6 +65,7 @@ void __init housekeeping_init(void)
65static int __init housekeeping_setup(char *str, enum hk_flags flags) 65static int __init housekeeping_setup(char *str, enum hk_flags flags)
66{ 66{
67 cpumask_var_t non_housekeeping_mask; 67 cpumask_var_t non_housekeeping_mask;
68 cpumask_var_t tmp;
68 int err; 69 int err;
69 70
70 alloc_bootmem_cpumask_var(&non_housekeeping_mask); 71 alloc_bootmem_cpumask_var(&non_housekeeping_mask);
@@ -75,16 +76,23 @@ static int __init housekeeping_setup(char *str, enum hk_flags flags)
75 return 0; 76 return 0;
76 } 77 }
77 78
79 alloc_bootmem_cpumask_var(&tmp);
78 if (!housekeeping_flags) { 80 if (!housekeeping_flags) {
79 alloc_bootmem_cpumask_var(&housekeeping_mask); 81 alloc_bootmem_cpumask_var(&housekeeping_mask);
80 cpumask_andnot(housekeeping_mask, 82 cpumask_andnot(housekeeping_mask,
81 cpu_possible_mask, non_housekeeping_mask); 83 cpu_possible_mask, non_housekeeping_mask);
82 if (cpumask_empty(housekeeping_mask)) 84
85 cpumask_andnot(tmp, cpu_present_mask, non_housekeeping_mask);
86 if (cpumask_empty(tmp)) {
87 pr_warn("Housekeeping: must include one present CPU, "
88 "using boot CPU:%d\n", smp_processor_id());
83 __cpumask_set_cpu(smp_processor_id(), housekeeping_mask); 89 __cpumask_set_cpu(smp_processor_id(), housekeeping_mask);
90 __cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask);
91 }
84 } else { 92 } else {
85 cpumask_var_t tmp; 93 cpumask_andnot(tmp, cpu_present_mask, non_housekeeping_mask);
86 94 if (cpumask_empty(tmp))
87 alloc_bootmem_cpumask_var(&tmp); 95 __cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask);
88 cpumask_andnot(tmp, cpu_possible_mask, non_housekeeping_mask); 96 cpumask_andnot(tmp, cpu_possible_mask, non_housekeeping_mask);
89 if (!cpumask_equal(tmp, housekeeping_mask)) { 97 if (!cpumask_equal(tmp, housekeeping_mask)) {
90 pr_warn("Housekeeping: nohz_full= must match isolcpus=\n"); 98 pr_warn("Housekeeping: nohz_full= must match isolcpus=\n");
@@ -92,8 +100,8 @@ static int __init housekeeping_setup(char *str, enum hk_flags flags)
92 free_bootmem_cpumask_var(non_housekeeping_mask); 100 free_bootmem_cpumask_var(non_housekeeping_mask);
93 return 0; 101 return 0;
94 } 102 }
95 free_bootmem_cpumask_var(tmp);
96 } 103 }
104 free_bootmem_cpumask_var(tmp);
97 105
98 if ((flags & HK_FLAG_TICK) && !(housekeeping_flags & HK_FLAG_TICK)) { 106 if ((flags & HK_FLAG_TICK) && !(housekeeping_flags & HK_FLAG_TICK)) {
99 if (IS_ENABLED(CONFIG_NO_HZ_FULL)) { 107 if (IS_ENABLED(CONFIG_NO_HZ_FULL)) {
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 90fa23d36565..1e6b909dca36 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2555,6 +2555,8 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
2555 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; 2555 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
2556 if (rt_runtime_us < 0) 2556 if (rt_runtime_us < 0)
2557 rt_runtime = RUNTIME_INF; 2557 rt_runtime = RUNTIME_INF;
2558 else if ((u64)rt_runtime_us > U64_MAX / NSEC_PER_USEC)
2559 return -EINVAL;
2558 2560
2559 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); 2561 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
2560} 2562}
@@ -2575,6 +2577,9 @@ int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
2575{ 2577{
2576 u64 rt_runtime, rt_period; 2578 u64 rt_runtime, rt_period;
2577 2579
2580 if (rt_period_us > U64_MAX / NSEC_PER_USEC)
2581 return -EINVAL;
2582
2578 rt_period = rt_period_us * NSEC_PER_USEC; 2583 rt_period = rt_period_us * NSEC_PER_USEC;
2579 rt_runtime = tg->rt_bandwidth.rt_runtime; 2584 rt_runtime = tg->rt_bandwidth.rt_runtime;
2580 2585
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index efa686eeff26..b52ed1ada0be 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -780,7 +780,7 @@ struct root_domain {
780 * NULL-terminated list of performance domains intersecting with the 780 * NULL-terminated list of performance domains intersecting with the
781 * CPUs of the rd. Protected by RCU. 781 * CPUs of the rd. Protected by RCU.
782 */ 782 */
783 struct perf_domain *pd; 783 struct perf_domain __rcu *pd;
784}; 784};
785 785
786extern struct root_domain def_root_domain; 786extern struct root_domain def_root_domain;
@@ -869,8 +869,8 @@ struct rq {
869 atomic_t nr_iowait; 869 atomic_t nr_iowait;
870 870
871#ifdef CONFIG_SMP 871#ifdef CONFIG_SMP
872 struct root_domain *rd; 872 struct root_domain *rd;
873 struct sched_domain *sd; 873 struct sched_domain __rcu *sd;
874 874
875 unsigned long cpu_capacity; 875 unsigned long cpu_capacity;
876 unsigned long cpu_capacity_orig; 876 unsigned long cpu_capacity_orig;
@@ -1324,13 +1324,13 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
1324 return sd; 1324 return sd;
1325} 1325}
1326 1326
1327DECLARE_PER_CPU(struct sched_domain *, sd_llc); 1327DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc);
1328DECLARE_PER_CPU(int, sd_llc_size); 1328DECLARE_PER_CPU(int, sd_llc_size);
1329DECLARE_PER_CPU(int, sd_llc_id); 1329DECLARE_PER_CPU(int, sd_llc_id);
1330DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); 1330DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
1331DECLARE_PER_CPU(struct sched_domain *, sd_numa); 1331DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa);
1332DECLARE_PER_CPU(struct sched_domain *, sd_asym_packing); 1332DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
1333DECLARE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity); 1333DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
1334extern struct static_key_false sched_asym_cpucapacity; 1334extern struct static_key_false sched_asym_cpucapacity;
1335 1335
1336struct sched_group_capacity { 1336struct sched_group_capacity {
@@ -2185,7 +2185,7 @@ static inline u64 irq_time_read(int cpu)
2185#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ 2185#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
2186 2186
2187#ifdef CONFIG_CPU_FREQ 2187#ifdef CONFIG_CPU_FREQ
2188DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); 2188DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data);
2189 2189
2190/** 2190/**
2191 * cpufreq_update_util - Take a note about CPU utilization changes. 2191 * cpufreq_update_util - Take a note about CPU utilization changes.
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index ab7f371a3a17..f53f89df837d 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -615,13 +615,13 @@ static void destroy_sched_domains(struct sched_domain *sd)
615 * the cpumask of the domain), this allows us to quickly tell if 615 * the cpumask of the domain), this allows us to quickly tell if
616 * two CPUs are in the same cache domain, see cpus_share_cache(). 616 * two CPUs are in the same cache domain, see cpus_share_cache().
617 */ 617 */
618DEFINE_PER_CPU(struct sched_domain *, sd_llc); 618DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
619DEFINE_PER_CPU(int, sd_llc_size); 619DEFINE_PER_CPU(int, sd_llc_size);
620DEFINE_PER_CPU(int, sd_llc_id); 620DEFINE_PER_CPU(int, sd_llc_id);
621DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); 621DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
622DEFINE_PER_CPU(struct sched_domain *, sd_numa); 622DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
623DEFINE_PER_CPU(struct sched_domain *, sd_asym_packing); 623DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
624DEFINE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity); 624DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
625DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity); 625DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
626 626
627static void update_top_cache_domain(int cpu) 627static void update_top_cache_domain(int cpu)
@@ -1059,6 +1059,7 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd)
1059 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); 1059 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
1060 struct sched_domain *child = sd->child; 1060 struct sched_domain *child = sd->child;
1061 struct sched_group *sg; 1061 struct sched_group *sg;
1062 bool already_visited;
1062 1063
1063 if (child) 1064 if (child)
1064 cpu = cpumask_first(sched_domain_span(child)); 1065 cpu = cpumask_first(sched_domain_span(child));
@@ -1066,9 +1067,14 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd)
1066 sg = *per_cpu_ptr(sdd->sg, cpu); 1067 sg = *per_cpu_ptr(sdd->sg, cpu);
1067 sg->sgc = *per_cpu_ptr(sdd->sgc, cpu); 1068 sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
1068 1069
1069 /* For claim_allocations: */ 1070 /* Increase refcounts for claim_allocations: */
1070 atomic_inc(&sg->ref); 1071 already_visited = atomic_inc_return(&sg->ref) > 1;
1071 atomic_inc(&sg->sgc->ref); 1072 /* sgc visits should follow a similar trend as sg */
1073 WARN_ON(already_visited != (atomic_inc_return(&sg->sgc->ref) > 1));
1074
1075 /* If we have already visited that group, it's already initialized. */
1076 if (already_visited)
1077 return sg;
1072 1078
1073 if (child) { 1079 if (child) {
1074 cpumask_copy(sched_group_span(sg), sched_domain_span(child)); 1080 cpumask_copy(sched_group_span(sg), sched_domain_span(child));
@@ -1087,8 +1093,8 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd)
1087 1093
1088/* 1094/*
1089 * build_sched_groups will build a circular linked list of the groups 1095 * build_sched_groups will build a circular linked list of the groups
1090 * covered by the given span, and will set each group's ->cpumask correctly, 1096 * covered by the given span, will set each group's ->cpumask correctly,
1091 * and ->cpu_capacity to 0. 1097 * and will initialize their ->sgc.
1092 * 1098 *
1093 * Assumes the sched_domain tree is fully constructed 1099 * Assumes the sched_domain tree is fully constructed
1094 */ 1100 */
@@ -2075,9 +2081,8 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
2075} 2081}
2076 2082
2077/* 2083/*
2078 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 2084 * Set up scheduler domains and groups. For now this just excludes isolated
2079 * For now this just excludes isolated CPUs, but could be used to 2085 * CPUs, but could be used to exclude other special cases in the future.
2080 * exclude other special cases in the future.
2081 */ 2086 */
2082int sched_init_domains(const struct cpumask *cpu_map) 2087int sched_init_domains(const struct cpumask *cpu_map)
2083{ 2088{
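Editorial aside on the get_group() hunk above: the return value of atomic_inc_return() now doubles as an "already visited" test, so a group that has been seen before only takes references and skips re-initialization. A standalone sketch of that first-visitor-initializes idiom using C11 atomics in place of the kernel's atomic_t:

#include <stdatomic.h>
#include <stdio.h>

struct group {
        atomic_int ref;
        int initialized;
};

static void visit(struct group *g, int cpu)
{
        /* fetch_add returns the old value; old > 0 means someone got here first. */
        int already_visited = atomic_fetch_add(&g->ref, 1) > 0;

        if (already_visited) {
                printf("cpu %d: group already set up, just took a reference\n", cpu);
                return;
        }
        g->initialized = 1;
        printf("cpu %d: first visitor, initialized the group\n", cpu);
}

int main(void)
{
        struct group g = { .ref = 0, .initialized = 0 };

        visit(&g, 0);
        visit(&g, 1);
        printf("refcount=%d initialized=%d\n", atomic_load(&g.ref), g.initialized);
        return 0;
}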
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 54a0347ca812..a635ecba6fe2 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -149,7 +149,7 @@ static void populate_seccomp_data(struct seccomp_data *sd)
149 149
150 sd->nr = syscall_get_nr(task, regs); 150 sd->nr = syscall_get_nr(task, regs);
151 sd->arch = syscall_get_arch(); 151 sd->arch = syscall_get_arch();
152 syscall_get_arguments(task, regs, 0, 6, args); 152 syscall_get_arguments(task, regs, args);
153 sd->args[0] = args[0]; 153 sd->args[0] = args[0];
154 sd->args[1] = args[1]; 154 sd->args[1] = args[1];
155 sd->args[2] = args[2]; 155 sd->args[2] = args[2];
@@ -331,7 +331,7 @@ static int is_ancestor(struct seccomp_filter *parent,
331 * Expects sighand and cred_guard_mutex locks to be held. 331 * Expects sighand and cred_guard_mutex locks to be held.
332 * 332 *
333 * Returns 0 on success, -ve on error, or the pid of a thread which was 333 * Returns 0 on success, -ve on error, or the pid of a thread which was
334 * either not in the correct seccomp mode or it did not have an ancestral 334 * either not in the correct seccomp mode or did not have an ancestral
335 * seccomp filter. 335 * seccomp filter.
336 */ 336 */
337static inline pid_t seccomp_can_sync_threads(void) 337static inline pid_t seccomp_can_sync_threads(void)
@@ -502,7 +502,10 @@ out:
502 * 502 *
503 * Caller must be holding current->sighand->siglock lock. 503 * Caller must be holding current->sighand->siglock lock.
504 * 504 *
505 * Returns 0 on success, -ve on error. 505 * Returns 0 on success, -ve on error, or
506 * - in TSYNC mode: the pid of a thread which was either not in the correct
507 * seccomp mode or did not have an ancestral seccomp filter
508 * - in NEW_LISTENER mode: the fd of the new listener
506 */ 509 */
507static long seccomp_attach_filter(unsigned int flags, 510static long seccomp_attach_filter(unsigned int flags,
508 struct seccomp_filter *filter) 511 struct seccomp_filter *filter)
@@ -1258,6 +1261,16 @@ static long seccomp_set_mode_filter(unsigned int flags,
1258 if (flags & ~SECCOMP_FILTER_FLAG_MASK) 1261 if (flags & ~SECCOMP_FILTER_FLAG_MASK)
1259 return -EINVAL; 1262 return -EINVAL;
1260 1263
1264 /*
1265 * In the successful case, NEW_LISTENER returns the new listener fd.
1266 * But in the failure case, TSYNC returns the thread that died. If you
1267 * combine these two flags, there's no way to tell whether something
1268 * succeeded or failed. So, let's disallow this combination.
1269 */
1270 if ((flags & SECCOMP_FILTER_FLAG_TSYNC) &&
1271 (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER))
1272 return -EINVAL;
1273
1261 /* Prepare the new filter before holding any locks. */ 1274 /* Prepare the new filter before holding any locks. */
1262 prepared = seccomp_prepare_user_filter(filter); 1275 prepared = seccomp_prepare_user_filter(filter);
1263 if (IS_ERR(prepared)) 1276 if (IS_ERR(prepared))
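Editorial aside on the seccomp hunk above: TSYNC and NEW_LISTENER are now mutually exclusive because a positive return value would be ambiguous (listener fd on success vs. the thread id that blocked TSYNC on failure). A hedged userspace sketch that exercises the new check; it assumes your <linux/seccomp.h> is new enough to define both flags and that SYS_seccomp is available.

#define _GNU_SOURCE
#include <errno.h>
#include <linux/filter.h>
#include <linux/seccomp.h>
#include <stdio.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
        struct sock_filter insn = BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW);
        struct sock_fprog prog = { .len = 1, .filter = &insn };
        long ret;

        /* Required before installing a filter without CAP_SYS_ADMIN. */
        prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);

        ret = syscall(SYS_seccomp, SECCOMP_SET_MODE_FILTER,
                      SECCOMP_FILTER_FLAG_TSYNC | SECCOMP_FILTER_FLAG_NEW_LISTENER,
                      &prog);
        if (ret == -1 && errno == EINVAL)
                puts("TSYNC|NEW_LISTENER rejected, as expected");
        else
                printf("unexpected result: ret=%ld errno=%d\n", ret, errno);
        return 0;
}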
@@ -1304,7 +1317,7 @@ out:
1304 mutex_unlock(&current->signal->cred_guard_mutex); 1317 mutex_unlock(&current->signal->cred_guard_mutex);
1305out_put_fd: 1318out_put_fd:
1306 if (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) { 1319 if (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) {
1307 if (ret < 0) { 1320 if (ret) {
1308 listener_f->private_data = NULL; 1321 listener_f->private_data = NULL;
1309 fput(listener_f); 1322 fput(listener_f);
1310 put_unused_fd(listener); 1323 put_unused_fd(listener);
diff --git a/kernel/signal.c b/kernel/signal.c
index b7953934aa99..cd83cc376767 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3513,7 +3513,6 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
3513 return kill_something_info(sig, &info, pid); 3513 return kill_something_info(sig, &info, pid);
3514} 3514}
3515 3515
3516#ifdef CONFIG_PROC_FS
3517/* 3516/*
3518 * Verify that the signaler and signalee either are in the same pid namespace 3517 * Verify that the signaler and signalee either are in the same pid namespace
3519 * or that the signaler's pid namespace is an ancestor of the signalee's pid 3518 * or that the signaler's pid namespace is an ancestor of the signalee's pid
@@ -3550,6 +3549,14 @@ static int copy_siginfo_from_user_any(kernel_siginfo_t *kinfo, siginfo_t *info)
3550 return copy_siginfo_from_user(kinfo, info); 3549 return copy_siginfo_from_user(kinfo, info);
3551} 3550}
3552 3551
3552static struct pid *pidfd_to_pid(const struct file *file)
3553{
3554 if (file->f_op == &pidfd_fops)
3555 return file->private_data;
3556
3557 return tgid_pidfd_to_pid(file);
3558}
3559
3553/** 3560/**
3554 * sys_pidfd_send_signal - send a signal to a process through a task file 3561 * sys_pidfd_send_signal - send a signal to a process through a task file
3555 * descriptor 3562 * descriptor
@@ -3581,12 +3588,12 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
3581 if (flags) 3588 if (flags)
3582 return -EINVAL; 3589 return -EINVAL;
3583 3590
3584 f = fdget_raw(pidfd); 3591 f = fdget(pidfd);
3585 if (!f.file) 3592 if (!f.file)
3586 return -EBADF; 3593 return -EBADF;
3587 3594
3588 /* Is this a pidfd? */ 3595 /* Is this a pidfd? */
3589 pid = tgid_pidfd_to_pid(f.file); 3596 pid = pidfd_to_pid(f.file);
3590 if (IS_ERR(pid)) { 3597 if (IS_ERR(pid)) {
3591 ret = PTR_ERR(pid); 3598 ret = PTR_ERR(pid);
3592 goto err; 3599 goto err;
@@ -3605,16 +3612,11 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
3605 if (unlikely(sig != kinfo.si_signo)) 3612 if (unlikely(sig != kinfo.si_signo))
3606 goto err; 3613 goto err;
3607 3614
3615 /* Only allow sending arbitrary signals to yourself. */
3616 ret = -EPERM;
3608 if ((task_pid(current) != pid) && 3617 if ((task_pid(current) != pid) &&
3609 (kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL)) { 3618 (kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL))
3610 /* Only allow sending arbitrary signals to yourself. */ 3619 goto err;
3611 ret = -EPERM;
3612 if (kinfo.si_code != SI_USER)
3613 goto err;
3614
3615 /* Turn this into a regular kill signal. */
3616 prepare_kill_siginfo(sig, &kinfo);
3617 }
3618 } else { 3620 } else {
3619 prepare_kill_siginfo(sig, &kinfo); 3621 prepare_kill_siginfo(sig, &kinfo);
3620 } 3622 }
@@ -3625,7 +3627,6 @@ err:
3625 fdput(f); 3627 fdput(f);
3626 return ret; 3628 return ret;
3627} 3629}
3628#endif /* CONFIG_PROC_FS */
3629 3630
3630static int 3631static int
3631do_send_specific(pid_t tgid, pid_t pid, int sig, struct kernel_siginfo *info) 3632do_send_specific(pid_t tgid, pid_t pid, int sig, struct kernel_siginfo *info)
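Editorial aside on the signal.c hunks above: the CONFIG_PROC_FS guard is gone and pidfd_to_pid() accepts both the new pidfd_fops descriptors and, via tgid_pidfd_to_pid(), /proc/<pid> directory fds. A hedged userspace sketch of calling the syscall with signal 0 (no signal is delivered, only existence/permission checks run); it assumes __NR_pidfd_send_signal is defined by your headers, and note that kernels of this vintage accepted a /proc/<pid> directory fd as the pidfd while newer kernels expect an fd from clone(CLONE_PIDFD) or pidfd_open(2).

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
        char path[64];
        int pidfd;
        long ret;

        snprintf(path, sizeof(path), "/proc/%d", getpid());
        pidfd = open(path, O_RDONLY | O_DIRECTORY | O_CLOEXEC);
        if (pidfd < 0) {
                perror("open");
                return 1;
        }

        /* flags must be 0; a NULL siginfo means "behave like kill(2)". */
        ret = syscall(__NR_pidfd_send_signal, pidfd, 0, NULL, 0);
        printf("pidfd_send_signal(sig=0) -> %ld\n", ret);

        close(pidfd);
        return 0;
}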
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 10277429ed84..2c3382378d94 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -573,57 +573,6 @@ void tasklet_kill(struct tasklet_struct *t)
573} 573}
574EXPORT_SYMBOL(tasklet_kill); 574EXPORT_SYMBOL(tasklet_kill);
575 575
576/*
577 * tasklet_hrtimer
578 */
579
580/*
581 * The trampoline is called when the hrtimer expires. It schedules a tasklet
582 * to run __tasklet_hrtimer_trampoline() which in turn will call the intended
583 * hrtimer callback, but from softirq context.
584 */
585static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer)
586{
587 struct tasklet_hrtimer *ttimer =
588 container_of(timer, struct tasklet_hrtimer, timer);
589
590 tasklet_hi_schedule(&ttimer->tasklet);
591 return HRTIMER_NORESTART;
592}
593
594/*
595 * Helper function which calls the hrtimer callback from
596 * tasklet/softirq context
597 */
598static void __tasklet_hrtimer_trampoline(unsigned long data)
599{
600 struct tasklet_hrtimer *ttimer = (void *)data;
601 enum hrtimer_restart restart;
602
603 restart = ttimer->function(&ttimer->timer);
604 if (restart != HRTIMER_NORESTART)
605 hrtimer_restart(&ttimer->timer);
606}
607
608/**
609 * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks
610 * @ttimer: tasklet_hrtimer which is initialized
611 * @function: hrtimer callback function which gets called from softirq context
612 * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME)
613 * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL)
614 */
615void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer,
616 enum hrtimer_restart (*function)(struct hrtimer *),
617 clockid_t which_clock, enum hrtimer_mode mode)
618{
619 hrtimer_init(&ttimer->timer, which_clock, mode);
620 ttimer->timer.function = __hrtimer_tasklet_trampoline;
621 tasklet_init(&ttimer->tasklet, __tasklet_hrtimer_trampoline,
622 (unsigned long)ttimer);
623 ttimer->function = function;
624}
625EXPORT_SYMBOL_GPL(tasklet_hrtimer_init);
626
627void __init softirq_init(void) 576void __init softirq_init(void)
628{ 577{
629 int cpu; 578 int cpu;
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index f8edee9c792d..27bafc1e271e 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -5,41 +5,56 @@
5 * 5 *
6 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> 6 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
7 */ 7 */
8#include <linux/sched/task_stack.h>
9#include <linux/sched/debug.h>
8#include <linux/sched.h> 10#include <linux/sched.h>
9#include <linux/kernel.h> 11#include <linux/kernel.h>
10#include <linux/export.h> 12#include <linux/export.h>
11#include <linux/kallsyms.h> 13#include <linux/kallsyms.h>
12#include <linux/stacktrace.h> 14#include <linux/stacktrace.h>
13 15
14void print_stack_trace(struct stack_trace *trace, int spaces) 16/**
17 * stack_trace_print - Print the entries in the stack trace
18 * @entries: Pointer to storage array
19 * @nr_entries: Number of entries in the storage array
20 * @spaces: Number of leading spaces to print
21 */
22void stack_trace_print(unsigned long *entries, unsigned int nr_entries,
23 int spaces)
15{ 24{
16 int i; 25 unsigned int i;
17 26
18 if (WARN_ON(!trace->entries)) 27 if (WARN_ON(!entries))
19 return; 28 return;
20 29
21 for (i = 0; i < trace->nr_entries; i++) 30 for (i = 0; i < nr_entries; i++)
22 printk("%*c%pS\n", 1 + spaces, ' ', (void *)trace->entries[i]); 31 printk("%*c%pS\n", 1 + spaces, ' ', (void *)entries[i]);
23} 32}
24EXPORT_SYMBOL_GPL(print_stack_trace); 33EXPORT_SYMBOL_GPL(stack_trace_print);
25 34
26int snprint_stack_trace(char *buf, size_t size, 35/**
27 struct stack_trace *trace, int spaces) 36 * stack_trace_snprint - Print the entries in the stack trace into a buffer
37 * @buf: Pointer to the print buffer
38 * @size: Size of the print buffer
39 * @entries: Pointer to storage array
40 * @nr_entries: Number of entries in the storage array
41 * @spaces: Number of leading spaces to print
42 *
43 * Return: Number of bytes printed.
44 */
45int stack_trace_snprint(char *buf, size_t size, unsigned long *entries,
46 unsigned int nr_entries, int spaces)
28{ 47{
29 int i; 48 unsigned int generated, i, total = 0;
30 int generated;
31 int total = 0;
32 49
33 if (WARN_ON(!trace->entries)) 50 if (WARN_ON(!entries))
34 return 0; 51 return 0;
35 52
36 for (i = 0; i < trace->nr_entries; i++) { 53 for (i = 0; i < nr_entries && size; i++) {
37 generated = snprintf(buf, size, "%*c%pS\n", 1 + spaces, ' ', 54 generated = snprintf(buf, size, "%*c%pS\n", 1 + spaces, ' ',
38 (void *)trace->entries[i]); 55 (void *)entries[i]);
39 56
40 total += generated; 57 total += generated;
41
42 /* Assume that generated isn't a negative number */
43 if (generated >= size) { 58 if (generated >= size) {
44 buf += size; 59 buf += size;
45 size = 0; 60 size = 0;
@@ -51,7 +66,176 @@ int snprint_stack_trace(char *buf, size_t size,
51 66
52 return total; 67 return total;
53} 68}
54EXPORT_SYMBOL_GPL(snprint_stack_trace); 69EXPORT_SYMBOL_GPL(stack_trace_snprint);
70
71#ifdef CONFIG_ARCH_STACKWALK
72
73struct stacktrace_cookie {
74 unsigned long *store;
75 unsigned int size;
76 unsigned int skip;
77 unsigned int len;
78};
79
80static bool stack_trace_consume_entry(void *cookie, unsigned long addr,
81 bool reliable)
82{
83 struct stacktrace_cookie *c = cookie;
84
85 if (c->len >= c->size)
86 return false;
87
88 if (c->skip > 0) {
89 c->skip--;
90 return true;
91 }
92 c->store[c->len++] = addr;
93 return c->len < c->size;
94}
95
96static bool stack_trace_consume_entry_nosched(void *cookie, unsigned long addr,
97 bool reliable)
98{
99 if (in_sched_functions(addr))
100 return true;
101 return stack_trace_consume_entry(cookie, addr, reliable);
102}
103
104/**
105 * stack_trace_save - Save a stack trace into a storage array
106 * @store: Pointer to storage array
107 * @size: Size of the storage array
108 * @skipnr: Number of entries to skip at the start of the stack trace
109 *
110 * Return: Number of trace entries stored.
111 */
112unsigned int stack_trace_save(unsigned long *store, unsigned int size,
113 unsigned int skipnr)
114{
115 stack_trace_consume_fn consume_entry = stack_trace_consume_entry;
116 struct stacktrace_cookie c = {
117 .store = store,
118 .size = size,
119 .skip = skipnr + 1,
120 };
121
122 arch_stack_walk(consume_entry, &c, current, NULL);
123 return c.len;
124}
125EXPORT_SYMBOL_GPL(stack_trace_save);
126
127/**
128 * stack_trace_save_tsk - Save a task stack trace into a storage array
129 * @task: The task to examine
130 * @store: Pointer to storage array
131 * @size: Size of the storage array
132 * @skipnr: Number of entries to skip at the start of the stack trace
133 *
134 * Return: Number of trace entries stored.
135 */
136unsigned int stack_trace_save_tsk(struct task_struct *tsk, unsigned long *store,
137 unsigned int size, unsigned int skipnr)
138{
139 stack_trace_consume_fn consume_entry = stack_trace_consume_entry_nosched;
140 struct stacktrace_cookie c = {
141 .store = store,
142 .size = size,
143 .skip = skipnr + 1,
144 };
145
146 if (!try_get_task_stack(tsk))
147 return 0;
148
149 arch_stack_walk(consume_entry, &c, tsk, NULL);
150 put_task_stack(tsk);
151 return c.len;
152}
153
154/**
155 * stack_trace_save_regs - Save a stack trace based on pt_regs into a storage array
156 * @regs: Pointer to pt_regs to examine
157 * @store: Pointer to storage array
158 * @size: Size of the storage array
159 * @skipnr: Number of entries to skip at the start of the stack trace
160 *
161 * Return: Number of trace entries stored.
162 */
163unsigned int stack_trace_save_regs(struct pt_regs *regs, unsigned long *store,
164 unsigned int size, unsigned int skipnr)
165{
166 stack_trace_consume_fn consume_entry = stack_trace_consume_entry;
167 struct stacktrace_cookie c = {
168 .store = store,
169 .size = size,
170 .skip = skipnr,
171 };
172
173 arch_stack_walk(consume_entry, &c, current, regs);
174 return c.len;
175}
176
177#ifdef CONFIG_HAVE_RELIABLE_STACKTRACE
178/**
179 * stack_trace_save_tsk_reliable - Save task stack with verification
180 * @tsk: Pointer to the task to examine
181 * @store: Pointer to storage array
182 * @size: Size of the storage array
183 *
184 * Return: An error if it detects any unreliable features of the
185 * stack. Otherwise it guarantees that the stack trace is
186 * reliable and returns the number of entries stored.
187 *
188 * If the task is not 'current', the caller *must* ensure the task is inactive.
189 */
190int stack_trace_save_tsk_reliable(struct task_struct *tsk, unsigned long *store,
191 unsigned int size)
192{
193 stack_trace_consume_fn consume_entry = stack_trace_consume_entry;
194 struct stacktrace_cookie c = {
195 .store = store,
196 .size = size,
197 };
198 int ret;
199
200 /*
201 * If the task doesn't have a stack (e.g., a zombie), the stack is
202 * "reliably" empty.
203 */
204 if (!try_get_task_stack(tsk))
205 return 0;
206
207 ret = arch_stack_walk_reliable(consume_entry, &c, tsk);
208 put_task_stack(tsk);
209 return ret;
210}
211#endif
212
213#ifdef CONFIG_USER_STACKTRACE_SUPPORT
214/**
215 * stack_trace_save_user - Save a user space stack trace into a storage array
216 * @store: Pointer to storage array
217 * @size: Size of the storage array
218 *
219 * Return: Number of trace entries stored.
220 */
221unsigned int stack_trace_save_user(unsigned long *store, unsigned int size)
222{
223 stack_trace_consume_fn consume_entry = stack_trace_consume_entry;
224 struct stacktrace_cookie c = {
225 .store = store,
226 .size = size,
227 };
228
229 /* Trace user stack if not a kernel thread */
230 if (!current->mm)
231 return 0;
232
233 arch_stack_walk_user(consume_entry, &c, task_pt_regs(current));
234 return c.len;
235}
236#endif
237
238#else /* CONFIG_ARCH_STACKWALK */
55 239
56/* 240/*
57 * Architectures that do not implement save_stack_trace_*() 241 * Architectures that do not implement save_stack_trace_*()
@@ -77,3 +261,118 @@ save_stack_trace_tsk_reliable(struct task_struct *tsk,
77 WARN_ONCE(1, KERN_INFO "save_stack_tsk_reliable() not implemented yet.\n"); 261 WARN_ONCE(1, KERN_INFO "save_stack_tsk_reliable() not implemented yet.\n");
78 return -ENOSYS; 262 return -ENOSYS;
79} 263}
264
265/**
266 * stack_trace_save - Save a stack trace into a storage array
267 * @store: Pointer to storage array
268 * @size: Size of the storage array
269 * @skipnr: Number of entries to skip at the start of the stack trace
270 *
271 * Return: Number of trace entries stored
272 */
273unsigned int stack_trace_save(unsigned long *store, unsigned int size,
274 unsigned int skipnr)
275{
276 struct stack_trace trace = {
277 .entries = store,
278 .max_entries = size,
279 .skip = skipnr + 1,
280 };
281
282 save_stack_trace(&trace);
283 return trace.nr_entries;
284}
285EXPORT_SYMBOL_GPL(stack_trace_save);
286
287/**
288 * stack_trace_save_tsk - Save a task stack trace into a storage array
289 * @task: The task to examine
290 * @store: Pointer to storage array
291 * @size: Size of the storage array
292 * @skipnr: Number of entries to skip at the start of the stack trace
293 *
294 * Return: Number of trace entries stored
295 */
296unsigned int stack_trace_save_tsk(struct task_struct *task,
297 unsigned long *store, unsigned int size,
298 unsigned int skipnr)
299{
300 struct stack_trace trace = {
301 .entries = store,
302 .max_entries = size,
303 .skip = skipnr + 1,
304 };
305
306 save_stack_trace_tsk(task, &trace);
307 return trace.nr_entries;
308}
309
310/**
311 * stack_trace_save_regs - Save a stack trace based on pt_regs into a storage array
312 * @regs: Pointer to pt_regs to examine
313 * @store: Pointer to storage array
314 * @size: Size of the storage array
315 * @skipnr: Number of entries to skip at the start of the stack trace
316 *
317 * Return: Number of trace entries stored
318 */
319unsigned int stack_trace_save_regs(struct pt_regs *regs, unsigned long *store,
320 unsigned int size, unsigned int skipnr)
321{
322 struct stack_trace trace = {
323 .entries = store,
324 .max_entries = size,
325 .skip = skipnr,
326 };
327
328 save_stack_trace_regs(regs, &trace);
329 return trace.nr_entries;
330}
331
332#ifdef CONFIG_HAVE_RELIABLE_STACKTRACE
333/**
334 * stack_trace_save_tsk_reliable - Save task stack with verification
335 * @tsk: Pointer to the task to examine
336 * @store: Pointer to storage array
337 * @size: Size of the storage array
338 *
339 * Return: An error if it detects any unreliable features of the
340 * stack. Otherwise it guarantees that the stack trace is
341 * reliable and returns the number of entries stored.
342 *
343 * If the task is not 'current', the caller *must* ensure the task is inactive.
344 */
345int stack_trace_save_tsk_reliable(struct task_struct *tsk, unsigned long *store,
346 unsigned int size)
347{
348 struct stack_trace trace = {
349 .entries = store,
350 .max_entries = size,
351 };
352 int ret = save_stack_trace_tsk_reliable(tsk, &trace);
353
354 return ret ? ret : trace.nr_entries;
355}
356#endif
357
358#ifdef CONFIG_USER_STACKTRACE_SUPPORT
359/**
360 * stack_trace_save_user - Save a user space stack trace into a storage array
361 * @store: Pointer to storage array
362 * @size: Size of the storage array
363 *
364 * Return: Number of trace entries stored
365 */
366unsigned int stack_trace_save_user(unsigned long *store, unsigned int size)
367{
368 struct stack_trace trace = {
369 .entries = store,
370 .max_entries = size,
371 };
372
373 save_stack_trace_user(&trace);
374 return trace.nr_entries;
375}
376#endif /* CONFIG_USER_STACKTRACE_SUPPORT */
377
378#endif /* !CONFIG_ARCH_STACKWALK */
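
Both halves of the #ifdef now export the same simplified interface: callers hand over a bare array and get back an entry count instead of filling in a struct stack_trace. A hypothetical caller of the new API (the buffer size and skip count are illustrative):

#include <linux/stacktrace.h>
#include <linux/kernel.h>

static void example_dump_current_stack(void)
{
        unsigned long entries[16];
        unsigned int nr;

        /* skipnr == 0: keep every entry the architecture walker reports */
        nr = stack_trace_save(entries, ARRAY_SIZE(entries), 0);

        /* print each entry with two leading spaces */
        stack_trace_print(entries, nr, 2);
}
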
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 067cb83f37ea..7231fb5953fc 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -513,7 +513,7 @@ repeat:
513 } 513 }
514 preempt_count_dec(); 514 preempt_count_dec();
515 WARN_ONCE(preempt_count(), 515 WARN_ONCE(preempt_count(),
516 "cpu_stop: %pf(%p) leaked preempt count\n", fn, arg); 516 "cpu_stop: %ps(%p) leaked preempt count\n", fn, arg);
517 goto repeat; 517 goto repeat;
518 } 518 }
519} 519}
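
The one-character change above is part of a tree-wide switch from the %pf/%pF printk specifiers to %ps/%pS for printing function pointers (the same conversion appears below in sched_clock.c and timer.c). The lower-case form prints the bare symbol name, the upper-case form adds the offset within the symbol. A small illustrative sketch (the handler and message are hypothetical):

#include <linux/printk.h>

static void example_report_handler(int (*fn)(void *), void *arg)
{
        /* %ps -> "my_handler", %pS -> "my_handler+0x0/0x40" style output */
        pr_warn("handler %ps (%pS) invoked with %p\n", fn, fn, arg);
}
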
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index d21f4befaea4..4d9ae5ea6caf 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -167,9 +167,6 @@ COND_SYSCALL(syslog);
167 167
168/* kernel/sched/core.c */ 168/* kernel/sched/core.c */
169 169
170/* kernel/signal.c */
171COND_SYSCALL(pidfd_send_signal);
172
173/* kernel/sys.c */ 170/* kernel/sys.c */
174COND_SYSCALL(setregid); 171COND_SYSCALL(setregid);
175COND_SYSCALL(setgid); 172COND_SYSCALL(setgid);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index e5da394d1ca3..c9ec050bcf46 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -128,6 +128,7 @@ static int zero;
128static int __maybe_unused one = 1; 128static int __maybe_unused one = 1;
129static int __maybe_unused two = 2; 129static int __maybe_unused two = 2;
130static int __maybe_unused four = 4; 130static int __maybe_unused four = 4;
131static unsigned long zero_ul;
131static unsigned long one_ul = 1; 132static unsigned long one_ul = 1;
132static unsigned long long_max = LONG_MAX; 133static unsigned long long_max = LONG_MAX;
133static int one_hundred = 100; 134static int one_hundred = 100;
@@ -1750,7 +1751,7 @@ static struct ctl_table fs_table[] = {
1750 .maxlen = sizeof(files_stat.max_files), 1751 .maxlen = sizeof(files_stat.max_files),
1751 .mode = 0644, 1752 .mode = 0644,
1752 .proc_handler = proc_doulongvec_minmax, 1753 .proc_handler = proc_doulongvec_minmax,
1753 .extra1 = &zero, 1754 .extra1 = &zero_ul,
1754 .extra2 = &long_max, 1755 .extra2 = &long_max,
1755 }, 1756 },
1756 { 1757 {
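
The new zero_ul exists because proc_doulongvec_minmax() dereferences .extra1/.extra2 as unsigned long, so pointing .extra1 at the int-sized zero hands the handler the wrong kind of object (and on 64-bit it may read past it). A sketch of the patched bounds, reassembled from the hunk; this is the fs.file-max entry, whose procname is not itself visible here:

static unsigned long zero_ul;                   /* unsigned long-sized zero */
static unsigned long long_max = LONG_MAX;

static struct ctl_table example_file_max_entry = {
        .data           = &files_stat.max_files,
        .maxlen         = sizeof(files_stat.max_files),
        .mode           = 0644,
        .proc_handler   = proc_doulongvec_minmax,
        .extra1         = &zero_ul,             /* dereferenced as unsigned long */
        .extra2         = &long_max,
};
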
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 2c97e8c2d29f..0519a8805aab 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -594,7 +594,7 @@ static ktime_t alarm_timer_remaining(struct k_itimer *timr, ktime_t now)
594{ 594{
595 struct alarm *alarm = &timr->it.alarm.alarmtimer; 595 struct alarm *alarm = &timr->it.alarm.alarmtimer;
596 596
597 return ktime_sub(now, alarm->node.expires); 597 return ktime_sub(alarm->node.expires, now);
598} 598}
599 599
600/** 600/**
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 5e77662dd2d9..f5490222e134 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -611,6 +611,22 @@ void clockevents_resume(void)
611} 611}
612 612
613#ifdef CONFIG_HOTPLUG_CPU 613#ifdef CONFIG_HOTPLUG_CPU
614
615# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
616/**
617 * tick_offline_cpu - Take CPU out of the broadcast mechanism
618 * @cpu: The outgoing CPU
619 *
620 * Called on the outgoing CPU after it took itself offline.
621 */
622void tick_offline_cpu(unsigned int cpu)
623{
624 raw_spin_lock(&clockevents_lock);
625 tick_broadcast_offline(cpu);
626 raw_spin_unlock(&clockevents_lock);
627}
628# endif
629
614/** 630/**
615 * tick_cleanup_dead_cpu - Cleanup the tick and clockevents of a dead cpu 631 * tick_cleanup_dead_cpu - Cleanup the tick and clockevents of a dead cpu
616 */ 632 */
@@ -621,8 +637,6 @@ void tick_cleanup_dead_cpu(int cpu)
621 637
622 raw_spin_lock_irqsave(&clockevents_lock, flags); 638 raw_spin_lock_irqsave(&clockevents_lock, flags);
623 639
624 tick_shutdown_broadcast_oneshot(cpu);
625 tick_shutdown_broadcast(cpu);
626 tick_shutdown(cpu); 640 tick_shutdown(cpu);
627 /* 641 /*
628 * Unregister the clock event devices which were 642 * Unregister the clock event devices which were
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index dc1b6f1929f9..d23b434c2ca7 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -63,7 +63,7 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock);
63#if (BITS_PER_LONG < 64) 63#if (BITS_PER_LONG < 64)
64u64 get_jiffies_64(void) 64u64 get_jiffies_64(void)
65{ 65{
66 unsigned long seq; 66 unsigned int seq;
67 u64 ret; 67 u64 ret;
68 68
69 do { 69 do {
@@ -89,7 +89,7 @@ struct clocksource * __init __weak clocksource_default_clock(void)
89 return &clocksource_jiffies; 89 return &clocksource_jiffies;
90} 90}
91 91
92struct clocksource refined_jiffies; 92static struct clocksource refined_jiffies;
93 93
94int register_refined_jiffies(long cycles_per_second) 94int register_refined_jiffies(long cycles_per_second)
95{ 95{
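
Several hunks in this series (here and in sched_clock.c, tick-common.c, tick-sched.c and timekeeping.c below) shrink the local sequence variable from unsigned long to unsigned int; the underlying seqlock/seqcount sequence is an unsigned int, so the wider local bought nothing. For context, the read loop this touches looks like this (body reconstructed around the lines shown in the hunk):

u64 get_jiffies_64(void)
{
        unsigned int seq;       /* matches the width of the seqlock sequence */
        u64 ret;

        do {
                seq = read_seqbegin(&jiffies_lock);
                ret = jiffies_64;
        } while (read_seqretry(&jiffies_lock, seq));

        return ret;
}
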
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 094b82ca95e5..142b07619918 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -94,7 +94,7 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
94unsigned long long notrace sched_clock(void) 94unsigned long long notrace sched_clock(void)
95{ 95{
96 u64 cyc, res; 96 u64 cyc, res;
97 unsigned long seq; 97 unsigned int seq;
98 struct clock_read_data *rd; 98 struct clock_read_data *rd;
99 99
100 do { 100 do {
@@ -231,7 +231,7 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
231 if (irqtime > 0 || (irqtime == -1 && rate >= 1000000)) 231 if (irqtime > 0 || (irqtime == -1 && rate >= 1000000))
232 enable_sched_clock_irqtime(); 232 enable_sched_clock_irqtime();
233 233
234 pr_debug("Registered %pF as sched_clock source\n", read); 234 pr_debug("Registered %pS as sched_clock source\n", read);
235} 235}
236 236
237void __init generic_sched_clock_init(void) 237void __init generic_sched_clock_init(void)
@@ -267,12 +267,12 @@ void __init generic_sched_clock_init(void)
267 */ 267 */
268static u64 notrace suspended_sched_clock_read(void) 268static u64 notrace suspended_sched_clock_read(void)
269{ 269{
270 unsigned long seq = raw_read_seqcount(&cd.seq); 270 unsigned int seq = raw_read_seqcount(&cd.seq);
271 271
272 return cd.read_data[seq & 1].epoch_cyc; 272 return cd.read_data[seq & 1].epoch_cyc;
273} 273}
274 274
275static int sched_clock_suspend(void) 275int sched_clock_suspend(void)
276{ 276{
277 struct clock_read_data *rd = &cd.read_data[0]; 277 struct clock_read_data *rd = &cd.read_data[0];
278 278
@@ -283,7 +283,7 @@ static int sched_clock_suspend(void)
283 return 0; 283 return 0;
284} 284}
285 285
286static void sched_clock_resume(void) 286void sched_clock_resume(void)
287{ 287{
288 struct clock_read_data *rd = &cd.read_data[0]; 288 struct clock_read_data *rd = &cd.read_data[0];
289 289
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index ee834d4fb814..e51778c312f1 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -36,10 +36,16 @@ static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(tick_broadcast_lock);
36static void tick_broadcast_setup_oneshot(struct clock_event_device *bc); 36static void tick_broadcast_setup_oneshot(struct clock_event_device *bc);
37static void tick_broadcast_clear_oneshot(int cpu); 37static void tick_broadcast_clear_oneshot(int cpu);
38static void tick_resume_broadcast_oneshot(struct clock_event_device *bc); 38static void tick_resume_broadcast_oneshot(struct clock_event_device *bc);
39# ifdef CONFIG_HOTPLUG_CPU
40static void tick_broadcast_oneshot_offline(unsigned int cpu);
41# endif
39#else 42#else
40static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); } 43static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); }
41static inline void tick_broadcast_clear_oneshot(int cpu) { } 44static inline void tick_broadcast_clear_oneshot(int cpu) { }
42static inline void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { } 45static inline void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { }
46# ifdef CONFIG_HOTPLUG_CPU
47static inline void tick_broadcast_oneshot_offline(unsigned int cpu) { }
48# endif
43#endif 49#endif
44 50
45/* 51/*
@@ -433,27 +439,29 @@ void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast)
433} 439}
434 440
435#ifdef CONFIG_HOTPLUG_CPU 441#ifdef CONFIG_HOTPLUG_CPU
436/* 442static void tick_shutdown_broadcast(void)
437 * Remove a CPU from broadcasting
438 */
439void tick_shutdown_broadcast(unsigned int cpu)
440{ 443{
441 struct clock_event_device *bc; 444 struct clock_event_device *bc = tick_broadcast_device.evtdev;
442 unsigned long flags;
443
444 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
445
446 bc = tick_broadcast_device.evtdev;
447 cpumask_clear_cpu(cpu, tick_broadcast_mask);
448 cpumask_clear_cpu(cpu, tick_broadcast_on);
449 445
450 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { 446 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) {
451 if (bc && cpumask_empty(tick_broadcast_mask)) 447 if (bc && cpumask_empty(tick_broadcast_mask))
452 clockevents_shutdown(bc); 448 clockevents_shutdown(bc);
453 } 449 }
450}
454 451
455 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); 452/*
453 * Remove a CPU from broadcasting
454 */
455void tick_broadcast_offline(unsigned int cpu)
456{
457 raw_spin_lock(&tick_broadcast_lock);
458 cpumask_clear_cpu(cpu, tick_broadcast_mask);
459 cpumask_clear_cpu(cpu, tick_broadcast_on);
460 tick_broadcast_oneshot_offline(cpu);
461 tick_shutdown_broadcast();
462 raw_spin_unlock(&tick_broadcast_lock);
456} 463}
464
457#endif 465#endif
458 466
459void tick_suspend_broadcast(void) 467void tick_suspend_broadcast(void)
@@ -801,13 +809,13 @@ int __tick_broadcast_oneshot_control(enum tick_broadcast_state state)
801 * either the CPU handling the broadcast 809 * either the CPU handling the broadcast
802 * interrupt or we got woken by something else. 810 * interrupt or we got woken by something else.
803 * 811 *
804 * We are not longer in the broadcast mask, so 812 * We are no longer in the broadcast mask, so
805 * if the cpu local expiry time is already 813 * if the cpu local expiry time is already
806 * reached, we would reprogram the cpu local 814 * reached, we would reprogram the cpu local
807 * timer with an already expired event. 815 * timer with an already expired event.
808 * 816 *
809 * This can lead to a ping-pong when we return 817 * This can lead to a ping-pong when we return
810 * to idle and therefor rearm the broadcast 818 * to idle and therefore rearm the broadcast
811 * timer before the cpu local timer was able 819 * timer before the cpu local timer was able
812 * to fire. This happens because the forced 820 * to fire. This happens because the forced
813 * reprogramming makes sure that the event 821 * reprogramming makes sure that the event
@@ -950,14 +958,10 @@ void hotplug_cpu__broadcast_tick_pull(int deadcpu)
950} 958}
951 959
952/* 960/*
953 * Remove a dead CPU from broadcasting 961 * Remove a dying CPU from broadcasting
954 */ 962 */
955void tick_shutdown_broadcast_oneshot(unsigned int cpu) 963static void tick_broadcast_oneshot_offline(unsigned int cpu)
956{ 964{
957 unsigned long flags;
958
959 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
960
961 /* 965 /*
962 * Clear the broadcast masks for the dead cpu, but do not stop 966 * Clear the broadcast masks for the dead cpu, but do not stop
963 * the broadcast device! 967 * the broadcast device!
@@ -965,8 +969,6 @@ void tick_shutdown_broadcast_oneshot(unsigned int cpu)
965 cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); 969 cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask);
966 cpumask_clear_cpu(cpu, tick_broadcast_pending_mask); 970 cpumask_clear_cpu(cpu, tick_broadcast_pending_mask);
967 cpumask_clear_cpu(cpu, tick_broadcast_force_mask); 971 cpumask_clear_cpu(cpu, tick_broadcast_force_mask);
968
969 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
970} 972}
971#endif 973#endif
972 974
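
Taken together with the clockevents.c hunk above, CPU hotplug teardown now funnels through a single broadcast-offline entry point that runs on the outgoing CPU with interrupts already disabled, which is why the plain raw_spin_lock() variants suffice and the irqsave/irqrestore pairs could be dropped. Reassembled from the hunks, the resulting path is:

/* Outgoing CPU, interrupts disabled */
void tick_offline_cpu(unsigned int cpu)
{
        raw_spin_lock(&clockevents_lock);
        tick_broadcast_offline(cpu);
        raw_spin_unlock(&clockevents_lock);
}

void tick_broadcast_offline(unsigned int cpu)
{
        raw_spin_lock(&tick_broadcast_lock);
        cpumask_clear_cpu(cpu, tick_broadcast_mask);
        cpumask_clear_cpu(cpu, tick_broadcast_on);
        tick_broadcast_oneshot_offline(cpu);    /* clear the oneshot masks */
        tick_shutdown_broadcast();              /* stop the device if the periodic mask is empty */
        raw_spin_unlock(&tick_broadcast_lock);
}
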
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 529143b4c8d2..59225b484e4e 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -46,6 +46,14 @@ ktime_t tick_period;
46 * procedure also covers cpu hotplug. 46 * procedure also covers cpu hotplug.
47 */ 47 */
48int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; 48int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
49#ifdef CONFIG_NO_HZ_FULL
50/*
51 * tick_do_timer_boot_cpu indicates the boot CPU temporarily owns
52 * tick_do_timer_cpu and it should be taken over by an eligible secondary
53 * when one comes online.
54 */
55static int tick_do_timer_boot_cpu __read_mostly = -1;
56#endif
49 57
50/* 58/*
51 * Debugging: see timer_list.c 59 * Debugging: see timer_list.c
@@ -149,7 +157,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
149 !tick_broadcast_oneshot_active()) { 157 !tick_broadcast_oneshot_active()) {
150 clockevents_switch_state(dev, CLOCK_EVT_STATE_PERIODIC); 158 clockevents_switch_state(dev, CLOCK_EVT_STATE_PERIODIC);
151 } else { 159 } else {
152 unsigned long seq; 160 unsigned int seq;
153 ktime_t next; 161 ktime_t next;
154 162
155 do { 163 do {
@@ -167,6 +175,26 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
167 } 175 }
168} 176}
169 177
178#ifdef CONFIG_NO_HZ_FULL
179static void giveup_do_timer(void *info)
180{
181 int cpu = *(unsigned int *)info;
182
183 WARN_ON(tick_do_timer_cpu != smp_processor_id());
184
185 tick_do_timer_cpu = cpu;
186}
187
188static void tick_take_do_timer_from_boot(void)
189{
190 int cpu = smp_processor_id();
191 int from = tick_do_timer_boot_cpu;
192
193 if (from >= 0 && from != cpu)
194 smp_call_function_single(from, giveup_do_timer, &cpu, 1);
195}
196#endif
197
170/* 198/*
171 * Setup the tick device 199 * Setup the tick device
172 */ 200 */
@@ -186,12 +214,26 @@ static void tick_setup_device(struct tick_device *td,
186 * this cpu: 214 * this cpu:
187 */ 215 */
188 if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) { 216 if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) {
189 if (!tick_nohz_full_cpu(cpu)) 217 tick_do_timer_cpu = cpu;
190 tick_do_timer_cpu = cpu; 218
191 else
192 tick_do_timer_cpu = TICK_DO_TIMER_NONE;
193 tick_next_period = ktime_get(); 219 tick_next_period = ktime_get();
194 tick_period = NSEC_PER_SEC / HZ; 220 tick_period = NSEC_PER_SEC / HZ;
221#ifdef CONFIG_NO_HZ_FULL
222 /*
223 * The boot CPU may be nohz_full, in which case set
224 * tick_do_timer_boot_cpu so the first housekeeping
225 * secondary that comes up will take do_timer from
226 * us.
227 */
228 if (tick_nohz_full_cpu(cpu))
229 tick_do_timer_boot_cpu = cpu;
230
231 } else if (tick_do_timer_boot_cpu != -1 &&
232 !tick_nohz_full_cpu(cpu)) {
233 tick_take_do_timer_from_boot();
234 tick_do_timer_boot_cpu = -1;
235 WARN_ON(tick_do_timer_cpu != cpu);
236#endif
195 } 237 }
196 238
197 /* 239 /*
@@ -487,6 +529,7 @@ void tick_freeze(void)
487 trace_suspend_resume(TPS("timekeeping_freeze"), 529 trace_suspend_resume(TPS("timekeeping_freeze"),
488 smp_processor_id(), true); 530 smp_processor_id(), true);
489 system_state = SYSTEM_SUSPEND; 531 system_state = SYSTEM_SUSPEND;
532 sched_clock_suspend();
490 timekeeping_suspend(); 533 timekeeping_suspend();
491 } else { 534 } else {
492 tick_suspend_local(); 535 tick_suspend_local();
@@ -510,6 +553,7 @@ void tick_unfreeze(void)
510 553
511 if (tick_freeze_depth == num_online_cpus()) { 554 if (tick_freeze_depth == num_online_cpus()) {
512 timekeeping_resume(); 555 timekeeping_resume();
556 sched_clock_resume();
513 system_state = SYSTEM_RUNNING; 557 system_state = SYSTEM_RUNNING;
514 trace_suspend_resume(TPS("timekeeping_freeze"), 558 trace_suspend_resume(TPS("timekeeping_freeze"),
515 smp_processor_id(), false); 559 smp_processor_id(), false);
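
The interesting part of the tick-common.c changes is the do_timer handoff for a nohz_full boot CPU: instead of parking tick_do_timer_cpu at TICK_DO_TIMER_NONE, the boot CPU keeps the duty and the first housekeeping secondary that comes online pulls it over with a synchronous IPI. Condensed from the hunk:

static void giveup_do_timer(void *info)
{
        int cpu = *(unsigned int *)info;

        /* runs on the boot CPU via the IPI below */
        WARN_ON(tick_do_timer_cpu != smp_processor_id());
        tick_do_timer_cpu = cpu;
}

static void tick_take_do_timer_from_boot(void)
{
        int cpu  = smp_processor_id();
        int from = tick_do_timer_boot_cpu;

        /* wait == 1: the duty has moved by the time this returns */
        if (from >= 0 && from != cpu)
                smp_call_function_single(from, giveup_do_timer, &cpu, 1);
}
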
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index e277284c2831..7b2496136729 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -64,7 +64,6 @@ extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt);
64extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); 64extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu);
65extern void tick_install_broadcast_device(struct clock_event_device *dev); 65extern void tick_install_broadcast_device(struct clock_event_device *dev);
66extern int tick_is_broadcast_device(struct clock_event_device *dev); 66extern int tick_is_broadcast_device(struct clock_event_device *dev);
67extern void tick_shutdown_broadcast(unsigned int cpu);
68extern void tick_suspend_broadcast(void); 67extern void tick_suspend_broadcast(void);
69extern void tick_resume_broadcast(void); 68extern void tick_resume_broadcast(void);
70extern bool tick_resume_check_broadcast(void); 69extern bool tick_resume_check_broadcast(void);
@@ -78,7 +77,6 @@ static inline void tick_install_broadcast_device(struct clock_event_device *dev)
78static inline int tick_is_broadcast_device(struct clock_event_device *dev) { return 0; } 77static inline int tick_is_broadcast_device(struct clock_event_device *dev) { return 0; }
79static inline int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) { return 0; } 78static inline int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) { return 0; }
80static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { } 79static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { }
81static inline void tick_shutdown_broadcast(unsigned int cpu) { }
82static inline void tick_suspend_broadcast(void) { } 80static inline void tick_suspend_broadcast(void) { }
83static inline void tick_resume_broadcast(void) { } 81static inline void tick_resume_broadcast(void) { }
84static inline bool tick_resume_check_broadcast(void) { return false; } 82static inline bool tick_resume_check_broadcast(void) { return false; }
@@ -128,19 +126,23 @@ static inline int tick_check_oneshot_change(int allow_nohz) { return 0; }
128/* Functions related to oneshot broadcasting */ 126/* Functions related to oneshot broadcasting */
129#if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT) 127#if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT)
130extern void tick_broadcast_switch_to_oneshot(void); 128extern void tick_broadcast_switch_to_oneshot(void);
131extern void tick_shutdown_broadcast_oneshot(unsigned int cpu);
132extern int tick_broadcast_oneshot_active(void); 129extern int tick_broadcast_oneshot_active(void);
133extern void tick_check_oneshot_broadcast_this_cpu(void); 130extern void tick_check_oneshot_broadcast_this_cpu(void);
134bool tick_broadcast_oneshot_available(void); 131bool tick_broadcast_oneshot_available(void);
135extern struct cpumask *tick_get_broadcast_oneshot_mask(void); 132extern struct cpumask *tick_get_broadcast_oneshot_mask(void);
136#else /* !(BROADCAST && ONESHOT): */ 133#else /* !(BROADCAST && ONESHOT): */
137static inline void tick_broadcast_switch_to_oneshot(void) { } 134static inline void tick_broadcast_switch_to_oneshot(void) { }
138static inline void tick_shutdown_broadcast_oneshot(unsigned int cpu) { }
139static inline int tick_broadcast_oneshot_active(void) { return 0; } 135static inline int tick_broadcast_oneshot_active(void) { return 0; }
140static inline void tick_check_oneshot_broadcast_this_cpu(void) { } 136static inline void tick_check_oneshot_broadcast_this_cpu(void) { }
141static inline bool tick_broadcast_oneshot_available(void) { return tick_oneshot_possible(); } 137static inline bool tick_broadcast_oneshot_available(void) { return tick_oneshot_possible(); }
142#endif /* !(BROADCAST && ONESHOT) */ 138#endif /* !(BROADCAST && ONESHOT) */
143 139
140#if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_HOTPLUG_CPU)
141extern void tick_broadcast_offline(unsigned int cpu);
142#else
143static inline void tick_broadcast_offline(unsigned int cpu) { }
144#endif
145
144/* NO_HZ_FULL internal */ 146/* NO_HZ_FULL internal */
145#ifdef CONFIG_NO_HZ_FULL 147#ifdef CONFIG_NO_HZ_FULL
146extern void tick_nohz_init(void); 148extern void tick_nohz_init(void);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 6fa52cd6df0b..f4ee1a3428ae 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -121,10 +121,16 @@ static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)
121 * into a long sleep. If two CPUs happen to assign themselves to 121 * into a long sleep. If two CPUs happen to assign themselves to
122 * this duty, then the jiffies update is still serialized by 122 * this duty, then the jiffies update is still serialized by
123 * jiffies_lock. 123 * jiffies_lock.
124 *
 125 * If nohz_full is enabled, this should not happen because the
 126 * tick_do_timer_cpu owner never relinquishes the duty.
124 */ 127 */
125 if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE) 128 if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) {
126 && !tick_nohz_full_cpu(cpu)) 129#ifdef CONFIG_NO_HZ_FULL
130 WARN_ON(tick_nohz_full_running);
131#endif
127 tick_do_timer_cpu = cpu; 132 tick_do_timer_cpu = cpu;
133 }
128#endif 134#endif
129 135
130 /* Check, if the jiffies need an update */ 136 /* Check, if the jiffies need an update */
@@ -395,8 +401,8 @@ void __init tick_nohz_full_setup(cpumask_var_t cpumask)
395static int tick_nohz_cpu_down(unsigned int cpu) 401static int tick_nohz_cpu_down(unsigned int cpu)
396{ 402{
397 /* 403 /*
398 * The boot CPU handles housekeeping duty (unbound timers, 404 * The tick_do_timer_cpu CPU handles housekeeping duty (unbound
399 * workqueues, timekeeping, ...) on behalf of full dynticks 405 * timers, workqueues, timekeeping, ...) on behalf of full dynticks
400 * CPUs. It must remain online when nohz full is enabled. 406 * CPUs. It must remain online when nohz full is enabled.
401 */ 407 */
402 if (tick_nohz_full_running && tick_do_timer_cpu == cpu) 408 if (tick_nohz_full_running && tick_do_timer_cpu == cpu)
@@ -423,12 +429,15 @@ void __init tick_nohz_init(void)
423 return; 429 return;
424 } 430 }
425 431
426 cpu = smp_processor_id(); 432 if (IS_ENABLED(CONFIG_PM_SLEEP_SMP) &&
433 !IS_ENABLED(CONFIG_PM_SLEEP_SMP_NONZERO_CPU)) {
434 cpu = smp_processor_id();
427 435
428 if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { 436 if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) {
429 pr_warn("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", 437 pr_warn("NO_HZ: Clearing %d from nohz_full range "
430 cpu); 438 "for timekeeping\n", cpu);
431 cpumask_clear_cpu(cpu, tick_nohz_full_mask); 439 cpumask_clear_cpu(cpu, tick_nohz_full_mask);
440 }
432 } 441 }
433 442
434 for_each_cpu(cpu, tick_nohz_full_mask) 443 for_each_cpu(cpu, tick_nohz_full_mask)
@@ -645,7 +654,8 @@ static inline bool local_timer_softirq_pending(void)
645static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) 654static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
646{ 655{
647 u64 basemono, next_tick, next_tmr, next_rcu, delta, expires; 656 u64 basemono, next_tick, next_tmr, next_rcu, delta, expires;
648 unsigned long seq, basejiff; 657 unsigned long basejiff;
658 unsigned int seq;
649 659
650 /* Read jiffies and the time when jiffies were updated last */ 660 /* Read jiffies and the time when jiffies were updated last */
651 do { 661 do {
@@ -904,8 +914,13 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
904 /* 914 /*
905 * Boot safety: make sure the timekeeping duty has been 915 * Boot safety: make sure the timekeeping duty has been
906 * assigned before entering dyntick-idle mode, 916 * assigned before entering dyntick-idle mode,
 917 * i.e. while tick_do_timer_cpu is still TICK_DO_TIMER_BOOT.
907 */ 918 */
908 if (tick_do_timer_cpu == TICK_DO_TIMER_NONE) 919 if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_BOOT))
920 return false;
921
922 /* Should not happen for nohz-full */
923 if (WARN_ON_ONCE(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
909 return false; 924 return false;
910 } 925 }
911 926
@@ -1023,6 +1038,18 @@ bool tick_nohz_idle_got_tick(void)
1023} 1038}
1024 1039
1025/** 1040/**
1041 * tick_nohz_get_next_hrtimer - return the next expiration time for the hrtimer
 1042 * or the tick, whichever expires first. Note that, if the tick has been
1043 * stopped, it returns the next hrtimer.
1044 *
1045 * Called from power state control code with interrupts disabled
1046 */
1047ktime_t tick_nohz_get_next_hrtimer(void)
1048{
1049 return __this_cpu_read(tick_cpu_device.evtdev)->next_event;
1050}
1051
1052/**
1026 * tick_nohz_get_sleep_length - return the expected length of the current sleep 1053 * tick_nohz_get_sleep_length - return the expected length of the current sleep
1027 * @delta_next: duration until the next event if the tick cannot be stopped 1054 * @delta_next: duration until the next event if the tick cannot be stopped
1028 * 1055 *
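
The new tick_nohz_get_next_hrtimer() simply reports the per-CPU tick device's next programmed event, which after the tick has been stopped is the next hrtimer rather than the periodic tick. A hypothetical caller on the idle/power-management side (the helper name and the delta computation are ours, not part of the patch):

static s64 example_ns_until_next_event(void)
{
        /* must be called with interrupts disabled, per the comment above */
        ktime_t next = tick_nohz_get_next_hrtimer();

        return ktime_to_ns(ktime_sub(next, ktime_get()));
}
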
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
index 6de959a854b2..4fb06527cf64 100644
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -24,12 +24,19 @@ enum tick_nohz_mode {
24 * struct tick_sched - sched tick emulation and no idle tick control/stats 24 * struct tick_sched - sched tick emulation and no idle tick control/stats
25 * @sched_timer: hrtimer to schedule the periodic tick in high 25 * @sched_timer: hrtimer to schedule the periodic tick in high
26 * resolution mode 26 * resolution mode
27 * @check_clocks: Notification mechanism about clocksource changes
28 * @nohz_mode: Mode - one state of tick_nohz_mode
29 * @inidle: Indicator that the CPU is in the tick idle mode
30 * @tick_stopped: Indicator that the idle tick has been stopped
31 * @idle_active: Indicator that the CPU is actively in the tick idle mode;
 32 * it is reset during irq handling phases.
33 * @do_timer_lst: CPU was the last one doing do_timer before going idle
34 * @got_idle_tick: Tick timer function has run with @inidle set
27 * @last_tick: Store the last tick expiry time when the tick 35 * @last_tick: Store the last tick expiry time when the tick
28 * timer is modified for nohz sleeps. This is necessary 36 * timer is modified for nohz sleeps. This is necessary
29 * to resume the tick timer operation in the timeline 37 * to resume the tick timer operation in the timeline
30 * when the CPU returns from nohz sleep. 38 * when the CPU returns from nohz sleep.
31 * @next_tick: Next tick to be fired when in dynticks mode. 39 * @next_tick: Next tick to be fired when in dynticks mode.
32 * @tick_stopped: Indicator that the idle tick has been stopped
33 * @idle_jiffies: jiffies at the entry to idle for idle time accounting 40 * @idle_jiffies: jiffies at the entry to idle for idle time accounting
34 * @idle_calls: Total number of idle calls 41 * @idle_calls: Total number of idle calls
35 * @idle_sleeps: Number of idle calls, where the sched tick was stopped 42 * @idle_sleeps: Number of idle calls, where the sched tick was stopped
@@ -40,8 +47,8 @@ enum tick_nohz_mode {
40 * @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding 47 * @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding
41 * @timer_expires: Anticipated timer expiration time (in case sched tick is stopped) 48 * @timer_expires: Anticipated timer expiration time (in case sched tick is stopped)
42 * @timer_expires_base: Base time clock monotonic for @timer_expires 49 * @timer_expires_base: Base time clock monotonic for @timer_expires
43 * @do_timer_lst: CPU was the last one doing do_timer before going idle 50 * @next_timer: Expiry time of next expiring timer for debugging purpose only
44 * @got_idle_tick: Tick timer function has run with @inidle set 51 * @tick_dep_mask: Tick dependency mask - is set, if someone needs the tick
45 */ 52 */
46struct tick_sched { 53struct tick_sched {
47 struct hrtimer sched_timer; 54 struct hrtimer sched_timer;
diff --git a/kernel/time/time.c b/kernel/time/time.c
index c3f756f8534b..86656bbac232 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -171,7 +171,7 @@ int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz
171 static int firsttime = 1; 171 static int firsttime = 1;
172 int error = 0; 172 int error = 0;
173 173
174 if (tv && !timespec64_valid(tv)) 174 if (tv && !timespec64_valid_settod(tv))
175 return -EINVAL; 175 return -EINVAL;
176 176
177 error = security_settime64(tv, tz); 177 error = security_settime64(tv, tz);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index f986e1918d12..5716e28bfa3c 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -720,7 +720,7 @@ static void timekeeping_forward_now(struct timekeeper *tk)
720void ktime_get_real_ts64(struct timespec64 *ts) 720void ktime_get_real_ts64(struct timespec64 *ts)
721{ 721{
722 struct timekeeper *tk = &tk_core.timekeeper; 722 struct timekeeper *tk = &tk_core.timekeeper;
723 unsigned long seq; 723 unsigned int seq;
724 u64 nsecs; 724 u64 nsecs;
725 725
726 WARN_ON(timekeeping_suspended); 726 WARN_ON(timekeeping_suspended);
@@ -829,7 +829,7 @@ EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset);
829ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs) 829ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs)
830{ 830{
831 ktime_t *offset = offsets[offs]; 831 ktime_t *offset = offsets[offs];
832 unsigned long seq; 832 unsigned int seq;
833 ktime_t tconv; 833 ktime_t tconv;
834 834
835 do { 835 do {
@@ -960,7 +960,7 @@ time64_t __ktime_get_real_seconds(void)
960void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) 960void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot)
961{ 961{
962 struct timekeeper *tk = &tk_core.timekeeper; 962 struct timekeeper *tk = &tk_core.timekeeper;
963 unsigned long seq; 963 unsigned int seq;
964 ktime_t base_raw; 964 ktime_t base_raw;
965 ktime_t base_real; 965 ktime_t base_real;
966 u64 nsec_raw; 966 u64 nsec_raw;
@@ -1122,7 +1122,7 @@ int get_device_system_crosststamp(int (*get_time_fn)
1122 ktime_t base_real, base_raw; 1122 ktime_t base_real, base_raw;
1123 u64 nsec_real, nsec_raw; 1123 u64 nsec_real, nsec_raw;
1124 u8 cs_was_changed_seq; 1124 u8 cs_was_changed_seq;
1125 unsigned long seq; 1125 unsigned int seq;
1126 bool do_interp; 1126 bool do_interp;
1127 int ret; 1127 int ret;
1128 1128
@@ -1221,7 +1221,7 @@ int do_settimeofday64(const struct timespec64 *ts)
1221 unsigned long flags; 1221 unsigned long flags;
1222 int ret = 0; 1222 int ret = 0;
1223 1223
1224 if (!timespec64_valid_strict(ts)) 1224 if (!timespec64_valid_settod(ts))
1225 return -EINVAL; 1225 return -EINVAL;
1226 1226
1227 raw_spin_lock_irqsave(&timekeeper_lock, flags); 1227 raw_spin_lock_irqsave(&timekeeper_lock, flags);
@@ -1278,7 +1278,7 @@ static int timekeeping_inject_offset(const struct timespec64 *ts)
1278 /* Make sure the proposed value is valid */ 1278 /* Make sure the proposed value is valid */
1279 tmp = timespec64_add(tk_xtime(tk), *ts); 1279 tmp = timespec64_add(tk_xtime(tk), *ts);
1280 if (timespec64_compare(&tk->wall_to_monotonic, ts) > 0 || 1280 if (timespec64_compare(&tk->wall_to_monotonic, ts) > 0 ||
1281 !timespec64_valid_strict(&tmp)) { 1281 !timespec64_valid_settod(&tmp)) {
1282 ret = -EINVAL; 1282 ret = -EINVAL;
1283 goto error; 1283 goto error;
1284 } 1284 }
@@ -1409,7 +1409,7 @@ int timekeeping_notify(struct clocksource *clock)
1409void ktime_get_raw_ts64(struct timespec64 *ts) 1409void ktime_get_raw_ts64(struct timespec64 *ts)
1410{ 1410{
1411 struct timekeeper *tk = &tk_core.timekeeper; 1411 struct timekeeper *tk = &tk_core.timekeeper;
1412 unsigned long seq; 1412 unsigned int seq;
1413 u64 nsecs; 1413 u64 nsecs;
1414 1414
1415 do { 1415 do {
@@ -1431,7 +1431,7 @@ EXPORT_SYMBOL(ktime_get_raw_ts64);
1431int timekeeping_valid_for_hres(void) 1431int timekeeping_valid_for_hres(void)
1432{ 1432{
1433 struct timekeeper *tk = &tk_core.timekeeper; 1433 struct timekeeper *tk = &tk_core.timekeeper;
1434 unsigned long seq; 1434 unsigned int seq;
1435 int ret; 1435 int ret;
1436 1436
1437 do { 1437 do {
@@ -1450,7 +1450,7 @@ int timekeeping_valid_for_hres(void)
1450u64 timekeeping_max_deferment(void) 1450u64 timekeeping_max_deferment(void)
1451{ 1451{
1452 struct timekeeper *tk = &tk_core.timekeeper; 1452 struct timekeeper *tk = &tk_core.timekeeper;
1453 unsigned long seq; 1453 unsigned int seq;
1454 u64 ret; 1454 u64 ret;
1455 1455
1456 do { 1456 do {
@@ -1527,7 +1527,7 @@ void __init timekeeping_init(void)
1527 unsigned long flags; 1527 unsigned long flags;
1528 1528
1529 read_persistent_wall_and_boot_offset(&wall_time, &boot_offset); 1529 read_persistent_wall_and_boot_offset(&wall_time, &boot_offset);
1530 if (timespec64_valid_strict(&wall_time) && 1530 if (timespec64_valid_settod(&wall_time) &&
1531 timespec64_to_ns(&wall_time) > 0) { 1531 timespec64_to_ns(&wall_time) > 0) {
1532 persistent_clock_exists = true; 1532 persistent_clock_exists = true;
1533 } else if (timespec64_to_ns(&wall_time) != 0) { 1533 } else if (timespec64_to_ns(&wall_time) != 0) {
@@ -2150,7 +2150,7 @@ EXPORT_SYMBOL_GPL(getboottime64);
2150void ktime_get_coarse_real_ts64(struct timespec64 *ts) 2150void ktime_get_coarse_real_ts64(struct timespec64 *ts)
2151{ 2151{
2152 struct timekeeper *tk = &tk_core.timekeeper; 2152 struct timekeeper *tk = &tk_core.timekeeper;
2153 unsigned long seq; 2153 unsigned int seq;
2154 2154
2155 do { 2155 do {
2156 seq = read_seqcount_begin(&tk_core.seq); 2156 seq = read_seqcount_begin(&tk_core.seq);
@@ -2164,7 +2164,7 @@ void ktime_get_coarse_ts64(struct timespec64 *ts)
2164{ 2164{
2165 struct timekeeper *tk = &tk_core.timekeeper; 2165 struct timekeeper *tk = &tk_core.timekeeper;
2166 struct timespec64 now, mono; 2166 struct timespec64 now, mono;
2167 unsigned long seq; 2167 unsigned int seq;
2168 2168
2169 do { 2169 do {
2170 seq = read_seqcount_begin(&tk_core.seq); 2170 seq = read_seqcount_begin(&tk_core.seq);
diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
index 7a9b4eb7a1d5..141ab3ab0354 100644
--- a/kernel/time/timekeeping.h
+++ b/kernel/time/timekeeping.h
@@ -14,6 +14,13 @@ extern u64 timekeeping_max_deferment(void);
14extern void timekeeping_warp_clock(void); 14extern void timekeeping_warp_clock(void);
15extern int timekeeping_suspend(void); 15extern int timekeeping_suspend(void);
16extern void timekeeping_resume(void); 16extern void timekeeping_resume(void);
17#ifdef CONFIG_GENERIC_SCHED_CLOCK
18extern int sched_clock_suspend(void);
19extern void sched_clock_resume(void);
20#else
21static inline int sched_clock_suspend(void) { return 0; }
22static inline void sched_clock_resume(void) { }
23#endif
17 24
18extern void do_timer(unsigned long ticks); 25extern void do_timer(unsigned long ticks);
19extern void update_wall_time(void); 26extern void update_wall_time(void);
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 2fce056f8a49..343c7ba33b1c 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -536,6 +536,8 @@ static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
536 hlist_add_head(&timer->entry, base->vectors + idx); 536 hlist_add_head(&timer->entry, base->vectors + idx);
537 __set_bit(idx, base->pending_map); 537 __set_bit(idx, base->pending_map);
538 timer_set_idx(timer, idx); 538 timer_set_idx(timer, idx);
539
540 trace_timer_start(timer, timer->expires, timer->flags);
539} 541}
540 542
541static void 543static void
@@ -757,13 +759,6 @@ static inline void debug_init(struct timer_list *timer)
757 trace_timer_init(timer); 759 trace_timer_init(timer);
758} 760}
759 761
760static inline void
761debug_activate(struct timer_list *timer, unsigned long expires)
762{
763 debug_timer_activate(timer);
764 trace_timer_start(timer, expires, timer->flags);
765}
766
767static inline void debug_deactivate(struct timer_list *timer) 762static inline void debug_deactivate(struct timer_list *timer)
768{ 763{
769 debug_timer_deactivate(timer); 764 debug_timer_deactivate(timer);
@@ -1037,7 +1032,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option
1037 } 1032 }
1038 } 1033 }
1039 1034
1040 debug_activate(timer, expires); 1035 debug_timer_activate(timer);
1041 1036
1042 timer->expires = expires; 1037 timer->expires = expires;
1043 /* 1038 /*
@@ -1171,7 +1166,7 @@ void add_timer_on(struct timer_list *timer, int cpu)
1171 } 1166 }
1172 forward_timer_base(base); 1167 forward_timer_base(base);
1173 1168
1174 debug_activate(timer, timer->expires); 1169 debug_timer_activate(timer);
1175 internal_add_timer(base, timer); 1170 internal_add_timer(base, timer);
1176 raw_spin_unlock_irqrestore(&base->lock, flags); 1171 raw_spin_unlock_irqrestore(&base->lock, flags);
1177} 1172}
@@ -1298,7 +1293,9 @@ int del_timer_sync(struct timer_list *timer)
1298EXPORT_SYMBOL(del_timer_sync); 1293EXPORT_SYMBOL(del_timer_sync);
1299#endif 1294#endif
1300 1295
1301static void call_timer_fn(struct timer_list *timer, void (*fn)(struct timer_list *)) 1296static void call_timer_fn(struct timer_list *timer,
1297 void (*fn)(struct timer_list *),
1298 unsigned long baseclk)
1302{ 1299{
1303 int count = preempt_count(); 1300 int count = preempt_count();
1304 1301
@@ -1321,14 +1318,14 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(struct timer_list
1321 */ 1318 */
1322 lock_map_acquire(&lockdep_map); 1319 lock_map_acquire(&lockdep_map);
1323 1320
1324 trace_timer_expire_entry(timer); 1321 trace_timer_expire_entry(timer, baseclk);
1325 fn(timer); 1322 fn(timer);
1326 trace_timer_expire_exit(timer); 1323 trace_timer_expire_exit(timer);
1327 1324
1328 lock_map_release(&lockdep_map); 1325 lock_map_release(&lockdep_map);
1329 1326
1330 if (count != preempt_count()) { 1327 if (count != preempt_count()) {
1331 WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n", 1328 WARN_ONCE(1, "timer: %pS preempt leak: %08x -> %08x\n",
1332 fn, count, preempt_count()); 1329 fn, count, preempt_count());
1333 /* 1330 /*
1334 * Restore the preempt count. That gives us a decent 1331 * Restore the preempt count. That gives us a decent
@@ -1342,6 +1339,13 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(struct timer_list
1342 1339
1343static void expire_timers(struct timer_base *base, struct hlist_head *head) 1340static void expire_timers(struct timer_base *base, struct hlist_head *head)
1344{ 1341{
1342 /*
1343 * This value is required only for tracing. base->clk was
1344 * incremented directly before expire_timers was called. But expiry
1345 * is related to the old base->clk value.
1346 */
1347 unsigned long baseclk = base->clk - 1;
1348
1345 while (!hlist_empty(head)) { 1349 while (!hlist_empty(head)) {
1346 struct timer_list *timer; 1350 struct timer_list *timer;
1347 void (*fn)(struct timer_list *); 1351 void (*fn)(struct timer_list *);
@@ -1355,11 +1359,11 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head)
1355 1359
1356 if (timer->flags & TIMER_IRQSAFE) { 1360 if (timer->flags & TIMER_IRQSAFE) {
1357 raw_spin_unlock(&base->lock); 1361 raw_spin_unlock(&base->lock);
1358 call_timer_fn(timer, fn); 1362 call_timer_fn(timer, fn, baseclk);
1359 raw_spin_lock(&base->lock); 1363 raw_spin_lock(&base->lock);
1360 } else { 1364 } else {
1361 raw_spin_unlock_irq(&base->lock); 1365 raw_spin_unlock_irq(&base->lock);
1362 call_timer_fn(timer, fn); 1366 call_timer_fn(timer, fn, baseclk);
1363 raw_spin_lock_irq(&base->lock); 1367 raw_spin_lock_irq(&base->lock);
1364 } 1368 }
1365 } 1369 }
diff --git a/kernel/torture.c b/kernel/torture.c
index 8faa1a9aaeb9..17b2be9bde12 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -88,6 +88,8 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes,
88 88
89 if (!cpu_online(cpu) || !cpu_is_hotpluggable(cpu)) 89 if (!cpu_online(cpu) || !cpu_is_hotpluggable(cpu))
90 return false; 90 return false;
91 if (num_online_cpus() <= 1)
92 return false; /* Can't offline the last CPU. */
91 93
92 if (verbose > 1) 94 if (verbose > 1)
93 pr_alert("%s" TORTURE_FLAG 95 pr_alert("%s" TORTURE_FLAG
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index d64c00afceb5..94b0e37d90ef 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -14,6 +14,8 @@
14#include <linux/syscalls.h> 14#include <linux/syscalls.h>
15#include <linux/error-injection.h> 15#include <linux/error-injection.h>
16 16
17#include <asm/tlb.h>
18
17#include "trace_probe.h" 19#include "trace_probe.h"
18#include "trace.h" 20#include "trace.h"
19 21
@@ -163,6 +165,10 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src,
163 * access_ok() should prevent writing to non-user memory, but in 165 * access_ok() should prevent writing to non-user memory, but in
164 * some situations (nommu, temporary switch, etc) access_ok() does 166 * some situations (nommu, temporary switch, etc) access_ok() does
165 * not provide enough validation, hence the check on KERNEL_DS. 167 * not provide enough validation, hence the check on KERNEL_DS.
168 *
169 * nmi_uaccess_okay() ensures the probe is not run in an interim
170 * state, when the task or mm are switched. This is specifically
171 * required to prevent the use of temporary mm.
166 */ 172 */
167 173
168 if (unlikely(in_interrupt() || 174 if (unlikely(in_interrupt() ||
@@ -170,6 +176,8 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src,
170 return -EPERM; 176 return -EPERM;
171 if (unlikely(uaccess_kernel())) 177 if (unlikely(uaccess_kernel()))
172 return -EPERM; 178 return -EPERM;
179 if (unlikely(!nmi_uaccess_okay()))
180 return -EPERM;
173 if (!access_ok(unsafe_ptr, size)) 181 if (!access_ok(unsafe_ptr, size))
174 return -EPERM; 182 return -EPERM;
175 183
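
The added nmi_uaccess_okay() test closes a window where bpf_probe_write_user() could run while the CPU is mid task/mm switch (notably with a temporary mm loaded), a state in which access_ok() alone is not a sufficient guard. A hypothetical helper showing where the new check sits relative to the address validation; the in_interrupt()/uaccess_kernel() checks of the surrounding function are elided:

#include <linux/uaccess.h>
#include <asm/tlb.h>

static bool example_probe_write_allowed(void __user *unsafe_ptr, u32 size)
{
        if (unlikely(!nmi_uaccess_okay()))
                return false;           /* task or mm switch in progress */

        return access_ok(unsafe_ptr, size);
}
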
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index fa79323331b2..b920358dd8f7 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -33,6 +33,7 @@
33#include <linux/list.h> 33#include <linux/list.h>
34#include <linux/hash.h> 34#include <linux/hash.h>
35#include <linux/rcupdate.h> 35#include <linux/rcupdate.h>
36#include <linux/kprobes.h>
36 37
37#include <trace/events/sched.h> 38#include <trace/events/sched.h>
38 39
@@ -1992,7 +1993,7 @@ static void print_bug_type(void)
1992 * modifying the code. @failed should be one of either: 1993 * modifying the code. @failed should be one of either:
1993 * EFAULT - if the problem happens on reading the @ip address 1994 * EFAULT - if the problem happens on reading the @ip address
1994 * EINVAL - if what is read at @ip is not what was expected 1995 * EINVAL - if what is read at @ip is not what was expected
1995 * EPERM - if the problem happens on writting to the @ip address 1996 * EPERM - if the problem happens on writing to the @ip address
1996 */ 1997 */
1997void ftrace_bug(int failed, struct dyn_ftrace *rec) 1998void ftrace_bug(int failed, struct dyn_ftrace *rec)
1998{ 1999{
@@ -2391,7 +2392,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
2391 return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr); 2392 return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr);
2392 } 2393 }
2393 2394
2394 return -1; /* unknow ftrace bug */ 2395 return -1; /* unknown ftrace bug */
2395} 2396}
2396 2397
2397void __weak ftrace_replace_code(int mod_flags) 2398void __weak ftrace_replace_code(int mod_flags)
@@ -3004,7 +3005,7 @@ ftrace_allocate_pages(unsigned long num_to_init)
3004 int cnt; 3005 int cnt;
3005 3006
3006 if (!num_to_init) 3007 if (!num_to_init)
3007 return 0; 3008 return NULL;
3008 3009
3009 start_pg = pg = kzalloc(sizeof(*pg), GFP_KERNEL); 3010 start_pg = pg = kzalloc(sizeof(*pg), GFP_KERNEL);
3010 if (!pg) 3011 if (!pg)
@@ -4755,7 +4756,7 @@ static int
4755ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove, 4756ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove,
4756 int reset, int enable) 4757 int reset, int enable)
4757{ 4758{
4758 return ftrace_set_hash(ops, 0, 0, ip, remove, reset, enable); 4759 return ftrace_set_hash(ops, NULL, 0, ip, remove, reset, enable);
4759} 4760}
4760 4761
4761/** 4762/**
@@ -5463,7 +5464,7 @@ void ftrace_create_filter_files(struct ftrace_ops *ops,
5463 5464
5464/* 5465/*
5465 * The name "destroy_filter_files" is really a misnomer. Although 5466 * The name "destroy_filter_files" is really a misnomer. Although
5466 * in the future, it may actualy delete the files, but this is 5467 * in the future, it may actually delete the files, but this is
5467 * really intended to make sure the ops passed in are disabled 5468 * really intended to make sure the ops passed in are disabled
5468 * and that when this function returns, the caller is free to 5469 * and that when this function returns, the caller is free to
5469 * free the ops. 5470 * free the ops.
@@ -5786,7 +5787,7 @@ void ftrace_module_enable(struct module *mod)
5786 /* 5787 /*
5787 * If the tracing is enabled, go ahead and enable the record. 5788 * If the tracing is enabled, go ahead and enable the record.
5788 * 5789 *
5789 * The reason not to enable the record immediatelly is the 5790 * The reason not to enable the record immediately is the
5790 * inherent check of ftrace_make_nop/ftrace_make_call for 5791 * inherent check of ftrace_make_nop/ftrace_make_call for
5791 * correct previous instructions. Making first the NOP 5792 * correct previous instructions. Making first the NOP
5792 * conversion puts the module to the correct state, thus 5793 * conversion puts the module to the correct state, thus
@@ -6246,7 +6247,7 @@ void ftrace_reset_array_ops(struct trace_array *tr)
6246 tr->ops->func = ftrace_stub; 6247 tr->ops->func = ftrace_stub;
6247} 6248}
6248 6249
6249static inline void 6250static nokprobe_inline void
6250__ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, 6251__ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
6251 struct ftrace_ops *ignored, struct pt_regs *regs) 6252 struct ftrace_ops *ignored, struct pt_regs *regs)
6252{ 6253{
@@ -6306,11 +6307,13 @@ static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
6306{ 6307{
6307 __ftrace_ops_list_func(ip, parent_ip, NULL, regs); 6308 __ftrace_ops_list_func(ip, parent_ip, NULL, regs);
6308} 6309}
6310NOKPROBE_SYMBOL(ftrace_ops_list_func);
6309#else 6311#else
6310static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip) 6312static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip)
6311{ 6313{
6312 __ftrace_ops_list_func(ip, parent_ip, NULL, NULL); 6314 __ftrace_ops_list_func(ip, parent_ip, NULL, NULL);
6313} 6315}
6316NOKPROBE_SYMBOL(ftrace_ops_no_ops);
6314#endif 6317#endif
6315 6318
6316/* 6319/*
@@ -6337,6 +6340,7 @@ static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip,
6337 preempt_enable_notrace(); 6340 preempt_enable_notrace();
6338 trace_clear_recursion(bit); 6341 trace_clear_recursion(bit);
6339} 6342}
6343NOKPROBE_SYMBOL(ftrace_ops_assist_func);
6340 6344
6341/** 6345/**
6342 * ftrace_ops_get_func - get the function a trampoline should call 6346 * ftrace_ops_get_func - get the function a trampoline should call
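
The ftrace.c hunks above annotate the ops-list dispatch path with nokprobe_inline and NOKPROBE_SYMBOL() so that a kprobe can never land inside the code kprobes itself relies on. A minimal kernel-style sketch of the same annotation pattern (function names here are hypothetical; kernel-only code, not a standalone program):

#include <linux/kprobes.h>

/* Hypothetical helper that runs inside the tracing fast path. */
static nokprobe_inline void my_trace_dispatch(unsigned long ip,
                                              unsigned long parent_ip)
{
        /* work that must never itself be hit by a kprobe */
}

static void my_trace_entry(unsigned long ip, unsigned long parent_ip)
{
        my_trace_dispatch(ip, parent_ip);
}
/* Tell kprobes to refuse probes placed on this symbol. */
NOKPROBE_SYMBOL(my_trace_entry);
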
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 41b6f96e5366..4ee8d8aa3d0f 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -762,7 +762,7 @@ u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu)
762 762
763 preempt_disable_notrace(); 763 preempt_disable_notrace();
764 time = rb_time_stamp(buffer); 764 time = rb_time_stamp(buffer);
765 preempt_enable_no_resched_notrace(); 765 preempt_enable_notrace();
766 766
767 return time; 767 return time;
768} 768}
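
The single ring_buffer.c change pairs preempt_disable_notrace() with the matching preempt_enable_notrace() instead of the no-resched variant, so reading the time stamp no longer skips a rescheduling point on the way out. A rough kernel-style sketch of the intended pairing (illustrative only):

#include <linux/preempt.h>
#include <linux/types.h>

static u64 read_clock_sample(u64 (*clock)(void))
{
        u64 t;

        preempt_disable_notrace();      /* keep the read on one CPU ...   */
        t = clock();
        preempt_enable_notrace();       /* ... and allow preemption again */

        return t;
}
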
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 21153e64bf1c..ec439999f387 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -159,6 +159,8 @@ static union trace_eval_map_item *trace_eval_maps;
159#endif /* CONFIG_TRACE_EVAL_MAP_FILE */ 159#endif /* CONFIG_TRACE_EVAL_MAP_FILE */
160 160
161static int tracing_set_tracer(struct trace_array *tr, const char *buf); 161static int tracing_set_tracer(struct trace_array *tr, const char *buf);
162static void ftrace_trace_userstack(struct ring_buffer *buffer,
163 unsigned long flags, int pc);
162 164
163#define MAX_TRACER_SIZE 100 165#define MAX_TRACER_SIZE 100
164static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; 166static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
@@ -496,8 +498,10 @@ int trace_pid_write(struct trace_pid_list *filtered_pids,
496 * not modified. 498 * not modified.
497 */ 499 */
498 pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL); 500 pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL);
499 if (!pid_list) 501 if (!pid_list) {
502 trace_parser_put(&parser);
500 return -ENOMEM; 503 return -ENOMEM;
504 }
501 505
502 pid_list->pid_max = READ_ONCE(pid_max); 506 pid_list->pid_max = READ_ONCE(pid_max);
503 507
@@ -507,6 +511,7 @@ int trace_pid_write(struct trace_pid_list *filtered_pids,
507 511
508 pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3); 512 pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3);
509 if (!pid_list->pids) { 513 if (!pid_list->pids) {
514 trace_parser_put(&parser);
510 kfree(pid_list); 515 kfree(pid_list);
511 return -ENOMEM; 516 return -ENOMEM;
512 } 517 }
@@ -2749,12 +2754,21 @@ trace_function(struct trace_array *tr,
2749 2754
2750#ifdef CONFIG_STACKTRACE 2755#ifdef CONFIG_STACKTRACE
2751 2756
2752#define FTRACE_STACK_MAX_ENTRIES (PAGE_SIZE / sizeof(unsigned long)) 2757/* Allow 4 levels of nesting: normal, softirq, irq, NMI */
2758#define FTRACE_KSTACK_NESTING 4
2759
2760#define FTRACE_KSTACK_ENTRIES (PAGE_SIZE / FTRACE_KSTACK_NESTING)
2761
2753struct ftrace_stack { 2762struct ftrace_stack {
2754 unsigned long calls[FTRACE_STACK_MAX_ENTRIES]; 2763 unsigned long calls[FTRACE_KSTACK_ENTRIES];
2755}; 2764};
2756 2765
2757static DEFINE_PER_CPU(struct ftrace_stack, ftrace_stack); 2766
2767struct ftrace_stacks {
2768 struct ftrace_stack stacks[FTRACE_KSTACK_NESTING];
2769};
2770
2771static DEFINE_PER_CPU(struct ftrace_stacks, ftrace_stacks);
2758static DEFINE_PER_CPU(int, ftrace_stack_reserve); 2772static DEFINE_PER_CPU(int, ftrace_stack_reserve);
2759 2773
2760static void __ftrace_trace_stack(struct ring_buffer *buffer, 2774static void __ftrace_trace_stack(struct ring_buffer *buffer,
@@ -2763,13 +2777,10 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
2763{ 2777{
2764 struct trace_event_call *call = &event_kernel_stack; 2778 struct trace_event_call *call = &event_kernel_stack;
2765 struct ring_buffer_event *event; 2779 struct ring_buffer_event *event;
2780 unsigned int size, nr_entries;
2781 struct ftrace_stack *fstack;
2766 struct stack_entry *entry; 2782 struct stack_entry *entry;
2767 struct stack_trace trace; 2783 int stackidx;
2768 int use_stack;
2769 int size = FTRACE_STACK_ENTRIES;
2770
2771 trace.nr_entries = 0;
2772 trace.skip = skip;
2773 2784
2774 /* 2785 /*
2775 * Add one, for this function and the call to save_stack_trace() 2786 * Add one, for this function and the call to save_stack_trace()
@@ -2777,7 +2788,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
2777 */ 2788 */
2778#ifndef CONFIG_UNWINDER_ORC 2789#ifndef CONFIG_UNWINDER_ORC
2779 if (!regs) 2790 if (!regs)
2780 trace.skip++; 2791 skip++;
2781#endif 2792#endif
2782 2793
2783 /* 2794 /*
@@ -2788,53 +2799,40 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
2788 */ 2799 */
2789 preempt_disable_notrace(); 2800 preempt_disable_notrace();
2790 2801
2791 use_stack = __this_cpu_inc_return(ftrace_stack_reserve); 2802 stackidx = __this_cpu_inc_return(ftrace_stack_reserve) - 1;
2803
2804 /* This should never happen. If it does, yell once and skip */
2805 if (WARN_ON_ONCE(stackidx > FTRACE_KSTACK_NESTING))
2806 goto out;
2807
2792 /* 2808 /*
2793 * We don't need any atomic variables, just a barrier. 2809 * The above __this_cpu_inc_return() is 'atomic' cpu local. An
2794 * If an interrupt comes in, we don't care, because it would 2810 * interrupt will either see the value pre increment or post
2795 * have exited and put the counter back to what we want. 2811 * increment. If the interrupt happens pre increment it will have
2796 * We just need a barrier to keep gcc from moving things 2812 * restored the counter when it returns. We just need a barrier to
2797 * around. 2813 * keep gcc from moving things around.
2798 */ 2814 */
2799 barrier(); 2815 barrier();
2800 if (use_stack == 1) {
2801 trace.entries = this_cpu_ptr(ftrace_stack.calls);
2802 trace.max_entries = FTRACE_STACK_MAX_ENTRIES;
2803 2816
2804 if (regs) 2817 fstack = this_cpu_ptr(ftrace_stacks.stacks) + stackidx;
2805 save_stack_trace_regs(regs, &trace); 2818 size = ARRAY_SIZE(fstack->calls);
2806 else
2807 save_stack_trace(&trace);
2808
2809 if (trace.nr_entries > size)
2810 size = trace.nr_entries;
2811 } else
2812 /* From now on, use_stack is a boolean */
2813 use_stack = 0;
2814 2819
2815 size *= sizeof(unsigned long); 2820 if (regs) {
2821 nr_entries = stack_trace_save_regs(regs, fstack->calls,
2822 size, skip);
2823 } else {
2824 nr_entries = stack_trace_save(fstack->calls, size, skip);
2825 }
2816 2826
2827 size = nr_entries * sizeof(unsigned long);
2817 event = __trace_buffer_lock_reserve(buffer, TRACE_STACK, 2828 event = __trace_buffer_lock_reserve(buffer, TRACE_STACK,
2818 sizeof(*entry) + size, flags, pc); 2829 sizeof(*entry) + size, flags, pc);
2819 if (!event) 2830 if (!event)
2820 goto out; 2831 goto out;
2821 entry = ring_buffer_event_data(event); 2832 entry = ring_buffer_event_data(event);
2822 2833
2823 memset(&entry->caller, 0, size); 2834 memcpy(&entry->caller, fstack->calls, size);
2824 2835 entry->size = nr_entries;
2825 if (use_stack)
2826 memcpy(&entry->caller, trace.entries,
2827 trace.nr_entries * sizeof(unsigned long));
2828 else {
2829 trace.max_entries = FTRACE_STACK_ENTRIES;
2830 trace.entries = entry->caller;
2831 if (regs)
2832 save_stack_trace_regs(regs, &trace);
2833 else
2834 save_stack_trace(&trace);
2835 }
2836
2837 entry->size = trace.nr_entries;
2838 2836
2839 if (!call_filter_check_discard(call, entry, buffer, event)) 2837 if (!call_filter_check_discard(call, entry, buffer, event))
2840 __buffer_unlock_commit(buffer, event); 2838 __buffer_unlock_commit(buffer, event);
@@ -2904,15 +2902,15 @@ void trace_dump_stack(int skip)
2904} 2902}
2905EXPORT_SYMBOL_GPL(trace_dump_stack); 2903EXPORT_SYMBOL_GPL(trace_dump_stack);
2906 2904
2905#ifdef CONFIG_USER_STACKTRACE_SUPPORT
2907static DEFINE_PER_CPU(int, user_stack_count); 2906static DEFINE_PER_CPU(int, user_stack_count);
2908 2907
2909void 2908static void
2910ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) 2909ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
2911{ 2910{
2912 struct trace_event_call *call = &event_user_stack; 2911 struct trace_event_call *call = &event_user_stack;
2913 struct ring_buffer_event *event; 2912 struct ring_buffer_event *event;
2914 struct userstack_entry *entry; 2913 struct userstack_entry *entry;
2915 struct stack_trace trace;
2916 2914
2917 if (!(global_trace.trace_flags & TRACE_ITER_USERSTACKTRACE)) 2915 if (!(global_trace.trace_flags & TRACE_ITER_USERSTACKTRACE))
2918 return; 2916 return;
@@ -2943,12 +2941,7 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
2943 entry->tgid = current->tgid; 2941 entry->tgid = current->tgid;
2944 memset(&entry->caller, 0, sizeof(entry->caller)); 2942 memset(&entry->caller, 0, sizeof(entry->caller));
2945 2943
2946 trace.nr_entries = 0; 2944 stack_trace_save_user(entry->caller, FTRACE_STACK_ENTRIES);
2947 trace.max_entries = FTRACE_STACK_ENTRIES;
2948 trace.skip = 0;
2949 trace.entries = entry->caller;
2950
2951 save_stack_trace_user(&trace);
2952 if (!call_filter_check_discard(call, entry, buffer, event)) 2945 if (!call_filter_check_discard(call, entry, buffer, event))
2953 __buffer_unlock_commit(buffer, event); 2946 __buffer_unlock_commit(buffer, event);
2954 2947
@@ -2957,13 +2950,12 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
2957 out: 2950 out:
2958 preempt_enable(); 2951 preempt_enable();
2959} 2952}
2960 2953#else /* CONFIG_USER_STACKTRACE_SUPPORT */
2961#ifdef UNUSED 2954static void ftrace_trace_userstack(struct ring_buffer *buffer,
2962static void __trace_userstack(struct trace_array *tr, unsigned long flags) 2955 unsigned long flags, int pc)
2963{ 2956{
2964 ftrace_trace_userstack(tr, flags, preempt_count());
2965} 2957}
2966#endif /* UNUSED */ 2958#endif /* !CONFIG_USER_STACKTRACE_SUPPORT */
2967 2959
2968#endif /* CONFIG_STACKTRACE */ 2960#endif /* CONFIG_STACKTRACE */
2969 2961
@@ -7025,35 +7017,43 @@ struct buffer_ref {
7025 struct ring_buffer *buffer; 7017 struct ring_buffer *buffer;
7026 void *page; 7018 void *page;
7027 int cpu; 7019 int cpu;
7028 int ref; 7020 refcount_t refcount;
7029}; 7021};
7030 7022
7023static void buffer_ref_release(struct buffer_ref *ref)
7024{
7025 if (!refcount_dec_and_test(&ref->refcount))
7026 return;
7027 ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page);
7028 kfree(ref);
7029}
7030
7031static void buffer_pipe_buf_release(struct pipe_inode_info *pipe, 7031static void buffer_pipe_buf_release(struct pipe_inode_info *pipe,
7032 struct pipe_buffer *buf) 7032 struct pipe_buffer *buf)
7033{ 7033{
7034 struct buffer_ref *ref = (struct buffer_ref *)buf->private; 7034 struct buffer_ref *ref = (struct buffer_ref *)buf->private;
7035 7035
7036 if (--ref->ref) 7036 buffer_ref_release(ref);
7037 return;
7038
7039 ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page);
7040 kfree(ref);
7041 buf->private = 0; 7037 buf->private = 0;
7042} 7038}
7043 7039
7044static void buffer_pipe_buf_get(struct pipe_inode_info *pipe, 7040static bool buffer_pipe_buf_get(struct pipe_inode_info *pipe,
7045 struct pipe_buffer *buf) 7041 struct pipe_buffer *buf)
7046{ 7042{
7047 struct buffer_ref *ref = (struct buffer_ref *)buf->private; 7043 struct buffer_ref *ref = (struct buffer_ref *)buf->private;
7048 7044
7049 ref->ref++; 7045 if (refcount_read(&ref->refcount) > INT_MAX/2)
7046 return false;
7047
7048 refcount_inc(&ref->refcount);
7049 return true;
7050} 7050}
7051 7051
7052/* Pipe buffer operations for a buffer. */ 7052/* Pipe buffer operations for a buffer. */
7053static const struct pipe_buf_operations buffer_pipe_buf_ops = { 7053static const struct pipe_buf_operations buffer_pipe_buf_ops = {
7054 .confirm = generic_pipe_buf_confirm, 7054 .confirm = generic_pipe_buf_confirm,
7055 .release = buffer_pipe_buf_release, 7055 .release = buffer_pipe_buf_release,
7056 .steal = generic_pipe_buf_steal, 7056 .steal = generic_pipe_buf_nosteal,
7057 .get = buffer_pipe_buf_get, 7057 .get = buffer_pipe_buf_get,
7058}; 7058};
7059 7059
@@ -7066,11 +7066,7 @@ static void buffer_spd_release(struct splice_pipe_desc *spd, unsigned int i)
7066 struct buffer_ref *ref = 7066 struct buffer_ref *ref =
7067 (struct buffer_ref *)spd->partial[i].private; 7067 (struct buffer_ref *)spd->partial[i].private;
7068 7068
7069 if (--ref->ref) 7069 buffer_ref_release(ref);
7070 return;
7071
7072 ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page);
7073 kfree(ref);
7074 spd->partial[i].private = 0; 7070 spd->partial[i].private = 0;
7075} 7071}
7076 7072
@@ -7125,7 +7121,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
7125 break; 7121 break;
7126 } 7122 }
7127 7123
7128 ref->ref = 1; 7124 refcount_set(&ref->refcount, 1);
7129 ref->buffer = iter->trace_buffer->buffer; 7125 ref->buffer = iter->trace_buffer->buffer;
7130 ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file); 7126 ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file);
7131 if (IS_ERR(ref->page)) { 7127 if (IS_ERR(ref->page)) {
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index d80cee49e0eb..639047b259d7 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -782,17 +782,9 @@ void update_max_tr_single(struct trace_array *tr,
782#endif /* CONFIG_TRACER_MAX_TRACE */ 782#endif /* CONFIG_TRACER_MAX_TRACE */
783 783
784#ifdef CONFIG_STACKTRACE 784#ifdef CONFIG_STACKTRACE
785void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags,
786 int pc);
787
788void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, 785void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
789 int pc); 786 int pc);
790#else 787#else
791static inline void ftrace_trace_userstack(struct ring_buffer *buffer,
792 unsigned long flags, int pc)
793{
794}
795
796static inline void __trace_stack(struct trace_array *tr, unsigned long flags, 788static inline void __trace_stack(struct trace_array *tr, unsigned long flags,
797 int skip, int pc) 789 int skip, int pc)
798{ 790{
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 4ad967453b6f..3ea65cdff30d 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -205,6 +205,8 @@ void trace_likely_condition(struct ftrace_likely_data *f, int val, int expect)
205void ftrace_likely_update(struct ftrace_likely_data *f, int val, 205void ftrace_likely_update(struct ftrace_likely_data *f, int val,
206 int expect, int is_constant) 206 int expect, int is_constant)
207{ 207{
208 unsigned long flags = user_access_save();
209
208 /* A constant is always correct */ 210 /* A constant is always correct */
209 if (is_constant) { 211 if (is_constant) {
210 f->constant++; 212 f->constant++;
@@ -223,6 +225,8 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val,
223 f->data.correct++; 225 f->data.correct++;
224 else 226 else
225 f->data.incorrect++; 227 f->data.incorrect++;
228
229 user_access_restore(flags);
226} 230}
227EXPORT_SYMBOL(ftrace_likely_update); 231EXPORT_SYMBOL(ftrace_likely_update);
228 232
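
The trace_branch.c change brackets the branch-profiling bookkeeping with user_access_save()/user_access_restore(), so ftrace_likely_update() can run from inside a user_access_begin() region without disturbing the architecture's user-access state. A kernel-style sketch of that save/restore bracket (hypothetical helper, illustrative only):

#include <linux/uaccess.h>

static void count_branch_outcome(unsigned long *hits)
{
        unsigned long flags = user_access_save();       /* stash AC/PAN state */

        (*hits)++;                                      /* instrumented work  */

        user_access_restore(flags);                     /* put it back as-is  */
}
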
diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c
index dd1f43588d70..fa100ed3b4de 100644
--- a/kernel/trace/trace_dynevent.c
+++ b/kernel/trace/trace_dynevent.c
@@ -74,7 +74,7 @@ int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type)
74static int create_dyn_event(int argc, char **argv) 74static int create_dyn_event(int argc, char **argv)
75{ 75{
76 struct dyn_event_operations *ops; 76 struct dyn_event_operations *ops;
77 int ret; 77 int ret = -ENODEV;
78 78
79 if (argv[0][0] == '-' || argv[0][0] == '!') 79 if (argv[0][0] == '-' || argv[0][0] == '!')
80 return dyn_event_release(argc, argv, NULL); 80 return dyn_event_release(argc, argv, NULL);
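
The trace_dynevent.c fix pre-initializes ret to -ENODEV so create_dyn_event() returns a defined error when no registered operation recognizes the command, instead of whatever happened to be on the stack. The same defensive default in a small self-contained C program (the op table and handlers are invented for illustration):

#include <errno.h>
#include <stdio.h>
#include <string.h>

struct op {
        const char *prefix;
        int (*create)(const char *arg);
};

static int create_probe(const char *arg)
{
        printf("probe: %s\n", arg);
        return 0;
}

static const struct op ops[] = {
        { "p:", create_probe },
};

static int create_event(const char *cmd)
{
        int ret = -ENODEV;      /* default: nobody recognized the command */
        size_t i;

        for (i = 0; i < sizeof(ops) / sizeof(ops[0]); i++) {
                if (strncmp(cmd, ops[i].prefix, strlen(ops[i].prefix)) == 0) {
                        ret = ops[i].create(cmd + strlen(ops[i].prefix));
                        break;
                }
        }
        return ret;
}

int main(void)
{
        printf("%d\n", create_event("p:my_event"));     /* 0       */
        printf("%d\n", create_event("x:unknown"));      /* -ENODEV */
        return 0;
}
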
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index ca46339f3009..a1d20421f4b0 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -3713,7 +3713,6 @@ static void track_data_destroy(struct hist_trigger_data *hist_data,
3713 struct trace_event_file *file = hist_data->event_file; 3713 struct trace_event_file *file = hist_data->event_file;
3714 3714
3715 destroy_hist_field(data->track_data.track_var, 0); 3715 destroy_hist_field(data->track_data.track_var, 0);
3716 destroy_hist_field(data->track_data.var_ref, 0);
3717 3716
3718 if (data->action == ACTION_SNAPSHOT) { 3717 if (data->action == ACTION_SNAPSHOT) {
3719 struct track_data *track_data; 3718 struct track_data *track_data;
@@ -5187,7 +5186,6 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec,
5187 u64 var_ref_vals[TRACING_MAP_VARS_MAX]; 5186 u64 var_ref_vals[TRACING_MAP_VARS_MAX];
5188 char compound_key[HIST_KEY_SIZE_MAX]; 5187 char compound_key[HIST_KEY_SIZE_MAX];
5189 struct tracing_map_elt *elt = NULL; 5188 struct tracing_map_elt *elt = NULL;
5190 struct stack_trace stacktrace;
5191 struct hist_field *key_field; 5189 struct hist_field *key_field;
5192 u64 field_contents; 5190 u64 field_contents;
5193 void *key = NULL; 5191 void *key = NULL;
@@ -5199,14 +5197,9 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec,
5199 key_field = hist_data->fields[i]; 5197 key_field = hist_data->fields[i];
5200 5198
5201 if (key_field->flags & HIST_FIELD_FL_STACKTRACE) { 5199 if (key_field->flags & HIST_FIELD_FL_STACKTRACE) {
5202 stacktrace.max_entries = HIST_STACKTRACE_DEPTH; 5200 memset(entries, 0, HIST_STACKTRACE_SIZE);
5203 stacktrace.entries = entries; 5201 stack_trace_save(entries, HIST_STACKTRACE_DEPTH,
5204 stacktrace.nr_entries = 0; 5202 HIST_STACKTRACE_SKIP);
5205 stacktrace.skip = HIST_STACKTRACE_SKIP;
5206
5207 memset(stacktrace.entries, 0, HIST_STACKTRACE_SIZE);
5208 save_stack_trace(&stacktrace);
5209
5210 key = entries; 5203 key = entries;
5211 } else { 5204 } else {
5212 field_contents = key_field->fn(key_field, elt, rbe, rec); 5205 field_contents = key_field->fn(key_field, elt, rbe, rec);
@@ -5247,7 +5240,7 @@ static void hist_trigger_stacktrace_print(struct seq_file *m,
5247 unsigned int i; 5240 unsigned int i;
5248 5241
5249 for (i = 0; i < max_entries; i++) { 5242 for (i = 0; i < max_entries; i++) {
5250 if (stacktrace_entries[i] == ULONG_MAX) 5243 if (!stacktrace_entries[i])
5251 return; 5244 return;
5252 5245
5253 seq_printf(m, "%*c", 1 + spaces, ' '); 5246 seq_printf(m, "%*c", 1 + spaces, ' ');
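
Both trace_events_hist.c hunks switch from the old struct stack_trace interface to stack_trace_save(), which takes the destination array, a maximum depth and a skip count and returns how many entries it stored; the hist code zeroes the buffer up front, so unused slots read back as zero rather than as a ULONG_MAX terminator. A kernel-style sketch of the new call shape (depth and skip values are made up):

#include <linux/printk.h>
#include <linux/stacktrace.h>

#define MY_STACK_DEPTH  16

static void record_call_site(void)
{
        unsigned long entries[MY_STACK_DEPTH] = { 0 };
        unsigned int nr;

        /* Capture up to MY_STACK_DEPTH return addresses, skipping this frame. */
        nr = stack_trace_save(entries, MY_STACK_DEPTH, 1);

        /* Only the first 'nr' slots are valid; the rest stay zero. */
        pr_info("captured %u stack entries\n", nr);
}
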
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index eec648a0d673..5d16f73898db 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -18,44 +18,32 @@
18 18
19#include "trace.h" 19#include "trace.h"
20 20
21static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] = 21#define STACK_TRACE_ENTRIES 500
22 { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX };
23unsigned stack_trace_index[STACK_TRACE_ENTRIES];
24 22
25/* 23static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES];
26 * Reserve one entry for the passed in ip. This will allow 24static unsigned stack_trace_index[STACK_TRACE_ENTRIES];
27 * us to remove most or all of the stack size overhead
28 * added by the stack tracer itself.
29 */
30struct stack_trace stack_trace_max = {
31 .max_entries = STACK_TRACE_ENTRIES - 1,
32 .entries = &stack_dump_trace[0],
33};
34 25
35unsigned long stack_trace_max_size; 26static unsigned int stack_trace_nr_entries;
36arch_spinlock_t stack_trace_max_lock = 27static unsigned long stack_trace_max_size;
28static arch_spinlock_t stack_trace_max_lock =
37 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; 29 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
38 30
39DEFINE_PER_CPU(int, disable_stack_tracer); 31DEFINE_PER_CPU(int, disable_stack_tracer);
40static DEFINE_MUTEX(stack_sysctl_mutex); 32static DEFINE_MUTEX(stack_sysctl_mutex);
41 33
42int stack_tracer_enabled; 34int stack_tracer_enabled;
43static int last_stack_tracer_enabled;
44 35
45void stack_trace_print(void) 36static void print_max_stack(void)
46{ 37{
47 long i; 38 long i;
48 int size; 39 int size;
49 40
50 pr_emerg(" Depth Size Location (%d entries)\n" 41 pr_emerg(" Depth Size Location (%d entries)\n"
51 " ----- ---- --------\n", 42 " ----- ---- --------\n",
52 stack_trace_max.nr_entries); 43 stack_trace_nr_entries);
53 44
54 for (i = 0; i < stack_trace_max.nr_entries; i++) { 45 for (i = 0; i < stack_trace_nr_entries; i++) {
55 if (stack_dump_trace[i] == ULONG_MAX) 46 if (i + 1 == stack_trace_nr_entries)
56 break;
57 if (i+1 == stack_trace_max.nr_entries ||
58 stack_dump_trace[i+1] == ULONG_MAX)
59 size = stack_trace_index[i]; 47 size = stack_trace_index[i];
60 else 48 else
61 size = stack_trace_index[i] - stack_trace_index[i+1]; 49 size = stack_trace_index[i] - stack_trace_index[i+1];
@@ -65,16 +53,7 @@ void stack_trace_print(void)
65 } 53 }
66} 54}
67 55
68/* 56static void check_stack(unsigned long ip, unsigned long *stack)
69 * When arch-specific code overrides this function, the following
70 * data should be filled up, assuming stack_trace_max_lock is held to
71 * prevent concurrent updates.
72 * stack_trace_index[]
73 * stack_trace_max
74 * stack_trace_max_size
75 */
76void __weak
77check_stack(unsigned long ip, unsigned long *stack)
78{ 57{
79 unsigned long this_size, flags; unsigned long *p, *top, *start; 58 unsigned long this_size, flags; unsigned long *p, *top, *start;
80 static int tracer_frame; 59 static int tracer_frame;
@@ -110,13 +89,12 @@ check_stack(unsigned long ip, unsigned long *stack)
110 89
111 stack_trace_max_size = this_size; 90 stack_trace_max_size = this_size;
112 91
113 stack_trace_max.nr_entries = 0; 92 stack_trace_nr_entries = stack_trace_save(stack_dump_trace,
114 stack_trace_max.skip = 0; 93 ARRAY_SIZE(stack_dump_trace) - 1,
115 94 0);
116 save_stack_trace(&stack_trace_max);
117 95
118 /* Skip over the overhead of the stack tracer itself */ 96 /* Skip over the overhead of the stack tracer itself */
119 for (i = 0; i < stack_trace_max.nr_entries; i++) { 97 for (i = 0; i < stack_trace_nr_entries; i++) {
120 if (stack_dump_trace[i] == ip) 98 if (stack_dump_trace[i] == ip)
121 break; 99 break;
122 } 100 }
@@ -125,7 +103,7 @@ check_stack(unsigned long ip, unsigned long *stack)
125 * Some archs may not have the passed in ip in the dump. 103 * Some archs may not have the passed in ip in the dump.
126 * If that happens, we need to show everything. 104 * If that happens, we need to show everything.
127 */ 105 */
128 if (i == stack_trace_max.nr_entries) 106 if (i == stack_trace_nr_entries)
129 i = 0; 107 i = 0;
130 108
131 /* 109 /*
@@ -143,15 +121,13 @@ check_stack(unsigned long ip, unsigned long *stack)
143 * loop will only happen once. This code only takes place 121 * loop will only happen once. This code only takes place
144 * on a new max, so it is far from a fast path. 122 * on a new max, so it is far from a fast path.
145 */ 123 */
146 while (i < stack_trace_max.nr_entries) { 124 while (i < stack_trace_nr_entries) {
147 int found = 0; 125 int found = 0;
148 126
149 stack_trace_index[x] = this_size; 127 stack_trace_index[x] = this_size;
150 p = start; 128 p = start;
151 129
152 for (; p < top && i < stack_trace_max.nr_entries; p++) { 130 for (; p < top && i < stack_trace_nr_entries; p++) {
153 if (stack_dump_trace[i] == ULONG_MAX)
154 break;
155 /* 131 /*
156 * The READ_ONCE_NOCHECK is used to let KASAN know that 132 * The READ_ONCE_NOCHECK is used to let KASAN know that
157 * this is not a stack-out-of-bounds error. 133 * this is not a stack-out-of-bounds error.
@@ -182,12 +158,10 @@ check_stack(unsigned long ip, unsigned long *stack)
182 i++; 158 i++;
183 } 159 }
184 160
185 stack_trace_max.nr_entries = x; 161 stack_trace_nr_entries = x;
186 for (; x < i; x++)
187 stack_dump_trace[x] = ULONG_MAX;
188 162
189 if (task_stack_end_corrupted(current)) { 163 if (task_stack_end_corrupted(current)) {
190 stack_trace_print(); 164 print_max_stack();
191 BUG(); 165 BUG();
192 } 166 }
193 167
@@ -286,7 +260,7 @@ __next(struct seq_file *m, loff_t *pos)
286{ 260{
287 long n = *pos - 1; 261 long n = *pos - 1;
288 262
289 if (n >= stack_trace_max.nr_entries || stack_dump_trace[n] == ULONG_MAX) 263 if (n >= stack_trace_nr_entries)
290 return NULL; 264 return NULL;
291 265
292 m->private = (void *)n; 266 m->private = (void *)n;
@@ -350,7 +324,7 @@ static int t_show(struct seq_file *m, void *v)
350 seq_printf(m, " Depth Size Location" 324 seq_printf(m, " Depth Size Location"
351 " (%d entries)\n" 325 " (%d entries)\n"
352 " ----- ---- --------\n", 326 " ----- ---- --------\n",
353 stack_trace_max.nr_entries); 327 stack_trace_nr_entries);
354 328
355 if (!stack_tracer_enabled && !stack_trace_max_size) 329 if (!stack_tracer_enabled && !stack_trace_max_size)
356 print_disabled(m); 330 print_disabled(m);
@@ -360,12 +334,10 @@ static int t_show(struct seq_file *m, void *v)
360 334
361 i = *(long *)v; 335 i = *(long *)v;
362 336
363 if (i >= stack_trace_max.nr_entries || 337 if (i >= stack_trace_nr_entries)
364 stack_dump_trace[i] == ULONG_MAX)
365 return 0; 338 return 0;
366 339
367 if (i+1 == stack_trace_max.nr_entries || 340 if (i + 1 == stack_trace_nr_entries)
368 stack_dump_trace[i+1] == ULONG_MAX)
369 size = stack_trace_index[i]; 341 size = stack_trace_index[i];
370 else 342 else
371 size = stack_trace_index[i] - stack_trace_index[i+1]; 343 size = stack_trace_index[i] - stack_trace_index[i+1];
@@ -422,23 +394,21 @@ stack_trace_sysctl(struct ctl_table *table, int write,
422 void __user *buffer, size_t *lenp, 394 void __user *buffer, size_t *lenp,
423 loff_t *ppos) 395 loff_t *ppos)
424{ 396{
397 int was_enabled;
425 int ret; 398 int ret;
426 399
427 mutex_lock(&stack_sysctl_mutex); 400 mutex_lock(&stack_sysctl_mutex);
401 was_enabled = !!stack_tracer_enabled;
428 402
429 ret = proc_dointvec(table, write, buffer, lenp, ppos); 403 ret = proc_dointvec(table, write, buffer, lenp, ppos);
430 404
431 if (ret || !write || 405 if (ret || !write || (was_enabled == !!stack_tracer_enabled))
432 (last_stack_tracer_enabled == !!stack_tracer_enabled))
433 goto out; 406 goto out;
434 407
435 last_stack_tracer_enabled = !!stack_tracer_enabled;
436
437 if (stack_tracer_enabled) 408 if (stack_tracer_enabled)
438 register_ftrace_function(&trace_ops); 409 register_ftrace_function(&trace_ops);
439 else 410 else
440 unregister_ftrace_function(&trace_ops); 411 unregister_ftrace_function(&trace_ops);
441
442 out: 412 out:
443 mutex_unlock(&stack_sysctl_mutex); 413 mutex_unlock(&stack_sysctl_mutex);
444 return ret; 414 return ret;
@@ -454,7 +424,6 @@ static __init int enable_stacktrace(char *str)
454 strncpy(stack_trace_filter_buf, str + len, COMMAND_LINE_SIZE); 424 strncpy(stack_trace_filter_buf, str + len, COMMAND_LINE_SIZE);
455 425
456 stack_tracer_enabled = 1; 426 stack_tracer_enabled = 1;
457 last_stack_tracer_enabled = 1;
458 return 1; 427 return 1;
459} 428}
460__setup("stacktrace", enable_stacktrace); 429__setup("stacktrace", enable_stacktrace);
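
The rewritten stack_trace_sysctl() snapshots the previous enable state and only registers or unregisters the ftrace_ops when the value actually flips, which lets the separate last_stack_tracer_enabled shadow variable go away. A small self-contained C analog of the act-only-on-transition pattern (the register/unregister hooks are stand-ins):

#include <stdbool.h>
#include <stdio.h>

static int tracer_enabled;

static void register_tracer(void)   { puts("register"); }
static void unregister_tracer(void) { puts("unregister"); }

/* Apply a new setting, acting only when the boolean state really changes. */
static void set_tracer_enabled(int val)
{
        bool was_enabled = !!tracer_enabled;

        tracer_enabled = val;
        if (was_enabled == !!tracer_enabled)
                return;                 /* no transition: nothing to do */

        if (tracer_enabled)
                register_tracer();
        else
                unregister_tracer();
}

int main(void)
{
        set_tracer_enabled(1);  /* register   */
        set_tracer_enabled(5);  /* no change  */
        set_tracer_enabled(0);  /* unregister */
        return 0;
}
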
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index f93a56d2db27..fa8fbff736d6 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -314,6 +314,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
314 struct ring_buffer_event *event; 314 struct ring_buffer_event *event;
315 struct ring_buffer *buffer; 315 struct ring_buffer *buffer;
316 unsigned long irq_flags; 316 unsigned long irq_flags;
317 unsigned long args[6];
317 int pc; 318 int pc;
318 int syscall_nr; 319 int syscall_nr;
319 int size; 320 int size;
@@ -347,7 +348,8 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
347 348
348 entry = ring_buffer_event_data(event); 349 entry = ring_buffer_event_data(event);
349 entry->nr = syscall_nr; 350 entry->nr = syscall_nr;
350 syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args); 351 syscall_get_arguments(current, regs, args);
352 memcpy(entry->args, args, sizeof(unsigned long) * sys_data->nb_args);
351 353
352 event_trigger_unlock_commit(trace_file, buffer, event, entry, 354 event_trigger_unlock_commit(trace_file, buffer, event, entry,
353 irq_flags, pc); 355 irq_flags, pc);
@@ -583,6 +585,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
583 struct syscall_metadata *sys_data; 585 struct syscall_metadata *sys_data;
584 struct syscall_trace_enter *rec; 586 struct syscall_trace_enter *rec;
585 struct hlist_head *head; 587 struct hlist_head *head;
588 unsigned long args[6];
586 bool valid_prog_array; 589 bool valid_prog_array;
587 int syscall_nr; 590 int syscall_nr;
588 int rctx; 591 int rctx;
@@ -613,8 +616,8 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
613 return; 616 return;
614 617
615 rec->nr = syscall_nr; 618 rec->nr = syscall_nr;
616 syscall_get_arguments(current, regs, 0, sys_data->nb_args, 619 syscall_get_arguments(current, regs, args);
617 (unsigned long *)&rec->args); 620 memcpy(&rec->args, args, sizeof(unsigned long) * sys_data->nb_args);
618 621
619 if ((valid_prog_array && 622 if ((valid_prog_array &&
620 !perf_call_bpf_enter(sys_data->enter_event, regs, sys_data, rec)) || 623 !perf_call_bpf_enter(sys_data->enter_event, regs, sys_data, rec)) ||
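
With the new calling convention, syscall_get_arguments() always fills a six-entry scratch array and the tracer then copies only the nb_args that the syscall actually declares into its record. A tiny self-contained C illustration of that fetch-all-then-copy-subset step (values and sizes are placeholders):

#include <stdio.h>
#include <string.h>

#define MAX_SYSCALL_ARGS 6

/* Stand-in for syscall_get_arguments(): always fills all six slots. */
static void get_all_args(unsigned long args[MAX_SYSCALL_ARGS])
{
        unsigned long i;

        for (i = 0; i < MAX_SYSCALL_ARGS; i++)
                args[i] = 100 + i;
}

struct enter_record {
        int nr;
        unsigned long args[MAX_SYSCALL_ARGS];
};

int main(void)
{
        unsigned long args[MAX_SYSCALL_ARGS];
        struct enter_record rec = { .nr = 42 };
        unsigned int nb_args = 3;       /* args this syscall declares */

        get_all_args(args);
        /* Copy only the declared arguments into the trace record. */
        memcpy(rec.args, args, sizeof(unsigned long) * nb_args);

        printf("nr=%d arg0=%lu arg2=%lu\n", rec.nr, rec.args[0], rec.args[2]);
        return 0;
}
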
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 8fbfda94a67b..7f9e7b9306fe 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -42,9 +42,9 @@ int __read_mostly watchdog_user_enabled = 1;
42int __read_mostly nmi_watchdog_user_enabled = NMI_WATCHDOG_DEFAULT; 42int __read_mostly nmi_watchdog_user_enabled = NMI_WATCHDOG_DEFAULT;
43int __read_mostly soft_watchdog_user_enabled = 1; 43int __read_mostly soft_watchdog_user_enabled = 1;
44int __read_mostly watchdog_thresh = 10; 44int __read_mostly watchdog_thresh = 10;
45int __read_mostly nmi_watchdog_available; 45static int __read_mostly nmi_watchdog_available;
46 46
47struct cpumask watchdog_allowed_mask __read_mostly; 47static struct cpumask watchdog_allowed_mask __read_mostly;
48 48
49struct cpumask watchdog_cpumask __read_mostly; 49struct cpumask watchdog_cpumask __read_mostly;
50unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); 50unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
@@ -554,13 +554,15 @@ static void softlockup_start_all(void)
554 554
555int lockup_detector_online_cpu(unsigned int cpu) 555int lockup_detector_online_cpu(unsigned int cpu)
556{ 556{
557 watchdog_enable(cpu); 557 if (cpumask_test_cpu(cpu, &watchdog_allowed_mask))
558 watchdog_enable(cpu);
558 return 0; 559 return 0;
559} 560}
560 561
561int lockup_detector_offline_cpu(unsigned int cpu) 562int lockup_detector_offline_cpu(unsigned int cpu)
562{ 563{
563 watchdog_disable(cpu); 564 if (cpumask_test_cpu(cpu, &watchdog_allowed_mask))
565 watchdog_disable(cpu);
564 return 0; 566 return 0;
565} 567}
566 568
@@ -588,7 +590,7 @@ static void lockup_detector_reconfigure(void)
588 * Create the watchdog thread infrastructure and configure the detector(s). 590 * Create the watchdog thread infrastructure and configure the detector(s).
589 * 591 *
590 * The threads are not unparked as watchdog_allowed_mask is empty. When 592 * The threads are not unparked as watchdog_allowed_mask is empty. When
591 * the threads are sucessfully initialized, take the proper locks and 593 * the threads are successfully initialized, take the proper locks and
592 * unpark the threads in the watchdog_cpumask if the watchdog is enabled. 594 * unpark the threads in the watchdog_cpumask if the watchdog is enabled.
593 */ 595 */
594static __init void lockup_detector_setup(void) 596static __init void lockup_detector_setup(void)
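
The watchdog.c hotplug callbacks now check watchdog_allowed_mask before enabling or disabling the per-CPU watchdog, so a CPU outside the configured mask is left untouched as it goes on or offline. A rough kernel-style sketch of that membership guard (all names hypothetical):

#include <linux/cpumask.h>

static struct cpumask my_allowed_mask;

static void my_percpu_enable(unsigned int cpu)  { /* arm the per-CPU timer */ }
static void my_percpu_disable(unsigned int cpu) { /* disarm it            */ }

static int my_online_cpu(unsigned int cpu)
{
        /* Only act on CPUs the administrator actually selected. */
        if (cpumask_test_cpu(cpu, &my_allowed_mask))
                my_percpu_enable(cpu);
        return 0;
}

static int my_offline_cpu(unsigned int cpu)
{
        if (cpumask_test_cpu(cpu, &my_allowed_mask))
                my_percpu_disable(cpu);
        return 0;
}
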
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 71381168dede..247bf0b1582c 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -135,7 +135,8 @@ static void watchdog_overflow_callback(struct perf_event *event,
135 if (__this_cpu_read(hard_watchdog_warn) == true) 135 if (__this_cpu_read(hard_watchdog_warn) == true)
136 return; 136 return;
137 137
138 pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu); 138 pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n",
139 this_cpu);
139 print_modules(); 140 print_modules();
140 print_irqtrace_events(current); 141 print_irqtrace_events(current);
141 if (regs) 142 if (regs)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 4026d1871407..faf7622246da 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -841,43 +841,32 @@ static void wake_up_worker(struct worker_pool *pool)
841} 841}
842 842
843/** 843/**
844 * wq_worker_waking_up - a worker is waking up 844 * wq_worker_running - a worker is running again
845 * @task: task waking up 845 * @task: task waking up
846 * @cpu: CPU @task is waking up to
847 * 846 *
848 * This function is called during try_to_wake_up() when a worker is 847 * This function is called when a worker returns from schedule()
849 * being awoken.
850 *
851 * CONTEXT:
852 * spin_lock_irq(rq->lock)
853 */ 848 */
854void wq_worker_waking_up(struct task_struct *task, int cpu) 849void wq_worker_running(struct task_struct *task)
855{ 850{
856 struct worker *worker = kthread_data(task); 851 struct worker *worker = kthread_data(task);
857 852
858 if (!(worker->flags & WORKER_NOT_RUNNING)) { 853 if (!worker->sleeping)
859 WARN_ON_ONCE(worker->pool->cpu != cpu); 854 return;
855 if (!(worker->flags & WORKER_NOT_RUNNING))
860 atomic_inc(&worker->pool->nr_running); 856 atomic_inc(&worker->pool->nr_running);
861 } 857 worker->sleeping = 0;
862} 858}
863 859
864/** 860/**
865 * wq_worker_sleeping - a worker is going to sleep 861 * wq_worker_sleeping - a worker is going to sleep
866 * @task: task going to sleep 862 * @task: task going to sleep
867 * 863 *
868 * This function is called during schedule() when a busy worker is 864 * This function is called from schedule() when a busy worker is
869 * going to sleep. Worker on the same cpu can be woken up by 865 * going to sleep.
870 * returning pointer to its task.
871 *
872 * CONTEXT:
873 * spin_lock_irq(rq->lock)
874 *
875 * Return:
876 * Worker task on @cpu to wake up, %NULL if none.
877 */ 866 */
878struct task_struct *wq_worker_sleeping(struct task_struct *task) 867void wq_worker_sleeping(struct task_struct *task)
879{ 868{
880 struct worker *worker = kthread_data(task), *to_wakeup = NULL; 869 struct worker *next, *worker = kthread_data(task);
881 struct worker_pool *pool; 870 struct worker_pool *pool;
882 871
883 /* 872 /*
@@ -886,13 +875,15 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task)
886 * checking NOT_RUNNING. 875 * checking NOT_RUNNING.
887 */ 876 */
888 if (worker->flags & WORKER_NOT_RUNNING) 877 if (worker->flags & WORKER_NOT_RUNNING)
889 return NULL; 878 return;
890 879
891 pool = worker->pool; 880 pool = worker->pool;
892 881
893 /* this can only happen on the local cpu */ 882 if (WARN_ON_ONCE(worker->sleeping))
894 if (WARN_ON_ONCE(pool->cpu != raw_smp_processor_id())) 883 return;
895 return NULL; 884
885 worker->sleeping = 1;
886 spin_lock_irq(&pool->lock);
896 887
897 /* 888 /*
898 * The counterpart of the following dec_and_test, implied mb, 889 * The counterpart of the following dec_and_test, implied mb,
@@ -906,9 +897,12 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task)
906 * lock is safe. 897 * lock is safe.
907 */ 898 */
908 if (atomic_dec_and_test(&pool->nr_running) && 899 if (atomic_dec_and_test(&pool->nr_running) &&
909 !list_empty(&pool->worklist)) 900 !list_empty(&pool->worklist)) {
910 to_wakeup = first_idle_worker(pool); 901 next = first_idle_worker(pool);
911 return to_wakeup ? to_wakeup->task : NULL; 902 if (next)
903 wake_up_process(next->task);
904 }
905 spin_unlock_irq(&pool->lock);
912} 906}
913 907
914/** 908/**
@@ -2277,7 +2271,7 @@ __acquires(&pool->lock)
2277 2271
2278 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { 2272 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
2279 pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n" 2273 pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n"
2280 " last function: %pf\n", 2274 " last function: %ps\n",
2281 current->comm, preempt_count(), task_pid_nr(current), 2275 current->comm, preempt_count(), task_pid_nr(current),
2282 worker->current_func); 2276 worker->current_func);
2283 debug_show_held_locks(current); 2277 debug_show_held_locks(current);
@@ -2596,11 +2590,11 @@ static void check_flush_dependency(struct workqueue_struct *target_wq,
2596 worker = current_wq_worker(); 2590 worker = current_wq_worker();
2597 2591
2598 WARN_ONCE(current->flags & PF_MEMALLOC, 2592 WARN_ONCE(current->flags & PF_MEMALLOC,
2599 "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%pf", 2593 "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%ps",
2600 current->pid, current->comm, target_wq->name, target_func); 2594 current->pid, current->comm, target_wq->name, target_func);
2601 WARN_ONCE(worker && ((worker->current_pwq->wq->flags & 2595 WARN_ONCE(worker && ((worker->current_pwq->wq->flags &
2602 (WQ_MEM_RECLAIM | __WQ_LEGACY)) == WQ_MEM_RECLAIM), 2596 (WQ_MEM_RECLAIM | __WQ_LEGACY)) == WQ_MEM_RECLAIM),
2603 "workqueue: WQ_MEM_RECLAIM %s:%pf is flushing !WQ_MEM_RECLAIM %s:%pf", 2597 "workqueue: WQ_MEM_RECLAIM %s:%ps is flushing !WQ_MEM_RECLAIM %s:%ps",
2604 worker->current_pwq->wq->name, worker->current_func, 2598 worker->current_pwq->wq->name, worker->current_func,
2605 target_wq->name, target_func); 2599 target_wq->name, target_func);
2606} 2600}
@@ -4266,7 +4260,7 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
4266 INIT_LIST_HEAD(&wq->list); 4260 INIT_LIST_HEAD(&wq->list);
4267 4261
4268 if (alloc_and_link_pwqs(wq) < 0) 4262 if (alloc_and_link_pwqs(wq) < 0)
4269 goto err_free_wq; 4263 goto err_unreg_lockdep;
4270 4264
4271 if (wq_online && init_rescuer(wq) < 0) 4265 if (wq_online && init_rescuer(wq) < 0)
4272 goto err_destroy; 4266 goto err_destroy;
@@ -4292,9 +4286,10 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
4292 4286
4293 return wq; 4287 return wq;
4294 4288
4295err_free_wq: 4289err_unreg_lockdep:
4296 wq_unregister_lockdep(wq); 4290 wq_unregister_lockdep(wq);
4297 wq_free_lockdep(wq); 4291 wq_free_lockdep(wq);
4292err_free_wq:
4298 free_workqueue_attrs(wq->unbound_attrs); 4293 free_workqueue_attrs(wq->unbound_attrs);
4299 kfree(wq); 4294 kfree(wq);
4300 return NULL; 4295 return NULL;
@@ -4586,7 +4581,7 @@ void print_worker_info(const char *log_lvl, struct task_struct *task)
4586 probe_kernel_read(desc, worker->desc, sizeof(desc) - 1); 4581 probe_kernel_read(desc, worker->desc, sizeof(desc) - 1);
4587 4582
4588 if (fn || name[0] || desc[0]) { 4583 if (fn || name[0] || desc[0]) {
4589 printk("%sWorkqueue: %s %pf", log_lvl, name, fn); 4584 printk("%sWorkqueue: %s %ps", log_lvl, name, fn);
4590 if (strcmp(name, desc)) 4585 if (strcmp(name, desc))
4591 pr_cont(" (%s)", desc); 4586 pr_cont(" (%s)", desc);
4592 pr_cont("\n"); 4587 pr_cont("\n");
@@ -4611,7 +4606,7 @@ static void pr_cont_work(bool comma, struct work_struct *work)
4611 pr_cont("%s BAR(%d)", comma ? "," : "", 4606 pr_cont("%s BAR(%d)", comma ? "," : "",
4612 task_pid_nr(barr->task)); 4607 task_pid_nr(barr->task));
4613 } else { 4608 } else {
4614 pr_cont("%s %pf", comma ? "," : "", work->func); 4609 pr_cont("%s %ps", comma ? "," : "", work->func);
4615 } 4610 }
4616} 4611}
4617 4612
@@ -4643,7 +4638,7 @@ static void show_pwq(struct pool_workqueue *pwq)
4643 if (worker->current_pwq != pwq) 4638 if (worker->current_pwq != pwq)
4644 continue; 4639 continue;
4645 4640
4646 pr_cont("%s %d%s:%pf", comma ? "," : "", 4641 pr_cont("%s %d%s:%ps", comma ? "," : "",
4647 task_pid_nr(worker->task), 4642 task_pid_nr(worker->task),
4648 worker == pwq->wq->rescuer ? "(RESCUER)" : "", 4643 worker == pwq->wq->rescuer ? "(RESCUER)" : "",
4649 worker->current_func); 4644 worker->current_func);
@@ -4928,7 +4923,7 @@ static void rebind_workers(struct worker_pool *pool)
4928 * 4923 *
4929 * WRITE_ONCE() is necessary because @worker->flags may be 4924 * WRITE_ONCE() is necessary because @worker->flags may be
4930 * tested without holding any lock in 4925 * tested without holding any lock in
4931 * wq_worker_waking_up(). Without it, NOT_RUNNING test may 4926 * wq_worker_running(). Without it, NOT_RUNNING test may
4932 * fail incorrectly leading to premature concurrency 4927 * fail incorrectly leading to premature concurrency
4933 * management operations. 4928 * management operations.
4934 */ 4929 */
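
The workqueue.c rework drops the wq_worker_waking_up()/wq_worker_sleeping() pair that handed a task pointer back to the scheduler and replaces it with wq_worker_running()/wq_worker_sleeping(): a per-worker sleeping flag keeps the nr_running accounting balanced across repeated calls, and the idle-worker wakeup now happens inside wq_worker_sleeping() itself under pool->lock. A tiny self-contained C analog of the balanced-flag idea (names invented for illustration):

#include <stdio.h>

struct worker {
        int sleeping;           /* 1 while we have given our slot back */
};

static int nr_running = 1;      /* this worker currently counts as running */

/* Called when the worker blocks: give the concurrency slot back once. */
static void worker_sleeping(struct worker *w)
{
        if (w->sleeping)
                return;         /* already accounted for this sleep */
        w->sleeping = 1;
        nr_running--;
}

/* Called when the worker resumes: reclaim the slot exactly once. */
static void worker_running(struct worker *w)
{
        if (!w->sleeping)
                return;
        nr_running++;
        w->sleeping = 0;
}

int main(void)
{
        struct worker w = { 0 };

        worker_sleeping(&w);
        worker_sleeping(&w);    /* harmless: no double decrement */
        worker_running(&w);
        worker_running(&w);     /* harmless: no double increment */

        printf("nr_running=%d\n", nr_running);  /* 1 */
        return 0;
}
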
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
index cb68b03ca89a..498de0e909a4 100644
--- a/kernel/workqueue_internal.h
+++ b/kernel/workqueue_internal.h
@@ -44,6 +44,7 @@ struct worker {
44 unsigned long last_active; /* L: last active timestamp */ 44 unsigned long last_active; /* L: last active timestamp */
45 unsigned int flags; /* X: flags */ 45 unsigned int flags; /* X: flags */
46 int id; /* I: worker id */ 46 int id; /* I: worker id */
47 int sleeping; /* None */
47 48
48 /* 49 /*
49 * Opaque string set with work_set_desc(). Printed out with task 50 * Opaque string set with work_set_desc(). Printed out with task
@@ -72,8 +73,8 @@ static inline struct worker *current_wq_worker(void)
72 * Scheduler hooks for concurrency managed workqueue. Only to be used from 73 * Scheduler hooks for concurrency managed workqueue. Only to be used from
73 * sched/ and workqueue.c. 74 * sched/ and workqueue.c.
74 */ 75 */
75void wq_worker_waking_up(struct task_struct *task, int cpu); 76void wq_worker_running(struct task_struct *task);
76struct task_struct *wq_worker_sleeping(struct task_struct *task); 77void wq_worker_sleeping(struct task_struct *task);
77work_func_t wq_worker_last_func(struct task_struct *task); 78work_func_t wq_worker_last_func(struct task_struct *task);
78 79
79#endif /* _KERNEL_WORKQUEUE_INTERNAL_H */ 80#endif /* _KERNEL_WORKQUEUE_INTERNAL_H */