Diffstat (limited to 'kernel')
126 files changed, 4138 insertions, 2955 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore
index 6e699100872f..34d1e77ee9df 100644
--- a/kernel/.gitignore
+++ b/kernel/.gitignore
@@ -1,5 +1,6 @@
 #
 # Generated files
 #
+kheaders.md5
 timeconst.h
 hz.bc
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index fbba478ae522..bf770d7556f7 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -229,7 +229,7 @@ config MUTEX_SPIN_ON_OWNER
 
 config RWSEM_SPIN_ON_OWNER
 	def_bool y
-	depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW
+	depends on SMP && ARCH_SUPPORTS_ATOMIC_RMW
 
 config LOCK_SPIN_ON_OWNER
 	def_bool y
@@ -251,3 +251,10 @@ config ARCH_USE_QUEUED_RWLOCKS
 config QUEUED_RWLOCKS
 	def_bool y if ARCH_USE_QUEUED_RWLOCKS
 	depends on SMP
+
+config ARCH_HAS_MMIOWB
+	bool
+
+config MMIOWB
+	def_bool y if ARCH_HAS_MMIOWB
+	depends on SMP
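The new MMIOWB symbols are selected by architectures that can order MMIO writes at spinlock release. A hedged C sketch of the driver-side consequence follows; the device structure, register offset, and function names are invented for illustration and are not part of this diff.

#include <linux/io.h>
#include <linux/spinlock.h>
#include <linux/types.h>

struct fake_nic {
	spinlock_t lock;
	void __iomem *regs;
};

static void fake_nic_kick(struct fake_nic *nic, u32 tail)
{
	unsigned long flags;

	spin_lock_irqsave(&nic->lock, flags);
	writel(tail, nic->regs + 0x10);	/* hypothetical doorbell register */
	/* On arches selecting ARCH_HAS_MMIOWB, no explicit mmiowb() is
	 * needed here: the unlock orders the MMIO write against the next
	 * lock holder's writes.
	 */
	spin_unlock_irqrestore(&nic->lock, flags);
}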
diff --git a/kernel/Makefile b/kernel/Makefile
index 6c57e78817da..298437bb2c6a 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -30,6 +30,7 @@ KCOV_INSTRUMENT_extable.o := n
 # Don't self-instrument.
 KCOV_INSTRUMENT_kcov.o := n
 KASAN_SANITIZE_kcov.o := n
+CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
 
 # cond_syscall is currently not LTO compatible
 CFLAGS_sys_ni.o = $(DISABLE_LTO)
@@ -70,6 +71,7 @@ obj-$(CONFIG_UTS_NS) += utsname.o
 obj-$(CONFIG_USER_NS) += user_namespace.o
 obj-$(CONFIG_PID_NS) += pid_namespace.o
 obj-$(CONFIG_IKCONFIG) += configs.o
+obj-$(CONFIG_IKHEADERS_PROC) += kheaders.o
 obj-$(CONFIG_SMP) += stop_machine.o
 obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
 obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
@@ -121,3 +123,12 @@ $(obj)/configs.o: $(obj)/config_data.gz
 targets += config_data.gz
 $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
 	$(call if_changed,gzip)
+
+$(obj)/kheaders.o: $(obj)/kheaders_data.tar.xz
+
+quiet_cmd_genikh = CHK     $(obj)/kheaders_data.tar.xz
+cmd_genikh = $(srctree)/kernel/gen_ikh_data.sh $@
+$(obj)/kheaders_data.tar.xz: FORCE
+	$(call cmd,genikh)
+
+clean-files := kheaders_data.tar.xz kheaders.md5
diff --git a/kernel/acct.c b/kernel/acct.c
index addf7732fb56..81f9831a7859 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -227,7 +227,7 @@ static int acct_on(struct filename *pathname)
 		filp_close(file, NULL);
 		return PTR_ERR(internal);
 	}
-	err = mnt_want_write(internal);
+	err = __mnt_want_write(internal);
 	if (err) {
 		mntput(internal);
 		kfree(acct);
@@ -252,7 +252,7 @@ static int acct_on(struct filename *pathname)
 	old = xchg(&ns->bacct, &acct->pin);
 	mutex_unlock(&acct->lock);
 	pin_kill(old);
-	mnt_drop_write(mnt);
+	__mnt_drop_write(mnt);
 	mntput(mnt);
 	return 0;
 }
diff --git a/kernel/async.c b/kernel/async.c
index f6bd0d9885e1..12c332e4e13e 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -119,7 +119,7 @@ static void async_run_entry_fn(struct work_struct *work)
 
 	/* 1) run (and print duration) */
 	if (initcall_debug && system_state < SYSTEM_RUNNING) {
-		pr_debug("calling %lli_%pF @ %i\n",
+		pr_debug("calling %lli_%pS @ %i\n",
 			 (long long)entry->cookie,
 			 entry->func, task_pid_nr(current));
 		calltime = ktime_get();
@@ -128,7 +128,7 @@ static void async_run_entry_fn(struct work_struct *work)
 	if (initcall_debug && system_state < SYSTEM_RUNNING) {
 		rettime = ktime_get();
 		delta = ktime_sub(rettime, calltime);
-		pr_debug("initcall %lli_%pF returned 0 after %lld usecs\n",
+		pr_debug("initcall %lli_%pS returned 0 after %lld usecs\n",
 			 (long long)entry->cookie,
 			 entry->func,
 			 (long long)ktime_to_ns(delta) >> 10);
diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c
index 1323360d90e3..a563c8fdad0d 100644
--- a/kernel/backtracetest.c
+++ b/kernel/backtracetest.c
@@ -48,19 +48,14 @@ static void backtrace_test_irq(void)
 #ifdef CONFIG_STACKTRACE
 static void backtrace_test_saved(void)
 {
-	struct stack_trace trace;
 	unsigned long entries[8];
+	unsigned int nr_entries;
 
 	pr_info("Testing a saved backtrace.\n");
 	pr_info("The following trace is a kernel self test and not a bug!\n");
 
-	trace.nr_entries = 0;
-	trace.max_entries = ARRAY_SIZE(entries);
-	trace.entries = entries;
-	trace.skip = 0;
-
-	save_stack_trace(&trace);
-	print_stack_trace(&trace, 0);
+	nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
+	stack_trace_print(entries, nr_entries, 0);
 }
 #else
 static void backtrace_test_saved(void)
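The hunk above is one of several conversions in this series from the struct stack_trace interface to the flat stack_trace_save()/stack_trace_print() helpers. A minimal sketch of the new calling convention, independent of this test file (the function name is illustrative):

#include <linux/kernel.h>
#include <linux/stacktrace.h>

static void dump_current_stack(void)
{
	unsigned long entries[16];
	unsigned int nr;

	/* Save up to 16 return addresses of the current task, skipping no frames. */
	nr = stack_trace_save(entries, ARRAY_SIZE(entries), 0);

	/* Print them via printk with no extra indentation. */
	stack_trace_print(entries, nr, 0);
}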
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index ff09d32a8a1b..c605397c79f0 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -848,7 +848,6 @@ void __weak bpf_jit_free(struct bpf_prog *fp)
 	if (fp->jited) {
 		struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp);
 
-		bpf_jit_binary_unlock_ro(hdr);
 		bpf_jit_binary_free(hdr);
 
 		WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp));
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 8974b3755670..3c18260403dd 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -162,10 +162,14 @@ static void cpu_map_kthread_stop(struct work_struct *work)
 static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
 					 struct xdp_frame *xdpf)
 {
+	unsigned int hard_start_headroom;
 	unsigned int frame_size;
 	void *pkt_data_start;
 	struct sk_buff *skb;
 
+	/* Part of headroom was reserved to xdpf */
+	hard_start_headroom = sizeof(struct xdp_frame) + xdpf->headroom;
+
 	/* build_skb need to place skb_shared_info after SKB end, and
 	 * also want to know the memory "truesize". Thus, need to
 	 * know the memory frame size backing xdp_buff.
@@ -183,15 +187,15 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
 	 * is not at a fixed memory location, with mixed length
 	 * packets, which is bad for cache-line hotness.
 	 */
-	frame_size = SKB_DATA_ALIGN(xdpf->len + xdpf->headroom) +
+	frame_size = SKB_DATA_ALIGN(xdpf->len + hard_start_headroom) +
 		SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 
-	pkt_data_start = xdpf->data - xdpf->headroom;
+	pkt_data_start = xdpf->data - hard_start_headroom;
 	skb = build_skb(pkt_data_start, frame_size);
 	if (!skb)
 		return NULL;
 
-	skb_reserve(skb, xdpf->headroom);
+	skb_reserve(skb, hard_start_headroom);
 	__skb_put(skb, xdpf->len);
 	if (xdpf->metasize)
 		skb_metadata_set(skb, xdpf->metasize);
@@ -205,6 +209,9 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
 	 * - RX ring dev queue index	(skb_record_rx_queue)
 	 */
 
+	/* Allow SKB to reuse area used by xdp_frame */
+	xdp_scrub_frame(xdpf);
+
 	return skb;
 }
 
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 2ada5e21dfa6..bc53e5b20ddc 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -554,19 +554,6 @@ struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type typ
 }
 EXPORT_SYMBOL(bpf_prog_get_type_path);
 
-static void bpf_evict_inode(struct inode *inode)
-{
-	enum bpf_type type;
-
-	truncate_inode_pages_final(&inode->i_data);
-	clear_inode(inode);
-
-	if (S_ISLNK(inode->i_mode))
-		kfree(inode->i_link);
-	if (!bpf_inode_type(inode, &type))
-		bpf_any_put(inode->i_private, type);
-}
-
 /*
  * Display the mount options in /proc/mounts.
  */
@@ -579,11 +566,22 @@ static int bpf_show_options(struct seq_file *m, struct dentry *root)
 	return 0;
 }
 
+static void bpf_free_inode(struct inode *inode)
+{
+	enum bpf_type type;
+
+	if (S_ISLNK(inode->i_mode))
+		kfree(inode->i_link);
+	if (!bpf_inode_type(inode, &type))
+		bpf_any_put(inode->i_private, type);
+	free_inode_nonrcu(inode);
+}
+
 static const struct super_operations bpf_super_ops = {
 	.statfs		= simple_statfs,
 	.drop_inode	= generic_delete_inode,
 	.show_options	= bpf_show_options,
-	.evict_inode	= bpf_evict_inode,
+	.free_inode	= bpf_free_inode,
 };
 
 enum {
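For context on the hunk above: the new ->free_inode() callback is invoked from the RCU callback that tears the inode down, so per-inode allocations such as symlink bodies can be freed there without racing RCU path walks, and a filesystem providing the callback releases the inode object itself with free_inode_nonrcu(). A hedged sketch for a hypothetical filesystem (names invented, not from this patch):

#include <linux/fs.h>
#include <linux/slab.h>

static void examplefs_free_inode(struct inode *inode)
{
	if (S_ISLNK(inode->i_mode))
		kfree(inode->i_link);	/* safe: the RCU grace period has elapsed */
	free_inode_nonrcu(inode);	/* free the inode object itself */
}

static const struct super_operations examplefs_sops = {
	.statfs		= simple_statfs,
	.drop_inode	= generic_delete_inode,
	.free_inode	= examplefs_free_inode,
};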
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 62f6bced3a3c..afca36f53c49 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -136,21 +136,29 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
 
 void *bpf_map_area_alloc(size_t size, int numa_node)
 {
-	/* We definitely need __GFP_NORETRY, so OOM killer doesn't
-	 * trigger under memory pressure as we really just want to
-	 * fail instead.
+	/* We really just want to fail instead of triggering OOM killer
+	 * under memory pressure, therefore we set __GFP_NORETRY to kmalloc,
+	 * which is used for lower order allocation requests.
+	 *
+	 * It has been observed that higher order allocation requests done by
+	 * vmalloc with __GFP_NORETRY being set might fail due to not trying
+	 * to reclaim memory from the page cache, thus we set
+	 * __GFP_RETRY_MAYFAIL to avoid such situations.
 	 */
-	const gfp_t flags = __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO;
+
+	const gfp_t flags = __GFP_NOWARN | __GFP_ZERO;
 	void *area;
 
 	if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
-		area = kmalloc_node(size, GFP_USER | flags, numa_node);
+		area = kmalloc_node(size, GFP_USER | __GFP_NORETRY | flags,
+				    numa_node);
 		if (area != NULL)
 			return area;
 	}
 
-	return __vmalloc_node_flags_caller(size, numa_node, GFP_KERNEL | flags,
-					    __builtin_return_address(0));
+	return __vmalloc_node_flags_caller(size, numa_node,
+					   GFP_KERNEL | __GFP_RETRY_MAYFAIL |
+					   flags, __builtin_return_address(0));
 }
 
 void bpf_map_area_free(void *area)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index ce166a002d16..09d5d972c9ff 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -212,7 +212,7 @@ struct bpf_call_arg_meta {
 	int access_size;
 	s64 msize_smax_value;
 	u64 msize_umax_value;
-	int ptr_id;
+	int ref_obj_id;
 	int func_id;
 };
 
@@ -346,35 +346,23 @@ static bool reg_type_may_be_null(enum bpf_reg_type type)
 	       type == PTR_TO_TCP_SOCK_OR_NULL;
 }
 
-static bool type_is_refcounted(enum bpf_reg_type type)
-{
-	return type == PTR_TO_SOCKET;
-}
-
-static bool type_is_refcounted_or_null(enum bpf_reg_type type)
-{
-	return type == PTR_TO_SOCKET || type == PTR_TO_SOCKET_OR_NULL;
-}
-
-static bool reg_is_refcounted(const struct bpf_reg_state *reg)
-{
-	return type_is_refcounted(reg->type);
-}
-
 static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
 {
 	return reg->type == PTR_TO_MAP_VALUE &&
 		map_value_has_spin_lock(reg->map_ptr);
 }
 
-static bool reg_is_refcounted_or_null(const struct bpf_reg_state *reg)
+static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type)
 {
-	return type_is_refcounted_or_null(reg->type);
+	return type == PTR_TO_SOCKET ||
+		type == PTR_TO_SOCKET_OR_NULL ||
+		type == PTR_TO_TCP_SOCK ||
+		type == PTR_TO_TCP_SOCK_OR_NULL;
 }
 
-static bool arg_type_is_refcounted(enum bpf_arg_type type)
+static bool arg_type_may_be_refcounted(enum bpf_arg_type type)
 {
-	return type == ARG_PTR_TO_SOCKET;
+	return type == ARG_PTR_TO_SOCK_COMMON;
 }
 
 /* Determine whether the function releases some resources allocated by another
@@ -392,6 +380,12 @@ static bool is_acquire_function(enum bpf_func_id func_id)
 		func_id == BPF_FUNC_sk_lookup_udp;
 }
 
+static bool is_ptr_cast_function(enum bpf_func_id func_id)
+{
+	return func_id == BPF_FUNC_tcp_sock ||
+		func_id == BPF_FUNC_sk_fullsock;
+}
+
 /* string representation of 'enum bpf_reg_type' */
 static const char * const reg_type_str[] = {
 	[NOT_INIT]		= "?",
@@ -466,6 +460,8 @@ static void print_verifier_state(struct bpf_verifier_env *env,
 			verbose(env, ",call_%d", func(env, reg)->callsite);
 		} else {
 			verbose(env, "(id=%d", reg->id);
+			if (reg_type_may_be_refcounted_or_null(t))
+				verbose(env, ",ref_obj_id=%d", reg->ref_obj_id);
 			if (t != SCALAR_VALUE)
 				verbose(env, ",off=%d", reg->off);
 			if (type_is_pkt_pointer(t))
@@ -1901,8 +1897,9 @@ continue_func:
 	}
 	frame++;
 	if (frame >= MAX_CALL_FRAMES) {
-		WARN_ONCE(1, "verifier bug. Call stack is too deep\n");
-		return -EFAULT;
+		verbose(env, "the call stack of %d frames is too deep !\n",
+			frame);
+		return -E2BIG;
 	}
 	goto process_func;
 }
@@ -2414,16 +2411,15 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		/* Any sk pointer can be ARG_PTR_TO_SOCK_COMMON */
 		if (!type_is_sk_pointer(type))
 			goto err_type;
-	} else if (arg_type == ARG_PTR_TO_SOCKET) {
-		expected_type = PTR_TO_SOCKET;
-		if (type != expected_type)
-			goto err_type;
-		if (meta->ptr_id || !reg->id) {
-			verbose(env, "verifier internal error: mismatched references meta=%d, reg=%d\n",
-				meta->ptr_id, reg->id);
-			return -EFAULT;
+		if (reg->ref_obj_id) {
+			if (meta->ref_obj_id) {
+				verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
+					regno, reg->ref_obj_id,
+					meta->ref_obj_id);
+				return -EFAULT;
+			}
+			meta->ref_obj_id = reg->ref_obj_id;
 		}
-		meta->ptr_id = reg->id;
 	} else if (arg_type == ARG_PTR_TO_SPIN_LOCK) {
 		if (meta->func_id == BPF_FUNC_spin_lock) {
 			if (process_spin_lock(env, regno, true))
@@ -2740,32 +2736,38 @@ static bool check_arg_pair_ok(const struct bpf_func_proto *fn)
 	return true;
 }
 
-static bool check_refcount_ok(const struct bpf_func_proto *fn)
+static bool check_refcount_ok(const struct bpf_func_proto *fn, int func_id)
 {
 	int count = 0;
 
-	if (arg_type_is_refcounted(fn->arg1_type))
+	if (arg_type_may_be_refcounted(fn->arg1_type))
 		count++;
-	if (arg_type_is_refcounted(fn->arg2_type))
+	if (arg_type_may_be_refcounted(fn->arg2_type))
 		count++;
-	if (arg_type_is_refcounted(fn->arg3_type))
+	if (arg_type_may_be_refcounted(fn->arg3_type))
 		count++;
-	if (arg_type_is_refcounted(fn->arg4_type))
+	if (arg_type_may_be_refcounted(fn->arg4_type))
 		count++;
-	if (arg_type_is_refcounted(fn->arg5_type))
+	if (arg_type_may_be_refcounted(fn->arg5_type))
 		count++;
 
+	/* A reference acquiring function cannot acquire
+	 * another refcounted ptr.
+	 */
+	if (is_acquire_function(func_id) && count)
+		return false;
+
 	/* We only support one arg being unreferenced at the moment,
 	 * which is sufficient for the helper functions we have right now.
 	 */
 	return count <= 1;
 }
 
-static int check_func_proto(const struct bpf_func_proto *fn)
+static int check_func_proto(const struct bpf_func_proto *fn, int func_id)
 {
 	return check_raw_mode_ok(fn) &&
 	       check_arg_pair_ok(fn) &&
-	       check_refcount_ok(fn) ? 0 : -EINVAL;
+	       check_refcount_ok(fn, func_id) ? 0 : -EINVAL;
 }
 
 /* Packet data might have moved, any old PTR_TO_PACKET[_META,_END]
@@ -2799,19 +2801,20 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
 }
 
 static void release_reg_references(struct bpf_verifier_env *env,
-				   struct bpf_func_state *state, int id)
+				   struct bpf_func_state *state,
+				   int ref_obj_id)
 {
 	struct bpf_reg_state *regs = state->regs, *reg;
 	int i;
 
 	for (i = 0; i < MAX_BPF_REG; i++)
-		if (regs[i].id == id)
+		if (regs[i].ref_obj_id == ref_obj_id)
 			mark_reg_unknown(env, regs, i);
 
 	bpf_for_each_spilled_reg(i, state, reg) {
 		if (!reg)
 			continue;
-		if (reg_is_refcounted(reg) && reg->id == id)
+		if (reg->ref_obj_id == ref_obj_id)
 			__mark_reg_unknown(reg);
 	}
 }
@@ -2820,15 +2823,20 @@ static void release_reg_references(struct bpf_verifier_env *env,
  * resources. Identify all copies of the same pointer and clear the reference.
  */
 static int release_reference(struct bpf_verifier_env *env,
-			     struct bpf_call_arg_meta *meta)
+			     int ref_obj_id)
 {
 	struct bpf_verifier_state *vstate = env->cur_state;
+	int err;
 	int i;
 
+	err = release_reference_state(cur_func(env), ref_obj_id);
+	if (err)
+		return err;
+
 	for (i = 0; i <= vstate->curframe; i++)
-		release_reg_references(env, vstate->frame[i], meta->ptr_id);
+		release_reg_references(env, vstate->frame[i], ref_obj_id);
 
-	return release_reference_state(cur_func(env), meta->ptr_id);
+	return 0;
 }
 
 static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
@@ -3047,7 +3055,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 	memset(&meta, 0, sizeof(meta));
 	meta.pkt_access = fn->pkt_access;
 
-	err = check_func_proto(fn);
+	err = check_func_proto(fn, func_id);
 	if (err) {
 		verbose(env, "kernel subsystem misconfigured func %s#%d\n",
 			func_id_name(func_id), func_id);
@@ -3093,7 +3101,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 			return err;
 		}
 	} else if (is_release_function(func_id)) {
-		err = release_reference(env, &meta);
+		err = release_reference(env, meta.ref_obj_id);
 		if (err) {
 			verbose(env, "func %s#%d reference has not been acquired before\n",
 				func_id_name(func_id), func_id);
@@ -3154,8 +3162,10 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 
 		if (id < 0)
 			return id;
-		/* For release_reference() */
+		/* For mark_ptr_or_null_reg() */
 		regs[BPF_REG_0].id = id;
+		/* For release_reference() */
+		regs[BPF_REG_0].ref_obj_id = id;
 	} else {
 		/* For mark_ptr_or_null_reg() */
 		regs[BPF_REG_0].id = ++env->id_gen;
@@ -3170,6 +3180,10 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 		return -EINVAL;
 	}
 
+	if (is_ptr_cast_function(func_id))
+		/* For release_reference() */
+		regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id;
+
 	do_refine_retval_range(regs, fn->ret_type, func_id, &meta);
 
 	err = check_map_func_compatibility(env, meta.map_ptr, func_id);
@@ -3368,7 +3382,7 @@ do_sim:
 		*dst_reg = *ptr_reg;
 	}
 	ret = push_stack(env, env->insn_idx + 1, env->insn_idx, true);
-	if (!ptr_is_dst_reg)
+	if (!ptr_is_dst_reg && ret)
 		*dst_reg = tmp;
 	return !ret ? -EFAULT : 0;
 }
@@ -4124,15 +4138,35 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 	return 0;
 }
 
+static void __find_good_pkt_pointers(struct bpf_func_state *state,
+				     struct bpf_reg_state *dst_reg,
+				     enum bpf_reg_type type, u16 new_range)
+{
+	struct bpf_reg_state *reg;
+	int i;
+
+	for (i = 0; i < MAX_BPF_REG; i++) {
+		reg = &state->regs[i];
+		if (reg->type == type && reg->id == dst_reg->id)
+			/* keep the maximum range already checked */
+			reg->range = max(reg->range, new_range);
+	}
+
+	bpf_for_each_spilled_reg(i, state, reg) {
+		if (!reg)
+			continue;
+		if (reg->type == type && reg->id == dst_reg->id)
+			reg->range = max(reg->range, new_range);
+	}
+}
+
 static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
 				   struct bpf_reg_state *dst_reg,
 				   enum bpf_reg_type type,
 				   bool range_right_open)
 {
-	struct bpf_func_state *state = vstate->frame[vstate->curframe];
-	struct bpf_reg_state *regs = state->regs, *reg;
 	u16 new_range;
-	int i, j;
+	int i;
 
 	if (dst_reg->off < 0 ||
 	    (dst_reg->off == 0 && range_right_open))
@@ -4197,20 +4231,9 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
 	 * the range won't allow anything.
 	 * dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16.
 	 */
-	for (i = 0; i < MAX_BPF_REG; i++)
-		if (regs[i].type == type && regs[i].id == dst_reg->id)
-			/* keep the maximum range already checked */
-			regs[i].range = max(regs[i].range, new_range);
-
-	for (j = 0; j <= vstate->curframe; j++) {
-		state = vstate->frame[j];
-		bpf_for_each_spilled_reg(i, state, reg) {
-			if (!reg)
-				continue;
-			if (reg->type == type && reg->id == dst_reg->id)
-				reg->range = max(reg->range, new_range);
-		}
-	}
+	for (i = 0; i <= vstate->curframe; i++)
+		__find_good_pkt_pointers(vstate->frame[i], dst_reg, type,
+					 new_range);
 }
 
 /* compute branch direction of the expression "if (reg opcode val) goto target;"
@@ -4665,17 +4688,41 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
 	} else if (reg->type == PTR_TO_TCP_SOCK_OR_NULL) {
 		reg->type = PTR_TO_TCP_SOCK;
 	}
-	if (is_null || !(reg_is_refcounted(reg) ||
-			 reg_may_point_to_spin_lock(reg))) {
-		/* We don't need id from this point onwards anymore,
-		 * thus we should better reset it, so that state
-		 * pruning has chances to take effect.
+	if (is_null) {
+		/* We don't need id and ref_obj_id from this point
+		 * onwards anymore, thus we should better reset it,
+		 * so that state pruning has chances to take effect.
+		 */
+		reg->id = 0;
+		reg->ref_obj_id = 0;
+	} else if (!reg_may_point_to_spin_lock(reg)) {
+		/* For not-NULL ptr, reg->ref_obj_id will be reset
+		 * in release_reg_references().
+		 *
+		 * reg->id is still used by spin_lock ptr. Other
+		 * than spin_lock ptr type, reg->id can be reset.
 		 */
 		reg->id = 0;
 	}
 }
 
+static void __mark_ptr_or_null_regs(struct bpf_func_state *state, u32 id,
+				    bool is_null)
+{
+	struct bpf_reg_state *reg;
+	int i;
+
+	for (i = 0; i < MAX_BPF_REG; i++)
+		mark_ptr_or_null_reg(state, &state->regs[i], id, is_null);
+
+	bpf_for_each_spilled_reg(i, state, reg) {
+		if (!reg)
+			continue;
+		mark_ptr_or_null_reg(state, reg, id, is_null);
+	}
+}
+
 /* The logic is similar to find_good_pkt_pointers(), both could eventually
  * be folded together at some point.
  */
@@ -4683,24 +4730,20 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno,
 				  bool is_null)
 {
 	struct bpf_func_state *state = vstate->frame[vstate->curframe];
-	struct bpf_reg_state *reg, *regs = state->regs;
+	struct bpf_reg_state *regs = state->regs;
+	u32 ref_obj_id = regs[regno].ref_obj_id;
 	u32 id = regs[regno].id;
-	int i, j;
-
-	if (reg_is_refcounted_or_null(&regs[regno]) && is_null)
-		release_reference_state(state, id);
+	int i;
 
-	for (i = 0; i < MAX_BPF_REG; i++)
-		mark_ptr_or_null_reg(state, &regs[i], id, is_null);
+	if (ref_obj_id && ref_obj_id == id && is_null)
+		/* regs[regno] is in the " == NULL" branch.
+		 * No one could have freed the reference state before
+		 * doing the NULL check.
+		 */
+		WARN_ON_ONCE(release_reference_state(state, id));
 
-	for (j = 0; j <= vstate->curframe; j++) {
-		state = vstate->frame[j];
-		bpf_for_each_spilled_reg(i, state, reg) {
-			if (!reg)
-				continue;
-			mark_ptr_or_null_reg(state, reg, id, is_null);
-		}
-	}
+	for (i = 0; i <= vstate->curframe; i++)
+		__mark_ptr_or_null_regs(vstate->frame[i], id, is_null);
 }
 
 static bool try_match_pkt_pointers(const struct bpf_insn *insn,
@@ -6052,15 +6095,17 @@ static int propagate_liveness(struct bpf_verifier_env *env,
 	}
 	/* Propagate read liveness of registers... */
 	BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG);
-	/* We don't need to worry about FP liveness because it's read-only */
-	for (i = 0; i < BPF_REG_FP; i++) {
-		if (vparent->frame[vparent->curframe]->regs[i].live & REG_LIVE_READ)
-			continue;
-		if (vstate->frame[vstate->curframe]->regs[i].live & REG_LIVE_READ) {
-			err = mark_reg_read(env, &vstate->frame[vstate->curframe]->regs[i],
-					    &vparent->frame[vstate->curframe]->regs[i]);
-			if (err)
-				return err;
+	for (frame = 0; frame <= vstate->curframe; frame++) {
+		/* We don't need to worry about FP liveness, it's read-only */
+		for (i = frame < vstate->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) {
+			if (vparent->frame[frame]->regs[i].live & REG_LIVE_READ)
+				continue;
+			if (vstate->frame[frame]->regs[i].live & REG_LIVE_READ) {
+				err = mark_reg_read(env, &vstate->frame[frame]->regs[i],
+						    &vparent->frame[frame]->regs[i]);
+				if (err)
+					return err;
+			}
 		}
 	}
 
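The ref_obj_id bookkeeping above exists so that a pointer obtained by casting an acquired socket (for example via bpf_tcp_sock()) keeps the verifier's reference tracking intact. A hedged illustration of the program pattern involved follows; this program is not part of the patch, and the section name, program type, and helper availability are assumptions based on the usual libbpf conventions and the related selftests.

#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>

SEC("tc")
int ref_obj_id_example(struct __sk_buff *skb)
{
	struct bpf_sock_tuple tuple = {};	/* illustrative; normally filled from the packet */
	struct bpf_tcp_sock *tp;
	struct bpf_sock *sk;

	sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple.ipv4),
			       BPF_F_CURRENT_NETNS, 0);
	if (!sk)
		return TC_ACT_OK;

	/* tp aliases the acquired sk; with this patch it carries the same
	 * ref_obj_id, so the verifier still requires exactly one
	 * bpf_sk_release(sk), and that release also invalidates tp.
	 */
	tp = bpf_tcp_sock(sk);
	if (tp)
		bpf_printk("snd_cwnd=%u", tp->snd_cwnd);

	bpf_sk_release(sk);
	return TC_ACT_OK;
}

char _license[] SEC("license") = "GPL";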
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 4834c4214e9c..6a1942ed781c 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -740,11 +740,10 @@ static inline int nr_cpusets(void)
  * Must be called with cpuset_mutex held.
  *
  * The three key local variables below are:
- *  q  - a linked-list queue of cpuset pointers, used to implement a
- *	 top-down scan of all cpusets. This scan loads a pointer
- *	 to each cpuset marked is_sched_load_balance into the
- *	 array 'csa'. For our purposes, rebuilding the schedulers
- *	 sched domains, we can ignore !is_sched_load_balance cpusets.
+ *  cp - cpuset pointer, used (together with pos_css) to perform a
+ *	 top-down scan of all cpusets. For our purposes, rebuilding
+ *	 the schedulers sched domains, we can ignore !is_sched_load_
+ *	 balance cpusets.
  *  csa  - (for CpuSet Array) Array of pointers to all the cpusets
  *	   that need to be load balanced, for convenient iterative
  *	   access by the subsequent code that finds the best partition,
@@ -775,7 +774,7 @@ static inline int nr_cpusets(void)
 static int generate_sched_domains(cpumask_var_t **domains,
 			struct sched_domain_attr **attributes)
 {
-	struct cpuset *cp;	/* scans q */
+	struct cpuset *cp;	/* top-down scan of cpusets */
 	struct cpuset **csa;	/* array of all cpuset ptrs */
 	int csn;		/* how many cpuset ptrs in csa so far */
 	int i, j, k;		/* indices for partition finding loops */
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 025f419d16f6..f2ef10460698 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -9,6 +9,7 @@
 #include <linux/notifier.h>
 #include <linux/sched/signal.h>
 #include <linux/sched/hotplug.h>
+#include <linux/sched/isolation.h>
 #include <linux/sched/task.h>
 #include <linux/sched/smt.h>
 #include <linux/unistd.h>
@@ -564,6 +565,20 @@ static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st)
 		cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);
 }
 
+static inline bool can_rollback_cpu(struct cpuhp_cpu_state *st)
+{
+	if (IS_ENABLED(CONFIG_HOTPLUG_CPU))
+		return true;
+	/*
+	 * When CPU hotplug is disabled, then taking the CPU down is not
+	 * possible because takedown_cpu() and the architecture and
+	 * subsystem specific mechanisms are not available. So the CPU
+	 * which would be completely unplugged again needs to stay around
+	 * in the current state.
+	 */
+	return st->state <= CPUHP_BRINGUP_CPU;
+}
+
 static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
 			      enum cpuhp_state target)
 {
@@ -574,8 +589,10 @@ static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
 		st->state++;
 		ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
 		if (ret) {
-			st->target = prev_state;
-			undo_cpu_up(cpu, st);
+			if (can_rollback_cpu(st)) {
+				st->target = prev_state;
+				undo_cpu_up(cpu, st);
+			}
 			break;
 		}
 	}
@@ -844,6 +861,8 @@ static int take_cpu_down(void *_param)
 
 	/* Give up timekeeping duties */
 	tick_handover_do_timer();
+	/* Remove CPU from timer broadcasting */
+	tick_offline_cpu(cpu);
 	/* Park the stopper thread */
 	stop_machine_park(cpu);
 	return 0;
@@ -1183,8 +1202,15 @@ int freeze_secondary_cpus(int primary)
 	int cpu, error = 0;
 
 	cpu_maps_update_begin();
-	if (!cpu_online(primary))
+	if (primary == -1) {
 		primary = cpumask_first(cpu_online_mask);
+		if (!housekeeping_cpu(primary, HK_FLAG_TIMER))
+			primary = housekeeping_any_cpu(HK_FLAG_TIMER);
+	} else {
+		if (!cpu_online(primary))
+			primary = cpumask_first(cpu_online_mask);
+	}
+
 	/*
 	 * We take down all of the non-boot CPUs in one shot to avoid races
 	 * with the userspace trying to use the CPU hotplug at the same time
@@ -2017,19 +2043,6 @@ static const struct attribute_group cpuhp_cpu_root_attr_group = {
 
 #ifdef CONFIG_HOTPLUG_SMT
 
-static const char *smt_states[] = {
-	[CPU_SMT_ENABLED]		= "on",
-	[CPU_SMT_DISABLED]		= "off",
-	[CPU_SMT_FORCE_DISABLED]	= "forceoff",
-	[CPU_SMT_NOT_SUPPORTED]		= "notsupported",
-};
-
-static ssize_t
-show_smt_control(struct device *dev, struct device_attribute *attr, char *buf)
-{
-	return snprintf(buf, PAGE_SIZE - 2, "%s\n", smt_states[cpu_smt_control]);
-}
-
 static void cpuhp_offline_cpu_device(unsigned int cpu)
 {
 	struct device *dev = get_cpu_device(cpu);
@@ -2100,9 +2113,10 @@ static int cpuhp_smt_enable(void)
 	return ret;
 }
 
+
 static ssize_t
-store_smt_control(struct device *dev, struct device_attribute *attr,
-		  const char *buf, size_t count)
+__store_smt_control(struct device *dev, struct device_attribute *attr,
+		    const char *buf, size_t count)
 {
 	int ctrlval, ret;
 
@@ -2140,14 +2154,44 @@ store_smt_control(struct device *dev, struct device_attribute *attr,
 	unlock_device_hotplug();
 	return ret ? ret : count;
 }
+
+#else /* !CONFIG_HOTPLUG_SMT */
+static ssize_t
+__store_smt_control(struct device *dev, struct device_attribute *attr,
+		    const char *buf, size_t count)
+{
+	return -ENODEV;
+}
+#endif /* CONFIG_HOTPLUG_SMT */
+
+static const char *smt_states[] = {
+	[CPU_SMT_ENABLED]		= "on",
+	[CPU_SMT_DISABLED]		= "off",
+	[CPU_SMT_FORCE_DISABLED]	= "forceoff",
+	[CPU_SMT_NOT_SUPPORTED]		= "notsupported",
+	[CPU_SMT_NOT_IMPLEMENTED]	= "notimplemented",
+};
+
+static ssize_t
+show_smt_control(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	const char *state = smt_states[cpu_smt_control];
+
+	return snprintf(buf, PAGE_SIZE - 2, "%s\n", state);
+}
+
+static ssize_t
+store_smt_control(struct device *dev, struct device_attribute *attr,
+		  const char *buf, size_t count)
+{
+	return __store_smt_control(dev, attr, buf, count);
+}
 static DEVICE_ATTR(control, 0644, show_smt_control, store_smt_control);
 
 static ssize_t
 show_smt_active(struct device *dev, struct device_attribute *attr, char *buf)
 {
-	bool active = topology_max_smt_threads() > 1;
-
-	return snprintf(buf, PAGE_SIZE - 2, "%d\n", active);
+	return snprintf(buf, PAGE_SIZE - 2, "%d\n", sched_smt_active());
 }
 static DEVICE_ATTR(active, 0444, show_smt_active, NULL);
 
@@ -2163,21 +2207,17 @@ static const struct attribute_group cpuhp_smt_attr_group = {
 	NULL
 };
 
-static int __init cpu_smt_state_init(void)
+static int __init cpu_smt_sysfs_init(void)
 {
 	return sysfs_create_group(&cpu_subsys.dev_root->kobj,
 				  &cpuhp_smt_attr_group);
 }
 
-#else
-static inline int cpu_smt_state_init(void) { return 0; }
-#endif
-
 static int __init cpuhp_sysfs_init(void)
 {
 	int cpu, ret;
 
-	ret = cpu_smt_state_init();
+	ret = cpu_smt_sysfs_init();
 	if (ret)
 		return ret;
 
@@ -2198,7 +2238,7 @@ static int __init cpuhp_sysfs_init(void)
 	return 0;
 }
 device_initcall(cpuhp_sysfs_init);
-#endif
+#endif /* CONFIG_SYSFS && CONFIG_HOTPLUG_CPU */
 
 /*
  * cpu_bit_bitmap[] is a special, "compressed" data structure that
@@ -2288,3 +2328,18 @@ void __init boot_cpu_hotplug_init(void)
 #endif
 	this_cpu_write(cpuhp_state.state, CPUHP_ONLINE);
 }
+
+enum cpu_mitigations cpu_mitigations __ro_after_init = CPU_MITIGATIONS_AUTO;
+
+static int __init mitigations_parse_cmdline(char *arg)
+{
+	if (!strcmp(arg, "off"))
+		cpu_mitigations = CPU_MITIGATIONS_OFF;
+	else if (!strcmp(arg, "auto"))
+		cpu_mitigations = CPU_MITIGATIONS_AUTO;
+	else if (!strcmp(arg, "auto,nosmt"))
+		cpu_mitigations = CPU_MITIGATIONS_AUTO_NOSMT;
+
+	return 0;
+}
+early_param("mitigations", mitigations_parse_cmdline);
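The parser above only records the chosen mitigation mode; architecture code is expected to query it through helpers added alongside it in <linux/cpu.h> (cpu_mitigations_off() and cpu_mitigations_auto_nosmt(), named here from memory rather than from this hunk). A hedged sketch of that consumption pattern, with an invented mitigation:

#include <linux/cpu.h>
#include <linux/init.h>
#include <linux/printk.h>

static void __init enable_example_mitigation(void)
{
	/* hypothetical: poke chicken bits, flip a static key, ... */
}

static void __init example_select_mitigation(void)
{
	if (cpu_mitigations_off()) {
		pr_info("example: mitigation disabled via mitigations=off\n");
		return;
	}

	enable_example_mitigation();

	/* mirror the arch pattern for "mitigations=auto,nosmt" */
	if (cpu_mitigations_auto_nosmt())
		cpu_smt_disable(false);
}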
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index 45d51e8e26f6..badd77670d00 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -89,8 +89,8 @@ struct dma_debug_entry {
 	int		 sg_mapped_ents;
 	enum map_err_types  map_err_type;
 #ifdef CONFIG_STACKTRACE
-	struct		 stack_trace stacktrace;
-	unsigned long	 st_entries[DMA_DEBUG_STACKTRACE_ENTRIES];
+	unsigned int	stack_len;
+	unsigned long	stack_entries[DMA_DEBUG_STACKTRACE_ENTRIES];
 #endif
 };
 
@@ -174,7 +174,7 @@ static inline void dump_entry_trace(struct dma_debug_entry *entry)
 #ifdef CONFIG_STACKTRACE
 	if (entry) {
 		pr_warning("Mapped at:\n");
-		print_stack_trace(&entry->stacktrace, 0);
+		stack_trace_print(entry->stack_entries, entry->stack_len, 0);
 	}
 #endif
 }
@@ -704,12 +704,10 @@ static struct dma_debug_entry *dma_entry_alloc(void)
 	spin_unlock_irqrestore(&free_entries_lock, flags);
 
 #ifdef CONFIG_STACKTRACE
-	entry->stacktrace.max_entries = DMA_DEBUG_STACKTRACE_ENTRIES;
-	entry->stacktrace.entries = entry->st_entries;
-	entry->stacktrace.skip = 2;
-	save_stack_trace(&entry->stacktrace);
+	entry->stack_len = stack_trace_save(entry->stack_entries,
+					    ARRAY_SIZE(entry->stack_entries),
+					    1);
 #endif
-
 	return entry;
 }
 
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 53012db1e53c..6f7619c1f877 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -452,6 +452,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,
 	unsigned long mask;
 	unsigned long offset_slots;
 	unsigned long max_slots;
+	unsigned long tmp_io_tlb_used;
 
 	if (no_iotlb_memory)
 		panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer");
@@ -538,9 +539,12 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,
 	} while (index != wrap);
 
 not_found:
+	tmp_io_tlb_used = io_tlb_used;
+
 	spin_unlock_irqrestore(&io_tlb_lock, flags);
 	if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit())
-		dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes)\n", size);
+		dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n",
+			 size, io_tlb_nslabs, tmp_io_tlb_used);
 	return DMA_MAPPING_ERROR;
 found:
 	io_tlb_used += nslots;
diff --git a/kernel/events/core.c b/kernel/events/core.c index 1032a16bd186..abbd4b3b96c2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c | |||
@@ -2009,8 +2009,8 @@ event_sched_out(struct perf_event *event, | |||
2009 | event->pmu->del(event, 0); | 2009 | event->pmu->del(event, 0); |
2010 | event->oncpu = -1; | 2010 | event->oncpu = -1; |
2011 | 2011 | ||
2012 | if (event->pending_disable) { | 2012 | if (READ_ONCE(event->pending_disable) >= 0) { |
2013 | event->pending_disable = 0; | 2013 | WRITE_ONCE(event->pending_disable, -1); |
2014 | state = PERF_EVENT_STATE_OFF; | 2014 | state = PERF_EVENT_STATE_OFF; |
2015 | } | 2015 | } |
2016 | perf_event_set_state(event, state); | 2016 | perf_event_set_state(event, state); |
@@ -2198,7 +2198,8 @@ EXPORT_SYMBOL_GPL(perf_event_disable); | |||
2198 | 2198 | ||
2199 | void perf_event_disable_inatomic(struct perf_event *event) | 2199 | void perf_event_disable_inatomic(struct perf_event *event) |
2200 | { | 2200 | { |
2201 | event->pending_disable = 1; | 2201 | WRITE_ONCE(event->pending_disable, smp_processor_id()); |
2202 | /* can fail, see perf_pending_event_disable() */ | ||
2202 | irq_work_queue(&event->pending); | 2203 | irq_work_queue(&event->pending); |
2203 | } | 2204 | } |
2204 | 2205 | ||
@@ -2477,6 +2478,16 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, | |||
2477 | perf_pmu_enable(cpuctx->ctx.pmu); | 2478 | perf_pmu_enable(cpuctx->ctx.pmu); |
2478 | } | 2479 | } |
2479 | 2480 | ||
2481 | void perf_pmu_resched(struct pmu *pmu) | ||
2482 | { | ||
2483 | struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); | ||
2484 | struct perf_event_context *task_ctx = cpuctx->task_ctx; | ||
2485 | |||
2486 | perf_ctx_lock(cpuctx, task_ctx); | ||
2487 | ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU); | ||
2488 | perf_ctx_unlock(cpuctx, task_ctx); | ||
2489 | } | ||
2490 | |||
2480 | /* | 2491 | /* |
2481 | * Cross CPU call to install and enable a performance event | 2492 | * Cross CPU call to install and enable a performance event |
2482 | * | 2493 | * |
@@ -5810,10 +5821,45 @@ void perf_event_wakeup(struct perf_event *event) | |||
5810 | } | 5821 | } |
5811 | } | 5822 | } |
5812 | 5823 | ||
5824 | static void perf_pending_event_disable(struct perf_event *event) | ||
5825 | { | ||
5826 | int cpu = READ_ONCE(event->pending_disable); | ||
5827 | |||
5828 | if (cpu < 0) | ||
5829 | return; | ||
5830 | |||
5831 | if (cpu == smp_processor_id()) { | ||
5832 | WRITE_ONCE(event->pending_disable, -1); | ||
5833 | perf_event_disable_local(event); | ||
5834 | return; | ||
5835 | } | ||
5836 | |||
5837 | /* | ||
5838 | * CPU-A CPU-B | ||
5839 | * | ||
5840 | * perf_event_disable_inatomic() | ||
5841 | * @pending_disable = CPU-A; | ||
5842 | * irq_work_queue(); | ||
5843 | * | ||
5844 | * sched-out | ||
5845 | * @pending_disable = -1; | ||
5846 | * | ||
5847 | * sched-in | ||
5848 | * perf_event_disable_inatomic() | ||
5849 | * @pending_disable = CPU-B; | ||
5850 | * irq_work_queue(); // FAILS | ||
5851 | * | ||
5852 | * irq_work_run() | ||
5853 | * perf_pending_event() | ||
5854 | * | ||
5855 | * But the event runs on CPU-B and wants disabling there. | ||
5856 | */ | ||
5857 | irq_work_queue_on(&event->pending, cpu); | ||
5858 | } | ||
5859 | |||
5813 | static void perf_pending_event(struct irq_work *entry) | 5860 | static void perf_pending_event(struct irq_work *entry) |
5814 | { | 5861 | { |
5815 | struct perf_event *event = container_of(entry, | 5862 | struct perf_event *event = container_of(entry, struct perf_event, pending); |
5816 | struct perf_event, pending); | ||
5817 | int rctx; | 5863 | int rctx; |
5818 | 5864 | ||
5819 | rctx = perf_swevent_get_recursion_context(); | 5865 | rctx = perf_swevent_get_recursion_context(); |
@@ -5822,10 +5868,7 @@ static void perf_pending_event(struct irq_work *entry) | |||
5822 | * and we won't recurse 'further'. | 5868 | * and we won't recurse 'further'. |
5823 | */ | 5869 | */ |
5824 | 5870 | ||
5825 | if (event->pending_disable) { | 5871 | perf_pending_event_disable(event); |
5826 | event->pending_disable = 0; | ||
5827 | perf_event_disable_local(event); | ||
5828 | } | ||
5829 | 5872 | ||
5830 | if (event->pending_wakeup) { | 5873 | if (event->pending_wakeup) { |
5831 | event->pending_wakeup = 0; | 5874 | event->pending_wakeup = 0; |
@@ -7189,6 +7232,7 @@ static void perf_event_mmap_output(struct perf_event *event, | |||
7189 | struct perf_output_handle handle; | 7232 | struct perf_output_handle handle; |
7190 | struct perf_sample_data sample; | 7233 | struct perf_sample_data sample; |
7191 | int size = mmap_event->event_id.header.size; | 7234 | int size = mmap_event->event_id.header.size; |
7235 | u32 type = mmap_event->event_id.header.type; | ||
7192 | int ret; | 7236 | int ret; |
7193 | 7237 | ||
7194 | if (!perf_event_mmap_match(event, data)) | 7238 | if (!perf_event_mmap_match(event, data)) |
@@ -7232,6 +7276,7 @@ static void perf_event_mmap_output(struct perf_event *event, | |||
7232 | perf_output_end(&handle); | 7276 | perf_output_end(&handle); |
7233 | out: | 7277 | out: |
7234 | mmap_event->event_id.header.size = size; | 7278 | mmap_event->event_id.header.size = size; |
7279 | mmap_event->event_id.header.type = type; | ||
7235 | } | 7280 | } |
7236 | 7281 | ||
7237 | static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) | 7282 | static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) |
@@ -9042,26 +9087,29 @@ static void perf_event_addr_filters_apply(struct perf_event *event) | |||
9042 | if (task == TASK_TOMBSTONE) | 9087 | if (task == TASK_TOMBSTONE) |
9043 | return; | 9088 | return; |
9044 | 9089 | ||
9045 | if (!ifh->nr_file_filters) | 9090 | if (ifh->nr_file_filters) { |
9046 | return; | 9091 | mm = get_task_mm(event->ctx->task); |
9047 | 9092 | if (!mm) | |
9048 | mm = get_task_mm(event->ctx->task); | 9093 | goto restart; |
9049 | if (!mm) | ||
9050 | goto restart; | ||
9051 | 9094 | ||
9052 | down_read(&mm->mmap_sem); | 9095 | down_read(&mm->mmap_sem); |
9096 | } | ||
9053 | 9097 | ||
9054 | raw_spin_lock_irqsave(&ifh->lock, flags); | 9098 | raw_spin_lock_irqsave(&ifh->lock, flags); |
9055 | list_for_each_entry(filter, &ifh->list, entry) { | 9099 | list_for_each_entry(filter, &ifh->list, entry) { |
9056 | event->addr_filter_ranges[count].start = 0; | 9100 | if (filter->path.dentry) { |
9057 | event->addr_filter_ranges[count].size = 0; | 9101 | /* |
9102 | * Adjust base offset if the filter is associated to a | ||
9103 | * binary that needs to be mapped: | ||
9104 | */ | ||
9105 | event->addr_filter_ranges[count].start = 0; | ||
9106 | event->addr_filter_ranges[count].size = 0; | ||
9058 | 9107 | ||
9059 | /* | ||
9060 | * Adjust base offset if the filter is associated to a binary | ||
9061 | * that needs to be mapped: | ||
9062 | */ | ||
9063 | if (filter->path.dentry) | ||
9064 | perf_addr_filter_apply(filter, mm, &event->addr_filter_ranges[count]); | 9108 | perf_addr_filter_apply(filter, mm, &event->addr_filter_ranges[count]); |
9109 | } else { | ||
9110 | event->addr_filter_ranges[count].start = filter->offset; | ||
9111 | event->addr_filter_ranges[count].size = filter->size; | ||
9112 | } | ||
9065 | 9113 | ||
9066 | count++; | 9114 | count++; |
9067 | } | 9115 | } |
@@ -9069,9 +9117,11 @@ static void perf_event_addr_filters_apply(struct perf_event *event) | |||
9069 | event->addr_filters_gen++; | 9117 | event->addr_filters_gen++; |
9070 | raw_spin_unlock_irqrestore(&ifh->lock, flags); | 9118 | raw_spin_unlock_irqrestore(&ifh->lock, flags); |
9071 | 9119 | ||
9072 | up_read(&mm->mmap_sem); | 9120 | if (ifh->nr_file_filters) { |
9121 | up_read(&mm->mmap_sem); | ||
9073 | 9122 | ||
9074 | mmput(mm); | 9123 | mmput(mm); |
9124 | } | ||
9075 | 9125 | ||
9076 | restart: | 9126 | restart: |
9077 | perf_event_stop(event, 1); | 9127 | perf_event_stop(event, 1); |
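With the restructuring above, mmap_sem is only taken when file-backed filters exist, and kernel address filters copy their range verbatim. A compact paraphrase of the per-filter decision as it would sit in kernel/events/core.c (the helper name demo_apply_one is invented; the types and perf_addr_filter_apply() are the ones used in the hunk):

/* Paraphrase of the per-filter logic above, outside the locking. */
static void demo_apply_one(struct perf_addr_filter *filter,
			   struct mm_struct *mm,
			   struct perf_addr_filter_range *fr)
{
	if (filter->path.dentry) {
		/*
		 * File-backed filter: reset the range and let
		 * perf_addr_filter_apply() translate the file offset into
		 * addresses using @mm, which the caller only acquires when
		 * such filters exist.
		 */
		fr->start = 0;
		fr->size  = 0;
		perf_addr_filter_apply(filter, mm, fr);
	} else {
		/* Kernel address filter: the range is already absolute. */
		fr->start = filter->offset;
		fr->size  = filter->size;
	}
}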
@@ -10234,6 +10284,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, | |||
10234 | 10284 | ||
10235 | 10285 | ||
10236 | init_waitqueue_head(&event->waitq); | 10286 | init_waitqueue_head(&event->waitq); |
10287 | event->pending_disable = -1; | ||
10237 | init_irq_work(&event->pending, perf_pending_event); | 10288 | init_irq_work(&event->pending, perf_pending_event); |
10238 | 10289 | ||
10239 | mutex_init(&event->mmap_mutex); | 10290 | mutex_init(&event->mmap_mutex); |
@@ -11876,7 +11927,7 @@ static void __init perf_event_init_all_cpus(void) | |||
11876 | } | 11927 | } |
11877 | } | 11928 | } |
11878 | 11929 | ||
11879 | void perf_swevent_init_cpu(unsigned int cpu) | 11930 | static void perf_swevent_init_cpu(unsigned int cpu) |
11880 | { | 11931 | { |
11881 | struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); | 11932 | struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); |
11882 | 11933 | ||
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index a4047321d7d8..674b35383491 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c | |||
@@ -392,7 +392,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, | |||
392 | * store that will be enabled on successful return | 392 | * store that will be enabled on successful return |
393 | */ | 393 | */ |
394 | if (!handle->size) { /* A, matches D */ | 394 | if (!handle->size) { /* A, matches D */ |
395 | event->pending_disable = 1; | 395 | event->pending_disable = smp_processor_id(); |
396 | perf_output_wakeup(handle); | 396 | perf_output_wakeup(handle); |
397 | local_set(&rb->aux_nest, 0); | 397 | local_set(&rb->aux_nest, 0); |
398 | goto err_put; | 398 | goto err_put; |
@@ -455,24 +455,21 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) | |||
455 | rb->aux_head += size; | 455 | rb->aux_head += size; |
456 | } | 456 | } |
457 | 457 | ||
458 | if (size || handle->aux_flags) { | 458 | /* |
459 | /* | 459 | * Only send RECORD_AUX if we have something useful to communicate |
460 | * Only send RECORD_AUX if we have something useful to communicate | 460 | * |
461 | * | 461 | * Note: the OVERWRITE records by themselves are not considered |
462 | * Note: the OVERWRITE records by themselves are not considered | 462 | * useful, as they don't communicate any *new* information, |
463 | * useful, as they don't communicate any *new* information, | 463 | * aside from the short-lived offset, that becomes history at |
464 | * aside from the short-lived offset, that becomes history at | 464 | * the next event sched-in and therefore isn't useful. |
465 | * the next event sched-in and therefore isn't useful. | 465 | * The userspace that needs to copy out AUX data in overwrite |
466 | * The userspace that needs to copy out AUX data in overwrite | 466 | * mode should know to use user_page::aux_head for the actual |
467 | * mode should know to use user_page::aux_head for the actual | 467 | * offset. So, from now on we don't output AUX records that |
468 | * offset. So, from now on we don't output AUX records that | 468 | * have *only* OVERWRITE flag set. |
469 | * have *only* OVERWRITE flag set. | 469 | */ |
470 | */ | 470 | if (size || (handle->aux_flags & ~(u64)PERF_AUX_FLAG_OVERWRITE)) |
471 | 471 | perf_event_aux_event(handle->event, aux_head, size, | |
472 | if (handle->aux_flags & ~(u64)PERF_AUX_FLAG_OVERWRITE) | 472 | handle->aux_flags); |
473 | perf_event_aux_event(handle->event, aux_head, size, | ||
474 | handle->aux_flags); | ||
475 | } | ||
476 | 473 | ||
477 | rb->user_page->aux_head = rb->aux_head; | 474 | rb->user_page->aux_head = rb->aux_head; |
478 | if (rb_need_aux_wakeup(rb)) | 475 | if (rb_need_aux_wakeup(rb)) |
@@ -480,7 +477,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) | |||
480 | 477 | ||
481 | if (wakeup) { | 478 | if (wakeup) { |
482 | if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED) | 479 | if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED) |
483 | handle->event->pending_disable = 1; | 480 | handle->event->pending_disable = smp_processor_id(); |
484 | perf_output_wakeup(handle); | 481 | perf_output_wakeup(handle); |
485 | } | 482 | } |
486 | 483 | ||
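The rewritten check above only skips RECORD_AUX when the record would carry no data and nothing but the OVERWRITE flag. A hedged illustration of the same bitmask test (PERF_AUX_FLAG_OVERWRITE is the real UAPI flag; the helper name is invented):

#include <linux/perf_event.h>
#include <linux/types.h>

/* Invented helper: is this AUX record worth emitting at all? */
static bool aux_record_is_useful(unsigned long size, u64 flags)
{
	/* Data was produced, or at least one flag other than OVERWRITE is set. */
	return size || (flags & ~(u64)PERF_AUX_FLAG_OVERWRITE);
}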
@@ -613,8 +610,7 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, | |||
613 | * PMU requests more than one contiguous chunks of memory | 610 | * PMU requests more than one contiguous chunks of memory |
614 | * for SW double buffering | 611 | * for SW double buffering |
615 | */ | 612 | */ |
616 | if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_SW_DOUBLEBUF) && | 613 | if (!overwrite) { |
617 | !overwrite) { | ||
618 | if (!max_order) | 614 | if (!max_order) |
619 | return -EINVAL; | 615 | return -EINVAL; |
620 | 616 | ||
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index c5cde87329c7..4ca7364c956d 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c | |||
@@ -2028,7 +2028,7 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) | |||
2028 | if (uc->handler) { | 2028 | if (uc->handler) { |
2029 | rc = uc->handler(uc, regs); | 2029 | rc = uc->handler(uc, regs); |
2030 | WARN(rc & ~UPROBE_HANDLER_MASK, | 2030 | WARN(rc & ~UPROBE_HANDLER_MASK, |
2031 | "bad rc=0x%x from %pf()\n", rc, uc->handler); | 2031 | "bad rc=0x%x from %ps()\n", rc, uc->handler); |
2032 | } | 2032 | } |
2033 | 2033 | ||
2034 | if (uc->ret_handler) | 2034 | if (uc->ret_handler) |
@@ -2294,16 +2294,14 @@ static struct notifier_block uprobe_exception_nb = { | |||
2294 | .priority = INT_MAX-1, /* notified after kprobes, kgdb */ | 2294 | .priority = INT_MAX-1, /* notified after kprobes, kgdb */ |
2295 | }; | 2295 | }; |
2296 | 2296 | ||
2297 | static int __init init_uprobes(void) | 2297 | void __init uprobes_init(void) |
2298 | { | 2298 | { |
2299 | int i; | 2299 | int i; |
2300 | 2300 | ||
2301 | for (i = 0; i < UPROBES_HASH_SZ; i++) | 2301 | for (i = 0; i < UPROBES_HASH_SZ; i++) |
2302 | mutex_init(&uprobes_mmap_mutex[i]); | 2302 | mutex_init(&uprobes_mmap_mutex[i]); |
2303 | 2303 | ||
2304 | if (percpu_init_rwsem(&dup_mmap_sem)) | 2304 | BUG_ON(percpu_init_rwsem(&dup_mmap_sem)); |
2305 | return -ENOMEM; | ||
2306 | 2305 | ||
2307 | return register_die_notifier(&uprobe_exception_nb); | 2306 | BUG_ON(register_die_notifier(&uprobe_exception_nb)); |
2308 | } | 2307 | } |
2309 | __initcall(init_uprobes); | ||
diff --git a/kernel/fail_function.c b/kernel/fail_function.c index 17f75b545f66..feb80712b913 100644 --- a/kernel/fail_function.c +++ b/kernel/fail_function.c | |||
@@ -210,7 +210,7 @@ static int fei_seq_show(struct seq_file *m, void *v) | |||
210 | { | 210 | { |
211 | struct fei_attr *attr = list_entry(v, struct fei_attr, list); | 211 | struct fei_attr *attr = list_entry(v, struct fei_attr, list); |
212 | 212 | ||
213 | seq_printf(m, "%pf\n", attr->kp.addr); | 213 | seq_printf(m, "%ps\n", attr->kp.addr); |
214 | return 0; | 214 | return 0; |
215 | } | 215 | } |
216 | 216 | ||
diff --git a/kernel/fork.c b/kernel/fork.c index 9dcd18aa210b..8b03d93ba068 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -11,6 +11,7 @@ | |||
11 | * management can be a bitch. See 'mm/memory.c': 'copy_page_range()' | 11 | * management can be a bitch. See 'mm/memory.c': 'copy_page_range()' |
12 | */ | 12 | */ |
13 | 13 | ||
14 | #include <linux/anon_inodes.h> | ||
14 | #include <linux/slab.h> | 15 | #include <linux/slab.h> |
15 | #include <linux/sched/autogroup.h> | 16 | #include <linux/sched/autogroup.h> |
16 | #include <linux/sched/mm.h> | 17 | #include <linux/sched/mm.h> |
@@ -21,6 +22,7 @@ | |||
21 | #include <linux/sched/task.h> | 22 | #include <linux/sched/task.h> |
22 | #include <linux/sched/task_stack.h> | 23 | #include <linux/sched/task_stack.h> |
23 | #include <linux/sched/cputime.h> | 24 | #include <linux/sched/cputime.h> |
25 | #include <linux/seq_file.h> | ||
24 | #include <linux/rtmutex.h> | 26 | #include <linux/rtmutex.h> |
25 | #include <linux/init.h> | 27 | #include <linux/init.h> |
26 | #include <linux/unistd.h> | 28 | #include <linux/unistd.h> |
@@ -815,6 +817,7 @@ void __init fork_init(void) | |||
815 | #endif | 817 | #endif |
816 | 818 | ||
817 | lockdep_init_task(&init_task); | 819 | lockdep_init_task(&init_task); |
820 | uprobes_init(); | ||
818 | } | 821 | } |
819 | 822 | ||
820 | int __weak arch_dup_task_struct(struct task_struct *dst, | 823 | int __weak arch_dup_task_struct(struct task_struct *dst, |
@@ -1298,13 +1301,20 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) | |||
1298 | complete_vfork_done(tsk); | 1301 | complete_vfork_done(tsk); |
1299 | } | 1302 | } |
1300 | 1303 | ||
1301 | /* | 1304 | /** |
1302 | * Allocate a new mm structure and copy contents from the | 1305 | * dup_mm() - duplicates an existing mm structure |
1303 | * mm structure of the passed in task structure. | 1306 | * @tsk: the task_struct with which the new mm will be associated. |
1307 | * @oldmm: the mm to duplicate. | ||
1308 | * | ||
1309 | * Allocates a new mm structure and duplicates the provided @oldmm structure | ||
1310 | * content into it. | ||
1311 | * | ||
1312 | * Return: the duplicated mm or NULL on failure. | ||
1304 | */ | 1313 | */ |
1305 | static struct mm_struct *dup_mm(struct task_struct *tsk) | 1314 | static struct mm_struct *dup_mm(struct task_struct *tsk, |
1315 | struct mm_struct *oldmm) | ||
1306 | { | 1316 | { |
1307 | struct mm_struct *mm, *oldmm = current->mm; | 1317 | struct mm_struct *mm; |
1308 | int err; | 1318 | int err; |
1309 | 1319 | ||
1310 | mm = allocate_mm(); | 1320 | mm = allocate_mm(); |
@@ -1371,7 +1381,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) | |||
1371 | } | 1381 | } |
1372 | 1382 | ||
1373 | retval = -ENOMEM; | 1383 | retval = -ENOMEM; |
1374 | mm = dup_mm(tsk); | 1384 | mm = dup_mm(tsk, current->mm); |
1375 | if (!mm) | 1385 | if (!mm) |
1376 | goto fail_nomem; | 1386 | goto fail_nomem; |
1377 | 1387 | ||
@@ -1662,6 +1672,58 @@ static inline void rcu_copy_process(struct task_struct *p) | |||
1662 | #endif /* #ifdef CONFIG_TASKS_RCU */ | 1672 | #endif /* #ifdef CONFIG_TASKS_RCU */ |
1663 | } | 1673 | } |
1664 | 1674 | ||
1675 | static int pidfd_release(struct inode *inode, struct file *file) | ||
1676 | { | ||
1677 | struct pid *pid = file->private_data; | ||
1678 | |||
1679 | file->private_data = NULL; | ||
1680 | put_pid(pid); | ||
1681 | return 0; | ||
1682 | } | ||
1683 | |||
1684 | #ifdef CONFIG_PROC_FS | ||
1685 | static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) | ||
1686 | { | ||
1687 | struct pid_namespace *ns = proc_pid_ns(file_inode(m->file)); | ||
1688 | struct pid *pid = f->private_data; | ||
1689 | |||
1690 | seq_put_decimal_ull(m, "Pid:\t", pid_nr_ns(pid, ns)); | ||
1691 | seq_putc(m, '\n'); | ||
1692 | } | ||
1693 | #endif | ||
1694 | |||
1695 | const struct file_operations pidfd_fops = { | ||
1696 | .release = pidfd_release, | ||
1697 | #ifdef CONFIG_PROC_FS | ||
1698 | .show_fdinfo = pidfd_show_fdinfo, | ||
1699 | #endif | ||
1700 | }; | ||
1701 | |||
1702 | /** | ||
1703 | * pidfd_create() - Create a new pid file descriptor. | ||
1704 | * | ||
1705 | * @pid: struct pid that the pidfd will reference | ||
1706 | * | ||
1707 | * This creates a new pid file descriptor with the O_CLOEXEC flag set. | ||
1708 | * | ||
1709 | * Note that this function can only be called after the fd table has | ||

1710 | * been unshared to avoid leaking the pidfd to the new process. | ||
1711 | * | ||
1712 | * Return: On success, a cloexec pidfd is returned. | ||
1713 | * On error, a negative errno number will be returned. | ||
1714 | */ | ||
1715 | static int pidfd_create(struct pid *pid) | ||
1716 | { | ||
1717 | int fd; | ||
1718 | |||
1719 | fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid), | ||
1720 | O_RDWR | O_CLOEXEC); | ||
1721 | if (fd < 0) | ||
1722 | put_pid(pid); | ||
1723 | |||
1724 | return fd; | ||
1725 | } | ||
1726 | |||
1665 | /* | 1727 | /* |
1666 | * This creates a new process as a copy of the old one, | 1728 | * This creates a new process as a copy of the old one, |
1667 | * but does not actually start it yet. | 1729 | * but does not actually start it yet. |
@@ -1674,13 +1736,14 @@ static __latent_entropy struct task_struct *copy_process( | |||
1674 | unsigned long clone_flags, | 1736 | unsigned long clone_flags, |
1675 | unsigned long stack_start, | 1737 | unsigned long stack_start, |
1676 | unsigned long stack_size, | 1738 | unsigned long stack_size, |
1739 | int __user *parent_tidptr, | ||
1677 | int __user *child_tidptr, | 1740 | int __user *child_tidptr, |
1678 | struct pid *pid, | 1741 | struct pid *pid, |
1679 | int trace, | 1742 | int trace, |
1680 | unsigned long tls, | 1743 | unsigned long tls, |
1681 | int node) | 1744 | int node) |
1682 | { | 1745 | { |
1683 | int retval; | 1746 | int pidfd = -1, retval; |
1684 | struct task_struct *p; | 1747 | struct task_struct *p; |
1685 | struct multiprocess_signals delayed; | 1748 | struct multiprocess_signals delayed; |
1686 | 1749 | ||
@@ -1730,6 +1793,31 @@ static __latent_entropy struct task_struct *copy_process( | |||
1730 | return ERR_PTR(-EINVAL); | 1793 | return ERR_PTR(-EINVAL); |
1731 | } | 1794 | } |
1732 | 1795 | ||
1796 | if (clone_flags & CLONE_PIDFD) { | ||
1797 | int reserved; | ||
1798 | |||
1799 | /* | ||
1800 | * - CLONE_PARENT_SETTID is useless for pidfds and also | ||
1801 | * parent_tidptr is used to return pidfds. | ||
1802 | * - CLONE_DETACHED is blocked so that we can potentially | ||
1803 | * reuse it later for CLONE_PIDFD. | ||
1804 | * - CLONE_THREAD is blocked until someone really needs it. | ||
1805 | */ | ||
1806 | if (clone_flags & | ||
1807 | (CLONE_DETACHED | CLONE_PARENT_SETTID | CLONE_THREAD)) | ||
1808 | return ERR_PTR(-EINVAL); | ||
1809 | |||
1810 | /* | ||
1811 | * Verify that parent_tidptr is sane so we can potentially | ||
1812 | * reuse it later. | ||
1813 | */ | ||
1814 | if (get_user(reserved, parent_tidptr)) | ||
1815 | return ERR_PTR(-EFAULT); | ||
1816 | |||
1817 | if (reserved != 0) | ||
1818 | return ERR_PTR(-EINVAL); | ||
1819 | } | ||
1820 | |||
1733 | /* | 1821 | /* |
1734 | * Force any signals received before this point to be delivered | 1822 | * Force any signals received before this point to be delivered |
1735 | * before the fork happens. Collect up signals sent to multiple | 1823 | * before the fork happens. Collect up signals sent to multiple |
@@ -1936,6 +2024,22 @@ static __latent_entropy struct task_struct *copy_process( | |||
1936 | } | 2024 | } |
1937 | } | 2025 | } |
1938 | 2026 | ||
2027 | /* | ||
2028 | * This has to happen after we've potentially unshared the file | ||
2029 | * descriptor table (so that the pidfd doesn't leak into the child | ||
2030 | * if the fd table isn't shared). | ||
2031 | */ | ||
2032 | if (clone_flags & CLONE_PIDFD) { | ||
2033 | retval = pidfd_create(pid); | ||
2034 | if (retval < 0) | ||
2035 | goto bad_fork_free_pid; | ||
2036 | |||
2037 | pidfd = retval; | ||
2038 | retval = put_user(pidfd, parent_tidptr); | ||
2039 | if (retval) | ||
2040 | goto bad_fork_put_pidfd; | ||
2041 | } | ||
2042 | |||
1939 | #ifdef CONFIG_BLOCK | 2043 | #ifdef CONFIG_BLOCK |
1940 | p->plug = NULL; | 2044 | p->plug = NULL; |
1941 | #endif | 2045 | #endif |
@@ -1996,7 +2100,7 @@ static __latent_entropy struct task_struct *copy_process( | |||
1996 | */ | 2100 | */ |
1997 | retval = cgroup_can_fork(p); | 2101 | retval = cgroup_can_fork(p); |
1998 | if (retval) | 2102 | if (retval) |
1999 | goto bad_fork_free_pid; | 2103 | goto bad_fork_put_pidfd; |
2000 | 2104 | ||
2001 | /* | 2105 | /* |
2002 | * From this point on we must avoid any synchronous user-space | 2106 | * From this point on we must avoid any synchronous user-space |
@@ -2111,6 +2215,9 @@ bad_fork_cancel_cgroup: | |||
2111 | spin_unlock(¤t->sighand->siglock); | 2215 | spin_unlock(¤t->sighand->siglock); |
2112 | write_unlock_irq(&tasklist_lock); | 2216 | write_unlock_irq(&tasklist_lock); |
2113 | cgroup_cancel_fork(p); | 2217 | cgroup_cancel_fork(p); |
2218 | bad_fork_put_pidfd: | ||
2219 | if (clone_flags & CLONE_PIDFD) | ||
2220 | ksys_close(pidfd); | ||
2114 | bad_fork_free_pid: | 2221 | bad_fork_free_pid: |
2115 | cgroup_threadgroup_change_end(current); | 2222 | cgroup_threadgroup_change_end(current); |
2116 | if (pid != &init_struct_pid) | 2223 | if (pid != &init_struct_pid) |
@@ -2176,7 +2283,7 @@ static inline void init_idle_pids(struct task_struct *idle) | |||
2176 | struct task_struct *fork_idle(int cpu) | 2283 | struct task_struct *fork_idle(int cpu) |
2177 | { | 2284 | { |
2178 | struct task_struct *task; | 2285 | struct task_struct *task; |
2179 | task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0, | 2286 | task = copy_process(CLONE_VM, 0, 0, NULL, NULL, &init_struct_pid, 0, 0, |
2180 | cpu_to_node(cpu)); | 2287 | cpu_to_node(cpu)); |
2181 | if (!IS_ERR(task)) { | 2288 | if (!IS_ERR(task)) { |
2182 | init_idle_pids(task); | 2289 | init_idle_pids(task); |
@@ -2186,6 +2293,11 @@ struct task_struct *fork_idle(int cpu) | |||
2186 | return task; | 2293 | return task; |
2187 | } | 2294 | } |
2188 | 2295 | ||
2296 | struct mm_struct *copy_init_mm(void) | ||
2297 | { | ||
2298 | return dup_mm(NULL, &init_mm); | ||
2299 | } | ||
2300 | |||
2189 | /* | 2301 | /* |
2190 | * Ok, this is the main fork-routine. | 2302 | * Ok, this is the main fork-routine. |
2191 | * | 2303 | * |
@@ -2223,7 +2335,7 @@ long _do_fork(unsigned long clone_flags, | |||
2223 | trace = 0; | 2335 | trace = 0; |
2224 | } | 2336 | } |
2225 | 2337 | ||
2226 | p = copy_process(clone_flags, stack_start, stack_size, | 2338 | p = copy_process(clone_flags, stack_start, stack_size, parent_tidptr, |
2227 | child_tidptr, NULL, trace, tls, NUMA_NO_NODE); | 2339 | child_tidptr, NULL, trace, tls, NUMA_NO_NODE); |
2228 | add_latent_entropy(); | 2340 | add_latent_entropy(); |
2229 | 2341 | ||
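The fork.c changes above return the new pidfd through parent_tidptr when CLONE_PIDFD is passed. A hedged userspace sketch of how a caller might obtain one; the raw clone() argument order shown is the x86_64 one (other architectures order the tid pointers differently), and the CLONE_PIDFD definition is only a fallback for headers that predate this series:

#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <sys/wait.h>
#include <unistd.h>

#ifndef CLONE_PIDFD
#define CLONE_PIDFD 0x00001000	/* assumed value from this series */
#endif

int main(void)
{
	int pidfd = -1;

	/*
	 * x86_64 raw clone: flags, stack, parent_tid, child_tid, tls.
	 * With a NULL stack this behaves like fork(); the pidfd is written
	 * through the parent_tid pointer because CLONE_PIDFD reuses it.
	 */
	pid_t pid = syscall(SYS_clone, CLONE_PIDFD | SIGCHLD, NULL,
			    &pidfd, NULL, 0);

	if (pid == 0) {			/* child */
		_exit(0);
	} else if (pid > 0) {		/* parent */
		printf("child %d, pidfd %d\n", (int)pid, pidfd);
		waitpid(pid, NULL, 0);
		close(pidfd);
	} else {
		perror("clone");
	}
	return 0;
}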
diff --git a/kernel/futex.c b/kernel/futex.c index c3b73b0311bc..6262f1534ac9 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
@@ -1311,13 +1311,15 @@ static int lookup_pi_state(u32 __user *uaddr, u32 uval, | |||
1311 | 1311 | ||
1312 | static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) | 1312 | static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) |
1313 | { | 1313 | { |
1314 | int err; | ||
1314 | u32 uninitialized_var(curval); | 1315 | u32 uninitialized_var(curval); |
1315 | 1316 | ||
1316 | if (unlikely(should_fail_futex(true))) | 1317 | if (unlikely(should_fail_futex(true))) |
1317 | return -EFAULT; | 1318 | return -EFAULT; |
1318 | 1319 | ||
1319 | if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) | 1320 | err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval); |
1320 | return -EFAULT; | 1321 | if (unlikely(err)) |
1322 | return err; | ||
1321 | 1323 | ||
1322 | /* If user space value changed, let the caller retry */ | 1324 | /* If user space value changed, let the caller retry */ |
1323 | return curval != uval ? -EAGAIN : 0; | 1325 | return curval != uval ? -EAGAIN : 0; |
@@ -1502,10 +1504,8 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_ | |||
1502 | if (unlikely(should_fail_futex(true))) | 1504 | if (unlikely(should_fail_futex(true))) |
1503 | ret = -EFAULT; | 1505 | ret = -EFAULT; |
1504 | 1506 | ||
1505 | if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) { | 1507 | ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval); |
1506 | ret = -EFAULT; | 1508 | if (!ret && (curval != uval)) { |
1507 | |||
1508 | } else if (curval != uval) { | ||
1509 | /* | 1509 | /* |
1510 | * If a unconditional UNLOCK_PI operation (user space did not | 1510 | * If a unconditional UNLOCK_PI operation (user space did not |
1511 | * try the TID->0 transition) raced with a waiter setting the | 1511 | * try the TID->0 transition) raced with a waiter setting the |
@@ -1700,32 +1700,32 @@ retry_private: | |||
1700 | double_lock_hb(hb1, hb2); | 1700 | double_lock_hb(hb1, hb2); |
1701 | op_ret = futex_atomic_op_inuser(op, uaddr2); | 1701 | op_ret = futex_atomic_op_inuser(op, uaddr2); |
1702 | if (unlikely(op_ret < 0)) { | 1702 | if (unlikely(op_ret < 0)) { |
1703 | |||
1704 | double_unlock_hb(hb1, hb2); | 1703 | double_unlock_hb(hb1, hb2); |
1705 | 1704 | ||
1706 | #ifndef CONFIG_MMU | 1705 | if (!IS_ENABLED(CONFIG_MMU) || |
1707 | /* | 1706 | unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) { |
1708 | * we don't get EFAULT from MMU faults if we don't have an MMU, | 1707 | /* |
1709 | * but we might get them from range checking | 1708 | * we don't get EFAULT from MMU faults if we don't have |
1710 | */ | 1709 | * an MMU, but we might get them from range checking |
1711 | ret = op_ret; | 1710 | */ |
1712 | goto out_put_keys; | ||
1713 | #endif | ||
1714 | |||
1715 | if (unlikely(op_ret != -EFAULT)) { | ||
1716 | ret = op_ret; | 1711 | ret = op_ret; |
1717 | goto out_put_keys; | 1712 | goto out_put_keys; |
1718 | } | 1713 | } |
1719 | 1714 | ||
1720 | ret = fault_in_user_writeable(uaddr2); | 1715 | if (op_ret == -EFAULT) { |
1721 | if (ret) | 1716 | ret = fault_in_user_writeable(uaddr2); |
1722 | goto out_put_keys; | 1717 | if (ret) |
1718 | goto out_put_keys; | ||
1719 | } | ||
1723 | 1720 | ||
1724 | if (!(flags & FLAGS_SHARED)) | 1721 | if (!(flags & FLAGS_SHARED)) { |
1722 | cond_resched(); | ||
1725 | goto retry_private; | 1723 | goto retry_private; |
1724 | } | ||
1726 | 1725 | ||
1727 | put_futex_key(&key2); | 1726 | put_futex_key(&key2); |
1728 | put_futex_key(&key1); | 1727 | put_futex_key(&key1); |
1728 | cond_resched(); | ||
1729 | goto retry; | 1729 | goto retry; |
1730 | } | 1730 | } |
1731 | 1731 | ||
@@ -2350,7 +2350,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, | |||
2350 | u32 uval, uninitialized_var(curval), newval; | 2350 | u32 uval, uninitialized_var(curval), newval; |
2351 | struct task_struct *oldowner, *newowner; | 2351 | struct task_struct *oldowner, *newowner; |
2352 | u32 newtid; | 2352 | u32 newtid; |
2353 | int ret; | 2353 | int ret, err = 0; |
2354 | 2354 | ||
2355 | lockdep_assert_held(q->lock_ptr); | 2355 | lockdep_assert_held(q->lock_ptr); |
2356 | 2356 | ||
@@ -2421,14 +2421,17 @@ retry: | |||
2421 | if (!pi_state->owner) | 2421 | if (!pi_state->owner) |
2422 | newtid |= FUTEX_OWNER_DIED; | 2422 | newtid |= FUTEX_OWNER_DIED; |
2423 | 2423 | ||
2424 | if (get_futex_value_locked(&uval, uaddr)) | 2424 | err = get_futex_value_locked(&uval, uaddr); |
2425 | goto handle_fault; | 2425 | if (err) |
2426 | goto handle_err; | ||
2426 | 2427 | ||
2427 | for (;;) { | 2428 | for (;;) { |
2428 | newval = (uval & FUTEX_OWNER_DIED) | newtid; | 2429 | newval = (uval & FUTEX_OWNER_DIED) | newtid; |
2429 | 2430 | ||
2430 | if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) | 2431 | err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval); |
2431 | goto handle_fault; | 2432 | if (err) |
2433 | goto handle_err; | ||
2434 | |||
2432 | if (curval == uval) | 2435 | if (curval == uval) |
2433 | break; | 2436 | break; |
2434 | uval = curval; | 2437 | uval = curval; |
@@ -2456,23 +2459,37 @@ retry: | |||
2456 | return 0; | 2459 | return 0; |
2457 | 2460 | ||
2458 | /* | 2461 | /* |
2459 | * To handle the page fault we need to drop the locks here. That gives | 2462 | * In order to reschedule or handle a page fault, we need to drop the |
2460 | * the other task (either the highest priority waiter itself or the | 2463 | * locks here. In the case of a fault, this gives the other task |
2461 | * task which stole the rtmutex) the chance to try the fixup of the | 2464 | * (either the highest priority waiter itself or the task which stole |
2462 | * pi_state. So once we are back from handling the fault we need to | 2465 | * the rtmutex) the chance to try the fixup of the pi_state. So once we |
2463 | * check the pi_state after reacquiring the locks and before trying to | 2466 | * are back from handling the fault we need to check the pi_state after |
2464 | * do another fixup. When the fixup has been done already we simply | 2467 | * reacquiring the locks and before trying to do another fixup. When |
2465 | * return. | 2468 | * the fixup has been done already we simply return. |
2466 | * | 2469 | * |
2467 | * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely | 2470 | * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely |
2468 | * drop hb->lock since the caller owns the hb -> futex_q relation. | 2471 | * drop hb->lock since the caller owns the hb -> futex_q relation. |
2469 | * Dropping the pi_mutex->wait_lock requires the state revalidate. | 2472 | * Dropping the pi_mutex->wait_lock requires the state revalidate. |
2470 | */ | 2473 | */ |
2471 | handle_fault: | 2474 | handle_err: |
2472 | raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); | 2475 | raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); |
2473 | spin_unlock(q->lock_ptr); | 2476 | spin_unlock(q->lock_ptr); |
2474 | 2477 | ||
2475 | ret = fault_in_user_writeable(uaddr); | 2478 | switch (err) { |
2479 | case -EFAULT: | ||
2480 | ret = fault_in_user_writeable(uaddr); | ||
2481 | break; | ||
2482 | |||
2483 | case -EAGAIN: | ||
2484 | cond_resched(); | ||
2485 | ret = 0; | ||
2486 | break; | ||
2487 | |||
2488 | default: | ||
2489 | WARN_ON_ONCE(1); | ||
2490 | ret = err; | ||
2491 | break; | ||
2492 | } | ||
2476 | 2493 | ||
2477 | spin_lock(q->lock_ptr); | 2494 | spin_lock(q->lock_ptr); |
2478 | raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); | 2495 | raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); |
@@ -3041,10 +3058,8 @@ retry: | |||
3041 | * A unconditional UNLOCK_PI op raced against a waiter | 3058 | * A unconditional UNLOCK_PI op raced against a waiter |
3042 | * setting the FUTEX_WAITERS bit. Try again. | 3059 | * setting the FUTEX_WAITERS bit. Try again. |
3043 | */ | 3060 | */ |
3044 | if (ret == -EAGAIN) { | 3061 | if (ret == -EAGAIN) |
3045 | put_futex_key(&key); | 3062 | goto pi_retry; |
3046 | goto retry; | ||
3047 | } | ||
3048 | /* | 3063 | /* |
3049 | * wake_futex_pi has detected invalid state. Tell user | 3064 | * wake_futex_pi has detected invalid state. Tell user |
3050 | * space. | 3065 | * space. |
@@ -3059,9 +3074,19 @@ retry: | |||
3059 | * preserve the WAITERS bit not the OWNER_DIED one. We are the | 3074 | * preserve the WAITERS bit not the OWNER_DIED one. We are the |
3060 | * owner. | 3075 | * owner. |
3061 | */ | 3076 | */ |
3062 | if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) { | 3077 | if ((ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))) { |
3063 | spin_unlock(&hb->lock); | 3078 | spin_unlock(&hb->lock); |
3064 | goto pi_faulted; | 3079 | switch (ret) { |
3080 | case -EFAULT: | ||
3081 | goto pi_faulted; | ||
3082 | |||
3083 | case -EAGAIN: | ||
3084 | goto pi_retry; | ||
3085 | |||
3086 | default: | ||
3087 | WARN_ON_ONCE(1); | ||
3088 | goto out_putkey; | ||
3089 | } | ||
3065 | } | 3090 | } |
3066 | 3091 | ||
3067 | /* | 3092 | /* |
@@ -3075,6 +3100,11 @@ out_putkey: | |||
3075 | put_futex_key(&key); | 3100 | put_futex_key(&key); |
3076 | return ret; | 3101 | return ret; |
3077 | 3102 | ||
3103 | pi_retry: | ||
3104 | put_futex_key(&key); | ||
3105 | cond_resched(); | ||
3106 | goto retry; | ||
3107 | |||
3078 | pi_faulted: | 3108 | pi_faulted: |
3079 | put_futex_key(&key); | 3109 | put_futex_key(&key); |
3080 | 3110 | ||
@@ -3435,47 +3465,67 @@ err_unlock: | |||
3435 | static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi) | 3465 | static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi) |
3436 | { | 3466 | { |
3437 | u32 uval, uninitialized_var(nval), mval; | 3467 | u32 uval, uninitialized_var(nval), mval; |
3468 | int err; | ||
3469 | |||
3470 | /* Futex address must be 32bit aligned */ | ||
3471 | if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0) | ||
3472 | return -1; | ||
3438 | 3473 | ||
3439 | retry: | 3474 | retry: |
3440 | if (get_user(uval, uaddr)) | 3475 | if (get_user(uval, uaddr)) |
3441 | return -1; | 3476 | return -1; |
3442 | 3477 | ||
3443 | if ((uval & FUTEX_TID_MASK) == task_pid_vnr(curr)) { | 3478 | if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr)) |
3444 | /* | 3479 | return 0; |
3445 | * Ok, this dying thread is truly holding a futex | 3480 | |
3446 | * of interest. Set the OWNER_DIED bit atomically | 3481 | /* |
3447 | * via cmpxchg, and if the value had FUTEX_WAITERS | 3482 | * Ok, this dying thread is truly holding a futex |
3448 | * set, wake up a waiter (if any). (We have to do a | 3483 | * of interest. Set the OWNER_DIED bit atomically |
3449 | * futex_wake() even if OWNER_DIED is already set - | 3484 | * via cmpxchg, and if the value had FUTEX_WAITERS |
3450 | * to handle the rare but possible case of recursive | 3485 | * set, wake up a waiter (if any). (We have to do a |
3451 | * thread-death.) The rest of the cleanup is done in | 3486 | * futex_wake() even if OWNER_DIED is already set - |
3452 | * userspace. | 3487 | * to handle the rare but possible case of recursive |
3453 | */ | 3488 | * thread-death.) The rest of the cleanup is done in |
3454 | mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; | 3489 | * userspace. |
3455 | /* | 3490 | */ |
3456 | * We are not holding a lock here, but we want to have | 3491 | mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; |
3457 | * the pagefault_disable/enable() protection because | 3492 | |
3458 | * we want to handle the fault gracefully. If the | 3493 | /* |
3459 | * access fails we try to fault in the futex with R/W | 3494 | * We are not holding a lock here, but we want to have |
3460 | * verification via get_user_pages. get_user() above | 3495 | * the pagefault_disable/enable() protection because |
3461 | * does not guarantee R/W access. If that fails we | 3496 | * we want to handle the fault gracefully. If the |
3462 | * give up and leave the futex locked. | 3497 | * access fails we try to fault in the futex with R/W |
3463 | */ | 3498 | * verification via get_user_pages. get_user() above |
3464 | if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) { | 3499 | * does not guarantee R/W access. If that fails we |
3500 | * give up and leave the futex locked. | ||
3501 | */ | ||
3502 | if ((err = cmpxchg_futex_value_locked(&nval, uaddr, uval, mval))) { | ||
3503 | switch (err) { | ||
3504 | case -EFAULT: | ||
3465 | if (fault_in_user_writeable(uaddr)) | 3505 | if (fault_in_user_writeable(uaddr)) |
3466 | return -1; | 3506 | return -1; |
3467 | goto retry; | 3507 | goto retry; |
3468 | } | 3508 | |
3469 | if (nval != uval) | 3509 | case -EAGAIN: |
3510 | cond_resched(); | ||
3470 | goto retry; | 3511 | goto retry; |
3471 | 3512 | ||
3472 | /* | 3513 | default: |
3473 | * Wake robust non-PI futexes here. The wakeup of | 3514 | WARN_ON_ONCE(1); |
3474 | * PI futexes happens in exit_pi_state(): | 3515 | return err; |
3475 | */ | 3516 | } |
3476 | if (!pi && (uval & FUTEX_WAITERS)) | ||
3477 | futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); | ||
3478 | } | 3517 | } |
3518 | |||
3519 | if (nval != uval) | ||
3520 | goto retry; | ||
3521 | |||
3522 | /* | ||
3523 | * Wake robust non-PI futexes here. The wakeup of | ||
3524 | * PI futexes happens in exit_pi_state(): | ||
3525 | */ | ||
3526 | if (!pi && (uval & FUTEX_WAITERS)) | ||
3527 | futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); | ||
3528 | |||
3479 | return 0; | 3529 | return 0; |
3480 | } | 3530 | } |
3481 | 3531 | ||
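Most of the futex hunks above follow the same shape: cmpxchg_futex_value_locked() and futex_atomic_op_inuser() now return -EFAULT or -EAGAIN, and each caller drops its locks, handles the specific error, and retries. A condensed sketch of that dispatch, ignoring the hb-lock/pi_mutex dropping the real callers perform (update_futex_word is an invented wrapper as it would sit in kernel/futex.c):

/*
 * Sketch of the shared retry policy:
 *   -EFAULT -> fault the word in writable and retry,
 *   -EAGAIN -> the arch helper asked us to back off; reschedule and retry.
 */
static int update_futex_word(u32 __user *uaddr, u32 uval, u32 newval)
{
	u32 curval;
	int err;

retry:
	err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
	if (err) {
		switch (err) {
		case -EFAULT:
			if (fault_in_user_writeable(uaddr))
				return -EFAULT;
			goto retry;
		case -EAGAIN:
			cond_resched();
			goto retry;
		default:
			WARN_ON_ONCE(1);
			return err;
		}
	}

	return curval == uval ? 0 : -EAGAIN;	/* value changed under us */
}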
diff --git a/kernel/gen_ikh_data.sh b/kernel/gen_ikh_data.sh new file mode 100755 index 000000000000..591a94f7b387 --- /dev/null +++ b/kernel/gen_ikh_data.sh | |||
@@ -0,0 +1,89 @@ | |||
1 | #!/bin/bash | ||
2 | # SPDX-License-Identifier: GPL-2.0 | ||
3 | |||
4 | # This script generates an archive consisting of kernel headers | ||
5 | # for CONFIG_IKHEADERS_PROC. | ||
6 | set -e | ||
7 | spath="$(dirname "$(readlink -f "$0")")" | ||
8 | kroot="$spath/.." | ||
9 | outdir="$(pwd)" | ||
10 | tarfile=$1 | ||
11 | cpio_dir=$outdir/$tarfile.tmp | ||
12 | |||
13 | # Script filename relative to the kernel source root | ||
14 | # We add it to the archive because it is small and any changes | ||
15 | # to this script will also cause a rebuild of the archive. | ||
16 | sfile="$(realpath --relative-to $kroot "$(readlink -f "$0")")" | ||
17 | |||
18 | src_file_list=" | ||
19 | include/ | ||
20 | arch/$SRCARCH/include/ | ||
21 | $sfile | ||
22 | " | ||
23 | |||
24 | obj_file_list=" | ||
25 | include/ | ||
26 | arch/$SRCARCH/include/ | ||
27 | " | ||
28 | |||
29 | # Support incremental builds by skipping archive generation | ||
30 | # if timestamps of files being archived are not changed. | ||
31 | |||
32 | # This block is useful for debugging the incremental builds. | ||
33 | # Uncomment it for debugging. | ||
34 | # iter=1 | ||
35 | # if [ ! -f /tmp/iter ]; then echo 1 > /tmp/iter; | ||
36 | # else; iter=$(($(cat /tmp/iter) + 1)); fi | ||
37 | # find $src_file_list -type f | xargs ls -lR > /tmp/src-ls-$iter | ||
38 | # find $obj_file_list -type f | xargs ls -lR > /tmp/obj-ls-$iter | ||
39 | |||
40 | # include/generated/compile.h is ignored because it is touched even when none | ||
41 | # of the source files changed. This causes pointless regeneration, so let us | ||
42 | # ignore them for md5 calculation. | ||
43 | pushd $kroot > /dev/null | ||
44 | src_files_md5="$(find $src_file_list -type f | | ||
45 | grep -v "include/generated/compile.h" | | ||
46 | xargs ls -lR | md5sum | cut -d ' ' -f1)" | ||
47 | popd > /dev/null | ||
48 | obj_files_md5="$(find $obj_file_list -type f | | ||
49 | grep -v "include/generated/compile.h" | | ||
50 | xargs ls -lR | md5sum | cut -d ' ' -f1)" | ||
51 | |||
52 | if [ -f $tarfile ]; then tarfile_md5="$(md5sum $tarfile | cut -d ' ' -f1)"; fi | ||
53 | if [ -f kernel/kheaders.md5 ] && | ||
54 | [ "$(cat kernel/kheaders.md5|head -1)" == "$src_files_md5" ] && | ||
55 | [ "$(cat kernel/kheaders.md5|head -2|tail -1)" == "$obj_files_md5" ] && | ||
56 | [ "$(cat kernel/kheaders.md5|tail -1)" == "$tarfile_md5" ]; then | ||
57 | exit | ||
58 | fi | ||
59 | |||
60 | if [ "${quiet}" != "silent_" ]; then | ||
61 | echo " GEN $tarfile" | ||
62 | fi | ||
63 | |||
64 | rm -rf $cpio_dir | ||
65 | mkdir $cpio_dir | ||
66 | |||
67 | pushd $kroot > /dev/null | ||
68 | for f in $src_file_list; | ||
69 | do find "$f" ! -name "*.cmd" ! -name ".*"; | ||
70 | done | cpio --quiet -pd $cpio_dir | ||
71 | popd > /dev/null | ||
72 | |||
73 | # The second CPIO can complain if files already exist which can | ||
74 | # happen with out of tree builds. Just silence CPIO for now. | ||
75 | for f in $obj_file_list; | ||
76 | do find "$f" ! -name "*.cmd" ! -name ".*"; | ||
77 | done | cpio --quiet -pd $cpio_dir >/dev/null 2>&1 | ||
78 | |||
79 | # Remove comments except SPDX lines | ||
80 | find $cpio_dir -type f -print0 | | ||
81 | xargs -0 -P8 -n1 perl -pi -e 'BEGIN {undef $/;}; s/\/\*((?!SPDX).)*?\*\///smg;' | ||
82 | |||
83 | tar -Jcf $tarfile -C $cpio_dir/ . > /dev/null | ||
84 | |||
85 | echo "$src_files_md5" > kernel/kheaders.md5 | ||
86 | echo "$obj_files_md5" >> kernel/kheaders.md5 | ||
87 | echo "$(md5sum $tarfile | cut -d ' ' -f1)" >> kernel/kheaders.md5 | ||
88 | |||
89 | rm -rf $cpio_dir | ||
diff --git a/kernel/iomem.c b/kernel/iomem.c index f7525e14ebc6..93c264444510 100644 --- a/kernel/iomem.c +++ b/kernel/iomem.c | |||
@@ -55,7 +55,7 @@ static void *try_ram_remap(resource_size_t offset, size_t size, | |||
55 | * | 55 | * |
56 | * MEMREMAP_WB - matches the default mapping for System RAM on | 56 | * MEMREMAP_WB - matches the default mapping for System RAM on |
57 | * the architecture. This is usually a read-allocate write-back cache. | 57 | * the architecture. This is usually a read-allocate write-back cache. |
58 | * Morever, if MEMREMAP_WB is specified and the requested remap region is RAM | 58 | * Moreover, if MEMREMAP_WB is specified and the requested remap region is RAM |
59 | * memremap() will bypass establishing a new mapping and instead return | 59 | * memremap() will bypass establishing a new mapping and instead return |
60 | * a pointer into the direct map. | 60 | * a pointer into the direct map. |
61 | * | 61 | * |
@@ -86,7 +86,7 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags) | |||
86 | /* Try all mapping types requested until one returns non-NULL */ | 86 | /* Try all mapping types requested until one returns non-NULL */ |
87 | if (flags & MEMREMAP_WB) { | 87 | if (flags & MEMREMAP_WB) { |
88 | /* | 88 | /* |
89 | * MEMREMAP_WB is special in that it can be satisifed | 89 | * MEMREMAP_WB is special in that it can be satisfied |
90 | * from the direct map. Some archs depend on the | 90 | * from the direct map. Some archs depend on the |
91 | * capability of memremap() to autodetect cases where | 91 | * capability of memremap() to autodetect cases where |
92 | * the requested range is potentially in System RAM. | 92 | * the requested range is potentially in System RAM. |
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 3faef4a77f71..51128bea3846 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c | |||
@@ -1449,6 +1449,10 @@ int irq_chip_set_vcpu_affinity_parent(struct irq_data *data, void *vcpu_info) | |||
1449 | int irq_chip_set_wake_parent(struct irq_data *data, unsigned int on) | 1449 | int irq_chip_set_wake_parent(struct irq_data *data, unsigned int on) |
1450 | { | 1450 | { |
1451 | data = data->parent_data; | 1451 | data = data->parent_data; |
1452 | |||
1453 | if (data->chip->flags & IRQCHIP_SKIP_SET_WAKE) | ||
1454 | return 0; | ||
1455 | |||
1452 | if (data->chip->irq_set_wake) | 1456 | if (data->chip->irq_set_wake) |
1453 | return data->chip->irq_set_wake(data, on); | 1457 | return data->chip->irq_set_wake(data, on); |
1454 | 1458 | ||
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index 516c00a5e867..c1eccd4f6520 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c | |||
@@ -152,7 +152,7 @@ static int irq_debug_show(struct seq_file *m, void *p) | |||
152 | 152 | ||
153 | raw_spin_lock_irq(&desc->lock); | 153 | raw_spin_lock_irq(&desc->lock); |
154 | data = irq_desc_get_irq_data(desc); | 154 | data = irq_desc_get_irq_data(desc); |
155 | seq_printf(m, "handler: %pf\n", desc->handle_irq); | 155 | seq_printf(m, "handler: %ps\n", desc->handle_irq); |
156 | seq_printf(m, "device: %s\n", desc->dev_name); | 156 | seq_printf(m, "device: %s\n", desc->dev_name); |
157 | seq_printf(m, "status: 0x%08x\n", desc->status_use_accessors); | 157 | seq_printf(m, "status: 0x%08x\n", desc->status_use_accessors); |
158 | irq_debug_show_bits(m, 0, desc->status_use_accessors, irqdesc_states, | 158 | irq_debug_show_bits(m, 0, desc->status_use_accessors, irqdesc_states, |
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index 5d5378ea0afe..f6e5515ee077 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c | |||
@@ -84,8 +84,6 @@ EXPORT_SYMBOL(devm_request_threaded_irq); | |||
84 | * @dev: device to request interrupt for | 84 | * @dev: device to request interrupt for |
85 | * @irq: Interrupt line to allocate | 85 | * @irq: Interrupt line to allocate |
86 | * @handler: Function to be called when the IRQ occurs | 86 | * @handler: Function to be called when the IRQ occurs |
87 | * @thread_fn: function to be called in a threaded interrupt context. NULL | ||
88 | * for devices which handle everything in @handler | ||
89 | * @irqflags: Interrupt type flags | 87 | * @irqflags: Interrupt type flags |
90 | * @devname: An ascii name for the claiming device, dev_name(dev) if NULL | 88 | * @devname: An ascii name for the claiming device, dev_name(dev) if NULL |
91 | * @dev_id: A cookie passed back to the handler function | 89 | * @dev_id: A cookie passed back to the handler function |
@@ -222,9 +220,8 @@ devm_irq_alloc_generic_chip(struct device *dev, const char *name, int num_ct, | |||
222 | irq_flow_handler_t handler) | 220 | irq_flow_handler_t handler) |
223 | { | 221 | { |
224 | struct irq_chip_generic *gc; | 222 | struct irq_chip_generic *gc; |
225 | unsigned long sz = sizeof(*gc) + num_ct * sizeof(struct irq_chip_type); | ||
226 | 223 | ||
227 | gc = devm_kzalloc(dev, sz, GFP_KERNEL); | 224 | gc = devm_kzalloc(dev, struct_size(gc, chip_types, num_ct), GFP_KERNEL); |
228 | if (gc) | 225 | if (gc) |
229 | irq_init_generic_chip(gc, name, num_ct, | 226 | irq_init_generic_chip(gc, name, num_ct, |
230 | irq_base, reg_base, handler); | 227 | irq_base, reg_base, handler); |
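The devres hunk swaps an open-coded size computation for struct_size() from <linux/overflow.h>, which saturates instead of wrapping on overflow. A small sketch of the idiom with a made-up structure:

#include <linux/device.h>
#include <linux/overflow.h>
#include <linux/slab.h>
#include <linux/types.h>

struct demo {				/* invented type with a flexible array */
	int count;
	u32 items[];
};

static struct demo *demo_alloc(struct device *dev, int nr)
{
	struct demo *d;

	/*
	 * struct_size(d, items, nr) == sizeof(*d) + nr * sizeof(d->items[0]),
	 * except the multiply and add are overflow-checked and saturate to
	 * SIZE_MAX, so the allocation fails instead of being undersized.
	 */
	d = devm_kzalloc(dev, struct_size(d, items, nr), GFP_KERNEL);
	if (d)
		d->count = nr;
	return d;
}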
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 6df5ddfdb0f8..a4ace611f47f 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c | |||
@@ -149,7 +149,7 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags | |||
149 | res = action->handler(irq, action->dev_id); | 149 | res = action->handler(irq, action->dev_id); |
150 | trace_irq_handler_exit(irq, action, res); | 150 | trace_irq_handler_exit(irq, action, res); |
151 | 151 | ||
152 | if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pF enabled interrupts\n", | 152 | if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pS enabled interrupts\n", |
153 | irq, action->handler)) | 153 | irq, action->handler)) |
154 | local_irq_disable(); | 154 | local_irq_disable(); |
155 | 155 | ||
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 13539e12cd80..c52b737ab8e3 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c | |||
@@ -275,11 +275,12 @@ static struct attribute *irq_attrs[] = { | |||
275 | &actions_attr.attr, | 275 | &actions_attr.attr, |
276 | NULL | 276 | NULL |
277 | }; | 277 | }; |
278 | ATTRIBUTE_GROUPS(irq); | ||
278 | 279 | ||
279 | static struct kobj_type irq_kobj_type = { | 280 | static struct kobj_type irq_kobj_type = { |
280 | .release = irq_kobj_release, | 281 | .release = irq_kobj_release, |
281 | .sysfs_ops = &kobj_sysfs_ops, | 282 | .sysfs_ops = &kobj_sysfs_ops, |
282 | .default_attrs = irq_attrs, | 283 | .default_groups = irq_groups, |
283 | }; | 284 | }; |
284 | 285 | ||
285 | static void irq_sysfs_add(int irq, struct irq_desc *desc) | 286 | static void irq_sysfs_add(int irq, struct irq_desc *desc) |
@@ -558,6 +559,7 @@ int __init early_irq_init(void) | |||
558 | alloc_masks(&desc[i], node); | 559 | alloc_masks(&desc[i], node); |
559 | raw_spin_lock_init(&desc[i].lock); | 560 | raw_spin_lock_init(&desc[i].lock); |
560 | lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); | 561 | lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); |
562 | mutex_init(&desc[i].request_mutex); | ||
561 | desc_set_defaults(i, &desc[i], node, NULL, NULL); | 563 | desc_set_defaults(i, &desc[i], node, NULL, NULL); |
562 | } | 564 | } |
563 | return arch_early_irq_init(); | 565 | return arch_early_irq_init(); |
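The irqdesc.c change moves from kobj_type::default_attrs to default_groups, with ATTRIBUTE_GROUPS(irq) building the group array from the existing irq_attrs[]. Roughly, and simplified from the real macro in <linux/sysfs.h>, that expands to something like:

/* Simplified illustration of what ATTRIBUTE_GROUPS(irq) provides. */
static const struct attribute_group irq_group = {
	.attrs = irq_attrs,		/* the existing attribute array */
};

static const struct attribute_group *irq_groups[] = {
	&irq_group,
	NULL,
};

The conversion lets the kobject core deal exclusively in attribute groups rather than a flat default_attrs list; the visible sysfs layout stays the same.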
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 9ec34a2a6638..78f3ddeb7fe4 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
@@ -196,6 +196,7 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, | |||
196 | case IRQ_SET_MASK_OK: | 196 | case IRQ_SET_MASK_OK: |
197 | case IRQ_SET_MASK_OK_DONE: | 197 | case IRQ_SET_MASK_OK_DONE: |
198 | cpumask_copy(desc->irq_common_data.affinity, mask); | 198 | cpumask_copy(desc->irq_common_data.affinity, mask); |
199 | /* fall through */ | ||
199 | case IRQ_SET_MASK_OK_NOCOPY: | 200 | case IRQ_SET_MASK_OK_NOCOPY: |
200 | irq_validate_effective_affinity(data); | 201 | irq_validate_effective_affinity(data); |
201 | irq_set_thread_affinity(desc); | 202 | irq_set_thread_affinity(desc); |
@@ -356,8 +357,10 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) | |||
356 | desc->affinity_notify = notify; | 357 | desc->affinity_notify = notify; |
357 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 358 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
358 | 359 | ||
359 | if (old_notify) | 360 | if (old_notify) { |
361 | cancel_work_sync(&old_notify->work); | ||
360 | kref_put(&old_notify->kref, old_notify->release); | 362 | kref_put(&old_notify->kref, old_notify->release); |
363 | } | ||
361 | 364 | ||
362 | return 0; | 365 | return 0; |
363 | } | 366 | } |
@@ -778,7 +781,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned long flags) | |||
778 | ret = 0; | 781 | ret = 0; |
779 | break; | 782 | break; |
780 | default: | 783 | default: |
781 | pr_err("Setting trigger mode %lu for irq %u failed (%pF)\n", | 784 | pr_err("Setting trigger mode %lu for irq %u failed (%pS)\n", |
782 | flags, irq_desc_get_irq(desc), chip->irq_set_type); | 785 | flags, irq_desc_get_irq(desc), chip->irq_set_type); |
783 | } | 786 | } |
784 | if (unmask) | 787 | if (unmask) |
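The irq_set_affinity_notifier() hunk cancels any queued notify work before dropping the old notifier's reference, so the work cannot run against a freed object. The same teardown ordering in generic form (demo_notify is a made-up structure):

#include <linux/kref.h>
#include <linux/workqueue.h>

struct demo_notify {
	struct kref kref;
	struct work_struct work;
};

static void demo_notify_release(struct kref *kref) { /* kfree(container_of(...)) */ }

static void demo_notify_teardown(struct demo_notify *n)
{
	/* Wait for any queued or in-flight work first... */
	cancel_work_sync(&n->work);
	/* ...then it is safe to drop the reference the work relied on. */
	kref_put(&n->kref, demo_notify_release);
}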
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 6d2fa6914b30..2ed97a7c9b2a 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c | |||
@@ -212,9 +212,9 @@ static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret) | |||
212 | */ | 212 | */ |
213 | raw_spin_lock_irqsave(&desc->lock, flags); | 213 | raw_spin_lock_irqsave(&desc->lock, flags); |
214 | for_each_action_of_desc(desc, action) { | 214 | for_each_action_of_desc(desc, action) { |
215 | printk(KERN_ERR "[<%p>] %pf", action->handler, action->handler); | 215 | printk(KERN_ERR "[<%p>] %ps", action->handler, action->handler); |
216 | if (action->thread_fn) | 216 | if (action->thread_fn) |
217 | printk(KERN_CONT " threaded [<%p>] %pf", | 217 | printk(KERN_CONT " threaded [<%p>] %ps", |
218 | action->thread_fn, action->thread_fn); | 218 | action->thread_fn, action->thread_fn); |
219 | printk(KERN_CONT "\n"); | 219 | printk(KERN_CONT "\n"); |
220 | } | 220 | } |
diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c index 1e4cb63a5c82..90c735da15d0 100644 --- a/kernel/irq/timings.c +++ b/kernel/irq/timings.c | |||
@@ -9,6 +9,7 @@ | |||
9 | #include <linux/idr.h> | 9 | #include <linux/idr.h> |
10 | #include <linux/irq.h> | 10 | #include <linux/irq.h> |
11 | #include <linux/math64.h> | 11 | #include <linux/math64.h> |
12 | #include <linux/log2.h> | ||
12 | 13 | ||
13 | #include <trace/events/irq.h> | 14 | #include <trace/events/irq.h> |
14 | 15 | ||
@@ -18,16 +19,6 @@ DEFINE_STATIC_KEY_FALSE(irq_timing_enabled); | |||
18 | 19 | ||
19 | DEFINE_PER_CPU(struct irq_timings, irq_timings); | 20 | DEFINE_PER_CPU(struct irq_timings, irq_timings); |
20 | 21 | ||
21 | struct irqt_stat { | ||
22 | u64 next_evt; | ||
23 | u64 last_ts; | ||
24 | u64 variance; | ||
25 | u32 avg; | ||
26 | u32 nr_samples; | ||
27 | int anomalies; | ||
28 | int valid; | ||
29 | }; | ||
30 | |||
31 | static DEFINE_IDR(irqt_stats); | 22 | static DEFINE_IDR(irqt_stats); |
32 | 23 | ||
33 | void irq_timings_enable(void) | 24 | void irq_timings_enable(void) |
@@ -40,75 +31,360 @@ void irq_timings_disable(void) | |||
40 | static_branch_disable(&irq_timing_enabled); | 31 | static_branch_disable(&irq_timing_enabled); |
41 | } | 32 | } |
42 | 33 | ||
43 | /** | 34 | /* |
44 | * irqs_update - update the irq timing statistics with a new timestamp | 35 | * The main goal of this algorithm is to predict the next interrupt |
36 | * occurrence on the current CPU. | ||
37 | * | ||
38 | * Currently, the interrupt timings are stored in a circular array | ||
39 | * buffer every time there is an interrupt, as a tuple: the interrupt | ||
40 | * number and the associated timestamp when the event occurred <irq, | ||
41 | * timestamp>. | ||
42 | * | ||
43 | * For every interrupt occurring in a short period of time, we can | ||
44 | * measure the elapsed time between the occurrences for the same | ||
45 | * interrupt and we end up with a suite of intervals. Experience | ||
46 | * has shown that interrupts often follow a periodic | ||
47 | * pattern. | ||
48 | * | ||
49 | * The objective of the algorithm is to find out this periodic pattern | ||
50 | * in the fastest way possible and use its period to predict the next irq event. | ||
51 | * | ||
52 | * When the next interrupt event is requested, we are in the situation | ||
53 | * where the interrupts are disabled and the circular buffer | ||
54 | * containing the timings is filled with the events which happened | ||
55 | * after the previous next-interrupt-event request. | ||
56 | * | ||
57 | * At this point, we read the circular buffer and we fill the irq | ||
58 | * related statistics structure. After this step, the circular array | ||
59 | * containing the timings is empty because all the values are | ||
60 | * dispatched in their corresponding buffers. | ||
61 | * | ||
62 | * Now for each interrupt, we can predict the next event by using the | ||
63 | * suffix array, log interval and exponential moving average | ||
64 | * | ||
65 | * 1. Suffix array | ||
66 | * | ||
67 | * A suffix array is an array of all the suffixes of a string. It is | ||
68 | * widely used as a data structure for compression, text search, ... | ||
69 | * For instance for the word 'banana', the suffixes will be: 'banana' | ||
70 | * 'anana' 'nana' 'ana' 'na' 'a' | ||
71 | * | ||
72 | * Usually, the suffix array is sorted but for our purpose it is | ||
73 | * not necessary and won't provide any improvement in the context of | ||
74 | * the solved problem where we clearly define the boundaries of the | ||
75 | * search by a max period and min period. | ||
76 | * | ||
77 | * The suffix array will build a suite of intervals of different | ||
78 | * length and will look for the repetition of each suite. If the suite | ||
79 | * is repeating then we have the period because it is the length of | ||
80 | * the suite whatever its position in the buffer. | ||
81 | * | ||
82 | * 2. Log interval | ||
83 | * | ||
84 | * We saw that the irq timings allow us to compute the interval between | ||
85 | * occurrences of a specific interrupt. We can reasonably assume that the | ||
86 | * longer the interval, the higher the error for the next event, | ||
87 | * so we can consider storing those interval values into an array | ||
88 | * where each slot in the array corresponds to an interval at the power | ||
89 | * of 2 of the index. For example, index 12 will contain values | ||
90 | * between 2^11 and 2^12. | ||
91 | * | ||
92 | * At the end we have an array of values where each index defines a | ||
93 | * [2^(index-1), 2^index] interval of values, allowing us to store a large | ||
94 | * number of values inside a small array. | ||
95 | * | ||
96 | * For example, if we have the value 1123, then we store it at | ||
97 | * ilog2(1123) = 10 index value. | ||
98 | * | ||
99 | * Storing those values at the specific index is done by computing an | ||
100 | * exponential moving average for this specific slot. For instance, | ||
101 | * the values 1800, 1123, 1453, ... all fall under the same slot (10) and | ||
102 | * the exponential moving average is computed every time a new value | ||
103 | * is stored at this slot. | ||
104 | * | ||
105 | * 3. Exponential Moving Average | ||
106 | * | ||
107 | * The EMA is largely used to track a signal for stocks or as a low | ||
108 | * pass filter. The magic of the formula is that it is very simple and the | ||
109 | * reactivity of the average can be tuned with the factors called | ||
110 | * alpha. | ||
111 | * | ||
112 | * The higher the alphas are, the faster the average responds to the | ||
113 | * signal change. In our case, if a slot in the array is a big | ||
114 | * interval, we can have numbers with a big difference between | ||
115 | * them. The impact of those differences in the average computation | ||
116 | * can be tuned by changing the alpha value. | ||
117 | * | ||
118 | * | ||
119 | * -- The algorithm -- | ||
120 | * | ||
121 | * We saw the different processing above, now let's see how they are | ||
122 | * used together. | ||
123 | * | ||
124 | * For each interrupt: | ||
125 | * For each interval: | ||
126 | * Compute the index = ilog2(interval) | ||
127 | * Compute a new_ema(buffer[index], interval) | ||
128 | * Store the index in a circular buffer | ||
129 | * | ||
130 | * Compute the suffix array of the indexes | ||
131 | * | ||
132 | * For each suffix: | ||
133 | * If the suffix is reverse-found 3 times | ||
134 | * Return suffix | ||
135 | * | ||
136 | * Return Not found | ||
137 | * | ||
138 | * However, we cannot build an endless suffix array; it wouldn't | ||
139 | * make sense and it would add extra overhead, so we restrict | ||
140 | * this to a maximum suffix length of 5 and a minimum suffix length of | ||
141 | * 2. Experience showed that 5 covers the majority of the maximum pattern | ||
142 | * periods found for different devices. | ||
143 | * | ||
144 | * The result is a pattern search that takes less than 1us for an interrupt. | ||
45 | * | 145 | * |
46 | * @irqs: an irqt_stat struct pointer | 146 | * Example based on real values: |
47 | * @ts: the new timestamp | ||
48 | * | 147 | * |
49 | * The statistics are computed online, in other words, the code is | 148 | * Example 1 : MMC write/read interrupt interval: |
50 | * designed to compute the statistics on a stream of values rather | ||
51 | * than doing multiple passes on the values to compute the average, | ||
52 | * then the variance. The integer division introduces a loss of | ||
53 | * precision but with an acceptable error margin regarding the results | ||
54 | * we would have with the double floating precision: we are dealing | ||
55 | * with nanosec, so big numbers, consequently the mantisse is | ||
56 | * negligeable, especially when converting the time in usec | ||
57 | * afterwards. | ||
58 | * | 149 | * |
59 | * The computation happens at idle time. When the CPU is not idle, the | 150 | * 223947, 1240, 1384, 1386, 1386, |
60 | * interrupts' timestamps are stored in the circular buffer, when the | 151 | * 217416, 1236, 1384, 1386, 1387, |
61 | * CPU goes idle and this routine is called, all the buffer's values | 152 | * 214719, 1241, 1386, 1387, 1384, |
62 | * are injected in the statistical model continuying to extend the | 153 | * 213696, 1234, 1384, 1386, 1388, |
63 | * statistics from the previous busy-idle cycle. | 154 | * 219904, 1240, 1385, 1389, 1385, |
155 | * 212240, 1240, 1386, 1386, 1386, | ||
156 | * 214415, 1236, 1384, 1386, 1387, | ||
157 | * 214276, 1234, 1384, 1388, ? | ||
64 | * | 158 | * |
65 | * The observations showed a device will trigger a burst of periodic | 159 | * For each element, apply ilog2(value) |
66 | * interrupts followed by one or two peaks of longer time, for | ||
67 | * instance when a SD card device flushes its cache, then the periodic | ||
68 | * intervals occur again. A one second inactivity period resets the | ||
69 | * stats, that gives us the certitude the statistical values won't | ||
70 | * exceed 1x10^9, thus the computation won't overflow. | ||
71 | * | 160 | * |
72 | * Basically, the purpose of the algorithm is to watch the periodic | 161 | * 15, 8, 8, 8, 8, |
73 | * interrupts and eliminate the peaks. | 162 | * 15, 8, 8, 8, 8, |
163 | * 15, 8, 8, 8, 8, | ||
164 | * 15, 8, 8, 8, 8, | ||
165 | * 15, 8, 8, 8, 8, | ||
166 | * 15, 8, 8, 8, 8, | ||
167 | * 15, 8, 8, 8, 8, | ||
168 | * 15, 8, 8, 8, ? | ||
74 | * | 169 | * |
75 | * An interrupt is considered periodically stable if the interval of | 170 | * Max period of 5: we take the last (max_period * 3) = 15 elements; |
76 | * its occurences follow the normal distribution, thus the values | 171 | * if the pattern repeats itself three times, we can be confident |
77 | * comply with: | 172 | * it is a repeating pattern. |
78 | * | 173 | * |
79 | * avg - 3 x stddev < value < avg + 3 x stddev | 174 | * 8, |
175 | * 15, 8, 8, 8, 8, | ||
176 | * 15, 8, 8, 8, 8, | ||
177 | * 15, 8, 8, 8, ? | ||
80 | * | 178 | * |
81 | * Which can be simplified to: | 179 | * Suffixes are: |
82 | * | 180 | * |
83 | * -3 x stddev < value - avg < 3 x stddev | 181 | * 1) 8, 15, 8, 8, 8 <- max period |
182 | * 2) 8, 15, 8, 8 | ||
183 | * 3) 8, 15, 8 | ||
184 | * 4) 8, 15 <- min period | ||
84 | * | 185 | * |
85 | * abs(value - avg) < 3 x stddev | 186 | * From there we search the repeating pattern for each suffix. |
86 | * | 187 | * |
87 | * In order to save a costly square root computation, we use the | 188 | * buffer: 8, 15, 8, 8, 8, 8, 15, 8, 8, 8, 8, 15, 8, 8, 8 |
88 | * variance. For the record, stddev = sqrt(variance). The equation | 189 | * | | | | | | | | | | | | | | | |
89 | * above becomes: | 190 | * 8, 15, 8, 8, 8 | | | | | | | | | | |
191 | * 8, 15, 8, 8, 8 | | | | | | ||
192 | * 8, 15, 8, 8, 8 | ||
90 | * | 193 | * |
91 | * abs(value - avg) < 3 x sqrt(variance) | 194 | * When moving the suffix, we found exactly 3 matches. |
92 | * | 195 | * |
93 | * And finally we square it: | 196 | * The first suffix with period 5 is repeating. |
94 | * | 197 | * |
95 | * (value - avg) ^ 2 < (3 x sqrt(variance)) ^ 2 | 198 | * The next event is (3 * max_period) % suffix_period |
96 | * | 199 | * |
97 | * (value - avg) x (value - avg) < 9 x variance | 200 | * In this example, the result is 0, so the next event is suffix[0] => 8 |
98 | * | 201 | * |
99 | * Statistically speaking, any values out of this interval is | 202 | * However, 8 is the index into the array of exponential moving |
100 | * considered as an anomaly and is discarded. However, a normal | 203 | * averages, which were calculated on the fly when storing the values, |
101 | * distribution appears when the number of samples is 30 (it is the | 204 | * so the interval is ema[8] = 1366 |
102 | * rule of thumb in statistics, cf. "30 samples" on Internet). When | ||
103 | * there are three consecutive anomalies, the statistics are resetted. | ||
104 | * | 205 | * |
206 | * | ||
207 | * Example 2: | ||
208 | * | ||
209 | * 4, 3, 5, 100, | ||
210 | * 3, 3, 5, 117, | ||
211 | * 4, 4, 5, 112, | ||
212 | * 4, 3, 4, 110, | ||
213 | * 3, 5, 3, 117, | ||
214 | * 4, 4, 5, 112, | ||
215 | * 4, 3, 4, 110, | ||
216 | * 3, 4, 5, 112, | ||
217 | * 4, 3, 4, 110 | ||
218 | * | ||
219 | * ilog2 | ||
220 | * | ||
221 | * 0, 0, 0, 4, | ||
222 | * 0, 0, 0, 4, | ||
223 | * 0, 0, 0, 4, | ||
224 | * 0, 0, 0, 4, | ||
225 | * 0, 0, 0, 4, | ||
226 | * 0, 0, 0, 4, | ||
227 | * 0, 0, 0, 4, | ||
228 | * 0, 0, 0, 4, | ||
229 | * 0, 0, 0, 4 | ||
230 | * | ||
231 | * Max period 5: | ||
232 | * 0, 0, 4, | ||
233 | * 0, 0, 0, 4, | ||
234 | * 0, 0, 0, 4, | ||
235 | * 0, 0, 0, 4 | ||
236 | * | ||
237 | * Suffixes: | ||
238 | * | ||
239 | * 1) 0, 0, 4, 0, 0 | ||
240 | * 2) 0, 0, 4, 0 | ||
241 | * 3) 0, 0, 4 | ||
242 | * 4) 0, 0 | ||
243 | * | ||
244 | * buffer: 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4 | ||
245 | * | | | | | | X | ||
246 | * 0, 0, 4, 0, 0, | X | ||
247 | * 0, 0 | ||
248 | * | ||
249 | * buffer: 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4 | ||
250 | * | | | | | | | | | | | | | | | | ||
251 | * 0, 0, 4, 0, | | | | | | | | | | | | ||
252 | * 0, 0, 4, 0, | | | | | | | | ||
253 | * 0, 0, 4, 0, | | | | ||
254 | * 0 0 4 | ||
255 | * | ||
256 | * The pattern is found 3 times, the remainder is 1 which results from | ||
257 | * (max_period * 3) % suffix_period. This value is the index in the | ||
258 | * suffix array. The suffix array for a period of 4 has the value 4 | ||
259 | * at index 1. | ||
260 | */ | ||
261 | #define EMA_ALPHA_VAL 64 | ||
262 | #define EMA_ALPHA_SHIFT 7 | ||
263 | |||
264 | #define PREDICTION_PERIOD_MIN 2 | ||
265 | #define PREDICTION_PERIOD_MAX 5 | ||
266 | #define PREDICTION_FACTOR 4 | ||
267 | #define PREDICTION_MAX 10 /* 2 ^ PREDICTION_MAX useconds */ | ||
268 | #define PREDICTION_BUFFER_SIZE 16 /* slots for EMAs, hardly more than 16 */ | ||
269 | |||
270 | struct irqt_stat { | ||
271 | u64 last_ts; | ||
272 | u64 ema_time[PREDICTION_BUFFER_SIZE]; | ||
273 | int timings[IRQ_TIMINGS_SIZE]; | ||
274 | int circ_timings[IRQ_TIMINGS_SIZE]; | ||
275 | int count; | ||
276 | }; | ||
277 | |||
278 | /* | ||
279 | * Exponential moving average computation | ||
105 | */ | 280 | */ |
106 | static void irqs_update(struct irqt_stat *irqs, u64 ts) | 281 | static u64 irq_timings_ema_new(u64 value, u64 ema_old) |
282 | { | ||
283 | s64 diff; | ||
284 | |||
285 | if (unlikely(!ema_old)) | ||
286 | return value; | ||
287 | |||
288 | diff = (value - ema_old) * EMA_ALPHA_VAL; | ||
289 | /* | ||
290 | * We can add a s64 variable to the u64 ema_old variable | ||
291 | * because the latter will never have its topmost bit set: | ||
292 | * it will always be smaller than a 2^63 nanosec interrupt | ||
293 | * interval (292 years). | ||
294 | */ | ||
295 | return ema_old + (diff >> EMA_ALPHA_SHIFT); | ||
296 | } | ||
297 | |||
298 | static int irq_timings_next_event_index(int *buffer, size_t len, int period_max) | ||
299 | { | ||
300 | int i; | ||
301 | |||
302 | /* | ||
303 | * The buffer contains the sequence of intervals, on an ilog2 | ||
304 | * basis, in which we are looking for a repetition. We point | ||
305 | * the beginning of the search at three times the length of | ||
306 | * the period, counted from the end of the buffer. We do that | ||
307 | * for each suffix. | ||
308 | */ | ||
309 | for (i = period_max; i >= PREDICTION_PERIOD_MIN ; i--) { | ||
310 | |||
311 | int *begin = &buffer[len - (i * 3)]; | ||
312 | int *ptr = begin; | ||
313 | |||
314 | /* | ||
315 | * We check whether the sequence with period 'i' | ||
316 | * repeats itself. If it is truncated at the end, as | ||
317 | * it repeats we can use the period to find out the | ||
318 | * next element. | ||
319 | */ | ||
320 | while (!memcmp(ptr, begin, i * sizeof(*ptr))) { | ||
321 | ptr += i; | ||
322 | if (ptr >= &buffer[len]) | ||
323 | return begin[((i * 3) % i)]; | ||
324 | } | ||
325 | } | ||
326 | |||
327 | return -1; | ||
328 | } | ||
329 | |||
330 | static u64 __irq_timings_next_event(struct irqt_stat *irqs, int irq, u64 now) | ||
331 | { | ||
332 | int index, i, period_max, count, start, min = INT_MAX; | ||
333 | |||
334 | if ((now - irqs->last_ts) >= NSEC_PER_SEC) { | ||
335 | irqs->count = irqs->last_ts = 0; | ||
336 | return U64_MAX; | ||
337 | } | ||
338 | |||
339 | /* | ||
340 | * As we want to find the repetition three times, we need a | ||
341 | * number of intervals greater than or equal to three times | ||
342 | * the maximum period, otherwise we truncate the max period. | ||
343 | */ | ||
344 | period_max = irqs->count > (3 * PREDICTION_PERIOD_MAX) ? | ||
345 | PREDICTION_PERIOD_MAX : irqs->count / 3; | ||
346 | |||
347 | /* | ||
348 | * If we don't have enough irq timings for this prediction, | ||
349 | * just bail out. | ||
350 | */ | ||
351 | if (period_max <= PREDICTION_PERIOD_MIN) | ||
352 | return U64_MAX; | ||
353 | |||
354 | /* | ||
355 | * 'count' depends on whether the circular buffer wrapped or not | ||
356 | */ | ||
357 | count = irqs->count < IRQ_TIMINGS_SIZE ? | ||
358 | irqs->count : IRQ_TIMINGS_SIZE; | ||
359 | |||
360 | start = irqs->count < IRQ_TIMINGS_SIZE ? | ||
361 | 0 : (irqs->count & IRQ_TIMINGS_MASK); | ||
362 | |||
363 | /* | ||
364 | * Copy the content of the circular buffer into another buffer | ||
365 | * in order to linearize the buffer instead of dealing with | ||
366 | * wrapping indexes and a shifted array, which would be error | ||
367 | * prone and extremely difficult to debug. | ||
368 | */ | ||
369 | for (i = 0; i < count; i++) { | ||
370 | int index = (start + i) & IRQ_TIMINGS_MASK; | ||
371 | |||
372 | irqs->timings[i] = irqs->circ_timings[index]; | ||
373 | min = min_t(int, irqs->timings[i], min); | ||
374 | } | ||
375 | |||
376 | index = irq_timings_next_event_index(irqs->timings, count, period_max); | ||
377 | if (index < 0) | ||
378 | return irqs->last_ts + irqs->ema_time[min]; | ||
379 | |||
380 | return irqs->last_ts + irqs->ema_time[index]; | ||
381 | } | ||
382 | |||
383 | static inline void irq_timings_store(int irq, struct irqt_stat *irqs, u64 ts) | ||
107 | { | 384 | { |
108 | u64 old_ts = irqs->last_ts; | 385 | u64 old_ts = irqs->last_ts; |
109 | u64 variance = 0; | ||
110 | u64 interval; | 386 | u64 interval; |
111 | s64 diff; | 387 | int index; |
112 | 388 | ||
113 | /* | 389 | /* |
114 | * The timestamps are absolute time values, we need to compute | 390 | * The timestamps are absolute time values, we need to compute |
@@ -135,87 +411,28 @@ static void irqs_update(struct irqt_stat *irqs, u64 ts) | |||
135 | * want as we need another timestamp to compute an interval. | 411 | * want as we need another timestamp to compute an interval. |
136 | */ | 412 | */ |
137 | if (interval >= NSEC_PER_SEC) { | 413 | if (interval >= NSEC_PER_SEC) { |
138 | memset(irqs, 0, sizeof(*irqs)); | 414 | irqs->count = 0; |
139 | irqs->last_ts = ts; | ||
140 | return; | 415 | return; |
141 | } | 416 | } |
142 | 417 | ||
143 | /* | 418 | /* |
144 | * Pre-compute the delta with the average as the result is | 419 | * Get the index in the ema table for this interrupt. The |
145 | * used several times in this function. | 420 | * PREDICTION_FACTOR increases the interval size for the array |
146 | */ | 421 | * of exponential averages. |
147 | diff = interval - irqs->avg; | ||
148 | |||
149 | /* | ||
150 | * Increment the number of samples. | ||
151 | */ | ||
152 | irqs->nr_samples++; | ||
153 | |||
154 | /* | ||
155 | * Online variance divided by the number of elements if there | ||
156 | * is more than one sample. Normally the formula is division | ||
157 | * by nr_samples - 1 but we assume the number of element will be | ||
158 | * more than 32 and dividing by 32 instead of 31 is enough | ||
159 | * precise. | ||
160 | */ | ||
161 | if (likely(irqs->nr_samples > 1)) | ||
162 | variance = irqs->variance >> IRQ_TIMINGS_SHIFT; | ||
163 | |||
164 | /* | ||
165 | * The rule of thumb in statistics for the normal distribution | ||
166 | * is having at least 30 samples in order to have the model to | ||
167 | * apply. Values outside the interval are considered as an | ||
168 | * anomaly. | ||
169 | */ | ||
170 | if ((irqs->nr_samples >= 30) && ((diff * diff) > (9 * variance))) { | ||
171 | /* | ||
172 | * After three consecutive anomalies, we reset the | ||
173 | * stats as it is no longer stable enough. | ||
174 | */ | ||
175 | if (irqs->anomalies++ >= 3) { | ||
176 | memset(irqs, 0, sizeof(*irqs)); | ||
177 | irqs->last_ts = ts; | ||
178 | return; | ||
179 | } | ||
180 | } else { | ||
181 | /* | ||
182 | * The anomalies must be consecutives, so at this | ||
183 | * point, we reset the anomalies counter. | ||
184 | */ | ||
185 | irqs->anomalies = 0; | ||
186 | } | ||
187 | |||
188 | /* | ||
189 | * The interrupt is considered stable enough to try to predict | ||
190 | * the next event on it. | ||
191 | */ | 422 | */ |
192 | irqs->valid = 1; | 423 | index = likely(interval) ? |
424 | ilog2((interval >> 10) / PREDICTION_FACTOR) : 0; | ||
193 | 425 | ||
194 | /* | 426 | /* |
195 | * Online average algorithm: | 427 | * Store the index as an element of the pattern in another |
196 | * | 428 | * circular array. |
197 | * new_average = average + ((value - average) / count) | ||
198 | * | ||
199 | * The variance computation depends on the new average | ||
200 | * to be computed here first. | ||
201 | * | ||
202 | */ | 429 | */ |
203 | irqs->avg = irqs->avg + (diff >> IRQ_TIMINGS_SHIFT); | 430 | irqs->circ_timings[irqs->count & IRQ_TIMINGS_MASK] = index; |
204 | 431 | ||
205 | /* | 432 | irqs->ema_time[index] = irq_timings_ema_new(interval, |
206 | * Online variance algorithm: | 433 | irqs->ema_time[index]); |
207 | * | ||
208 | * new_variance = variance + (value - average) x (value - new_average) | ||
209 | * | ||
210 | * Warning: irqs->avg is updated with the line above, hence | ||
211 | * 'interval - irqs->avg' is no longer equal to 'diff' | ||
212 | */ | ||
213 | irqs->variance = irqs->variance + (diff * (interval - irqs->avg)); | ||
214 | 434 | ||
215 | /* | 435 | irqs->count++; |
216 | * Update the next event | ||
217 | */ | ||
218 | irqs->next_evt = ts + irqs->avg; | ||
219 | } | 436 | } |
220 | 437 | ||
221 | /** | 438 | /** |
@@ -259,6 +476,9 @@ u64 irq_timings_next_event(u64 now) | |||
259 | */ | 476 | */ |
260 | lockdep_assert_irqs_disabled(); | 477 | lockdep_assert_irqs_disabled(); |
261 | 478 | ||
479 | if (!irqts->count) | ||
480 | return next_evt; | ||
481 | |||
262 | /* | 482 | /* |
263 | * Number of elements in the circular buffer: If it happens it | 483 | * Number of elements in the circular buffer: If it happens it |
264 | * was flushed before, then the number of elements could be | 484 | * was flushed before, then the number of elements could be |
@@ -269,21 +489,19 @@ u64 irq_timings_next_event(u64 now) | |||
269 | * type but with the cost of extra computation in the | 489 | * type but with the cost of extra computation in the |
270 | * interrupt handler hot path. We choose efficiency. | 490 | * interrupt handler hot path. We choose efficiency. |
271 | * | 491 | * |
272 | * Inject measured irq/timestamp to the statistical model | 492 | * Inject measured irq/timestamp to the pattern prediction |
273 | * while decrementing the counter because we consume the data | 493 | * model while decrementing the counter because we consume the |
274 | * from our circular buffer. | 494 | * data from our circular buffer. |
275 | */ | 495 | */ |
276 | for (i = irqts->count & IRQ_TIMINGS_MASK, | ||
277 | irqts->count = min(IRQ_TIMINGS_SIZE, irqts->count); | ||
278 | irqts->count > 0; irqts->count--, i = (i + 1) & IRQ_TIMINGS_MASK) { | ||
279 | 496 | ||
280 | irq = irq_timing_decode(irqts->values[i], &ts); | 497 | i = (irqts->count & IRQ_TIMINGS_MASK) - 1; |
498 | irqts->count = min(IRQ_TIMINGS_SIZE, irqts->count); | ||
281 | 499 | ||
500 | for (; irqts->count > 0; irqts->count--, i = (i + 1) & IRQ_TIMINGS_MASK) { | ||
501 | irq = irq_timing_decode(irqts->values[i], &ts); | ||
282 | s = idr_find(&irqt_stats, irq); | 502 | s = idr_find(&irqt_stats, irq); |
283 | if (s) { | 503 | if (s) |
284 | irqs = this_cpu_ptr(s); | 504 | irq_timings_store(irq, this_cpu_ptr(s), ts); |
285 | irqs_update(irqs, ts); | ||
286 | } | ||
287 | } | 505 | } |
288 | 506 | ||
289 | /* | 507 | /* |
@@ -294,26 +512,12 @@ u64 irq_timings_next_event(u64 now) | |||
294 | 512 | ||
295 | irqs = this_cpu_ptr(s); | 513 | irqs = this_cpu_ptr(s); |
296 | 514 | ||
297 | if (!irqs->valid) | 515 | ts = __irq_timings_next_event(irqs, i, now); |
298 | continue; | 516 | if (ts <= now) |
517 | return now; | ||
299 | 518 | ||
300 | if (irqs->next_evt <= now) { | 519 | if (ts < next_evt) |
301 | irq = i; | 520 | next_evt = ts; |
302 | next_evt = now; | ||
303 | |||
304 | /* | ||
305 | * This interrupt mustn't use in the future | ||
306 | * until new events occur and update the | ||
307 | * statistics. | ||
308 | */ | ||
309 | irqs->valid = 0; | ||
310 | break; | ||
311 | } | ||
312 | |||
313 | if (irqs->next_evt < next_evt) { | ||
314 | irq = i; | ||
315 | next_evt = irqs->next_evt; | ||
316 | } | ||
317 | } | 521 | } |
318 | 522 | ||
319 | return next_evt; | 523 | return next_evt; |
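
To make the prediction pipeline described in the comment above easier to follow, here is a standalone userspace sketch covering the same three steps: ilog2 bucketing, a per-slot exponential moving average, and the triple-repetition suffix search. The constants mirror the patch (alpha = 64/128, period between 2 and 5); the sample intervals, helper names and fixed-size arrays are illustrative assumptions, not the in-kernel code.

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define ALPHA_VAL	64
#define ALPHA_SHIFT	7
#define PERIOD_MIN	2
#define PERIOD_MAX	5

static uint64_t ema_new(uint64_t value, uint64_t ema_old)
{
	int64_t diff;

	/* First sample: seed the average with the value itself. */
	if (!ema_old)
		return value;

	diff = (int64_t)value - (int64_t)ema_old;
	return ema_old + ((diff * ALPHA_VAL) >> ALPHA_SHIFT);
}

static int ilog2_u64(uint64_t v)
{
	int i = -1;

	while (v) {
		v >>= 1;
		i++;
	}
	return i;
}

/*
 * Look for a suffix of length PERIOD_MAX down to PERIOD_MIN repeating
 * three times at the end of the buffer; the caller guarantees that
 * len >= 3 * period_max. Return the predicted next slot index, or -1
 * if no repeating pattern is found.
 */
static int next_event_index(const int *buf, int len, int period_max)
{
	for (int i = period_max; i >= PERIOD_MIN; i--) {
		const int *begin = &buf[len - (i * 3)];
		const int *ptr = begin;

		while (!memcmp(ptr, begin, i * sizeof(*ptr))) {
			ptr += i;
			if (ptr >= &buf[len])
				return begin[0];
		}
	}
	return -1;
}

int main(void)
{
	/* Made-up nanosecond intervals: periodic bursts with long gaps. */
	uint64_t intervals[] = { 223947, 1240, 1384, 1386, 1386,
				 217416, 1236, 1384, 1386, 1387,
				 214719, 1241, 1386, 1387, 1384 };
	int n = sizeof(intervals) / sizeof(intervals[0]);
	uint64_t ema[64] = { 0 };
	int slots[64];

	for (int k = 0; k < n; k++) {
		int i = ilog2_u64(intervals[k]);

		ema[i] = ema_new(intervals[k], ema[i]);
		slots[k] = i;
	}

	int found = next_event_index(slots, n, PERIOD_MAX);

	if (found < 0)
		printf("no repeating pattern\n");
	else
		printf("pattern found, next interval ~%llu ns\n",
		       (unsigned long long)ema[found]);
	return 0;
}

Run against the made-up data, this finds the period-5 pattern and prints the EMA stored in the long-gap slot, mirroring what __irq_timings_next_event() returns as irqs->ema_time[index].
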
diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 6b7cdf17ccf8..73288914ed5e 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c | |||
@@ -56,61 +56,70 @@ void __weak arch_irq_work_raise(void) | |||
56 | */ | 56 | */ |
57 | } | 57 | } |
58 | 58 | ||
59 | /* | 59 | /* Enqueue on current CPU, work must already be claimed and preempt disabled */ |
60 | * Enqueue the irq_work @work on @cpu unless it's already pending | 60 | static void __irq_work_queue_local(struct irq_work *work) |
61 | * somewhere. | ||
62 | * | ||
63 | * Can be re-enqueued while the callback is still in progress. | ||
64 | */ | ||
65 | bool irq_work_queue_on(struct irq_work *work, int cpu) | ||
66 | { | 61 | { |
67 | /* All work should have been flushed before going offline */ | 62 | /* If the work is "lazy", handle it from next tick if any */ |
68 | WARN_ON_ONCE(cpu_is_offline(cpu)); | 63 | if (work->flags & IRQ_WORK_LAZY) { |
69 | 64 | if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) && | |
70 | #ifdef CONFIG_SMP | 65 | tick_nohz_tick_stopped()) |
71 | 66 | arch_irq_work_raise(); | |
72 | /* Arch remote IPI send/receive backend aren't NMI safe */ | 67 | } else { |
73 | WARN_ON_ONCE(in_nmi()); | 68 | if (llist_add(&work->llnode, this_cpu_ptr(&raised_list))) |
69 | arch_irq_work_raise(); | ||
70 | } | ||
71 | } | ||
74 | 72 | ||
73 | /* Enqueue the irq work @work on the current CPU */ | ||
74 | bool irq_work_queue(struct irq_work *work) | ||
75 | { | ||
75 | /* Only queue if not already pending */ | 76 | /* Only queue if not already pending */ |
76 | if (!irq_work_claim(work)) | 77 | if (!irq_work_claim(work)) |
77 | return false; | 78 | return false; |
78 | 79 | ||
79 | if (llist_add(&work->llnode, &per_cpu(raised_list, cpu))) | 80 | /* Queue the entry and raise the IPI if needed. */ |
80 | arch_send_call_function_single_ipi(cpu); | 81 | preempt_disable(); |
81 | 82 | __irq_work_queue_local(work); | |
82 | #else /* #ifdef CONFIG_SMP */ | 83 | preempt_enable(); |
83 | irq_work_queue(work); | ||
84 | #endif /* #else #ifdef CONFIG_SMP */ | ||
85 | 84 | ||
86 | return true; | 85 | return true; |
87 | } | 86 | } |
87 | EXPORT_SYMBOL_GPL(irq_work_queue); | ||
88 | 88 | ||
89 | /* Enqueue the irq work @work on the current CPU */ | 89 | /* |
90 | bool irq_work_queue(struct irq_work *work) | 90 | * Enqueue the irq_work @work on @cpu unless it's already pending |
91 | * somewhere. | ||
92 | * | ||
93 | * Can be re-enqueued while the callback is still in progress. | ||
94 | */ | ||
95 | bool irq_work_queue_on(struct irq_work *work, int cpu) | ||
91 | { | 96 | { |
97 | #ifndef CONFIG_SMP | ||
98 | return irq_work_queue(work); | ||
99 | |||
100 | #else /* CONFIG_SMP: */ | ||
101 | /* All work should have been flushed before going offline */ | ||
102 | WARN_ON_ONCE(cpu_is_offline(cpu)); | ||
103 | |||
92 | /* Only queue if not already pending */ | 104 | /* Only queue if not already pending */ |
93 | if (!irq_work_claim(work)) | 105 | if (!irq_work_claim(work)) |
94 | return false; | 106 | return false; |
95 | 107 | ||
96 | /* Queue the entry and raise the IPI if needed. */ | ||
97 | preempt_disable(); | 108 | preempt_disable(); |
98 | 109 | if (cpu != smp_processor_id()) { | |
99 | /* If the work is "lazy", handle it from next tick if any */ | 110 | /* Arch remote IPI send/receive backends aren't NMI safe */ |
100 | if (work->flags & IRQ_WORK_LAZY) { | 111 | WARN_ON_ONCE(in_nmi()); |
101 | if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) && | 112 | if (llist_add(&work->llnode, &per_cpu(raised_list, cpu))) |
102 | tick_nohz_tick_stopped()) | 113 | arch_send_call_function_single_ipi(cpu); |
103 | arch_irq_work_raise(); | ||
104 | } else { | 114 | } else { |
105 | if (llist_add(&work->llnode, this_cpu_ptr(&raised_list))) | 115 | __irq_work_queue_local(work); |
106 | arch_irq_work_raise(); | ||
107 | } | 116 | } |
108 | |||
109 | preempt_enable(); | 117 | preempt_enable(); |
110 | 118 | ||
111 | return true; | 119 | return true; |
120 | #endif /* CONFIG_SMP */ | ||
112 | } | 121 | } |
113 | EXPORT_SYMBOL_GPL(irq_work_queue); | 122 | |
114 | 123 | ||
115 | bool irq_work_needs_cpu(void) | 124 | bool irq_work_needs_cpu(void) |
116 | { | 125 | { |
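
As a reading aid for the reworked queueing paths above, here is a minimal, hypothetical module-style sketch of the irq_work API this file exposes; the callback and names are invented. With this change, irq_work_queue_on() targeting the current CPU simply takes the same local path as irq_work_queue().

#include <linux/init.h>
#include <linux/irq_work.h>
#include <linux/module.h>
#include <linux/printk.h>
#include <linux/smp.h>

static void demo_irq_work_fn(struct irq_work *work)
{
	/* Runs in hard interrupt context once the IPI or tick fires. */
	pr_info("irq_work ran on CPU %d\n", smp_processor_id());
}

static struct irq_work demo_work;

static int __init demo_init(void)
{
	init_irq_work(&demo_work, demo_irq_work_fn);

	/* Queue on the local CPU; returns false if already pending. */
	irq_work_queue(&demo_work);

	/*
	 * Queue on CPU 0 (returns false here if the first queueing is
	 * still pending). When CPU 0 is the current CPU, the patched
	 * code no longer sends an IPI and just uses the local lists.
	 */
	irq_work_queue_on(&demo_work, 0);

	return 0;
}

static void __exit demo_exit(void)
{
	/* Wait for a possibly pending execution before unloading. */
	irq_work_sync(&demo_work);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
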
diff --git a/kernel/jump_label.c b/kernel/jump_label.c index bad96b476eb6..de6efdecc70d 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c | |||
@@ -202,11 +202,13 @@ void static_key_disable(struct static_key *key) | |||
202 | } | 202 | } |
203 | EXPORT_SYMBOL_GPL(static_key_disable); | 203 | EXPORT_SYMBOL_GPL(static_key_disable); |
204 | 204 | ||
205 | static void __static_key_slow_dec_cpuslocked(struct static_key *key, | 205 | static bool static_key_slow_try_dec(struct static_key *key) |
206 | unsigned long rate_limit, | ||
207 | struct delayed_work *work) | ||
208 | { | 206 | { |
209 | lockdep_assert_cpus_held(); | 207 | int val; |
208 | |||
209 | val = atomic_fetch_add_unless(&key->enabled, -1, 1); | ||
210 | if (val == 1) | ||
211 | return false; | ||
210 | 212 | ||
211 | /* | 213 | /* |
212 | * The negative count check is valid even when a negative | 214 | * The negative count check is valid even when a negative |
@@ -215,63 +217,70 @@ static void __static_key_slow_dec_cpuslocked(struct static_key *key, | |||
215 | * returns is unbalanced, because all other static_key_slow_inc() | 217 | * returns is unbalanced, because all other static_key_slow_inc() |
216 | * instances block while the update is in progress. | 218 | * instances block while the update is in progress. |
217 | */ | 219 | */ |
218 | if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) { | 220 | WARN(val < 0, "jump label: negative count!\n"); |
219 | WARN(atomic_read(&key->enabled) < 0, | 221 | return true; |
220 | "jump label: negative count!\n"); | 222 | } |
223 | |||
224 | static void __static_key_slow_dec_cpuslocked(struct static_key *key) | ||
225 | { | ||
226 | lockdep_assert_cpus_held(); | ||
227 | |||
228 | if (static_key_slow_try_dec(key)) | ||
221 | return; | 229 | return; |
222 | } | ||
223 | 230 | ||
224 | if (rate_limit) { | 231 | jump_label_lock(); |
225 | atomic_inc(&key->enabled); | 232 | if (atomic_dec_and_test(&key->enabled)) |
226 | schedule_delayed_work(work, rate_limit); | ||
227 | } else { | ||
228 | jump_label_update(key); | 233 | jump_label_update(key); |
229 | } | ||
230 | jump_label_unlock(); | 234 | jump_label_unlock(); |
231 | } | 235 | } |
232 | 236 | ||
233 | static void __static_key_slow_dec(struct static_key *key, | 237 | static void __static_key_slow_dec(struct static_key *key) |
234 | unsigned long rate_limit, | ||
235 | struct delayed_work *work) | ||
236 | { | 238 | { |
237 | cpus_read_lock(); | 239 | cpus_read_lock(); |
238 | __static_key_slow_dec_cpuslocked(key, rate_limit, work); | 240 | __static_key_slow_dec_cpuslocked(key); |
239 | cpus_read_unlock(); | 241 | cpus_read_unlock(); |
240 | } | 242 | } |
241 | 243 | ||
242 | static void jump_label_update_timeout(struct work_struct *work) | 244 | void jump_label_update_timeout(struct work_struct *work) |
243 | { | 245 | { |
244 | struct static_key_deferred *key = | 246 | struct static_key_deferred *key = |
245 | container_of(work, struct static_key_deferred, work.work); | 247 | container_of(work, struct static_key_deferred, work.work); |
246 | __static_key_slow_dec(&key->key, 0, NULL); | 248 | __static_key_slow_dec(&key->key); |
247 | } | 249 | } |
250 | EXPORT_SYMBOL_GPL(jump_label_update_timeout); | ||
248 | 251 | ||
249 | void static_key_slow_dec(struct static_key *key) | 252 | void static_key_slow_dec(struct static_key *key) |
250 | { | 253 | { |
251 | STATIC_KEY_CHECK_USE(key); | 254 | STATIC_KEY_CHECK_USE(key); |
252 | __static_key_slow_dec(key, 0, NULL); | 255 | __static_key_slow_dec(key); |
253 | } | 256 | } |
254 | EXPORT_SYMBOL_GPL(static_key_slow_dec); | 257 | EXPORT_SYMBOL_GPL(static_key_slow_dec); |
255 | 258 | ||
256 | void static_key_slow_dec_cpuslocked(struct static_key *key) | 259 | void static_key_slow_dec_cpuslocked(struct static_key *key) |
257 | { | 260 | { |
258 | STATIC_KEY_CHECK_USE(key); | 261 | STATIC_KEY_CHECK_USE(key); |
259 | __static_key_slow_dec_cpuslocked(key, 0, NULL); | 262 | __static_key_slow_dec_cpuslocked(key); |
260 | } | 263 | } |
261 | 264 | ||
262 | void static_key_slow_dec_deferred(struct static_key_deferred *key) | 265 | void __static_key_slow_dec_deferred(struct static_key *key, |
266 | struct delayed_work *work, | ||
267 | unsigned long timeout) | ||
263 | { | 268 | { |
264 | STATIC_KEY_CHECK_USE(key); | 269 | STATIC_KEY_CHECK_USE(key); |
265 | __static_key_slow_dec(&key->key, key->timeout, &key->work); | 270 | |
271 | if (static_key_slow_try_dec(key)) | ||
272 | return; | ||
273 | |||
274 | schedule_delayed_work(work, timeout); | ||
266 | } | 275 | } |
267 | EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred); | 276 | EXPORT_SYMBOL_GPL(__static_key_slow_dec_deferred); |
268 | 277 | ||
269 | void static_key_deferred_flush(struct static_key_deferred *key) | 278 | void __static_key_deferred_flush(void *key, struct delayed_work *work) |
270 | { | 279 | { |
271 | STATIC_KEY_CHECK_USE(key); | 280 | STATIC_KEY_CHECK_USE(key); |
272 | flush_delayed_work(&key->work); | 281 | flush_delayed_work(work); |
273 | } | 282 | } |
274 | EXPORT_SYMBOL_GPL(static_key_deferred_flush); | 283 | EXPORT_SYMBOL_GPL(__static_key_deferred_flush); |
275 | 284 | ||
276 | void jump_label_rate_limit(struct static_key_deferred *key, | 285 | void jump_label_rate_limit(struct static_key_deferred *key, |
277 | unsigned long rl) | 286 | unsigned long rl) |
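
The refactoring above only changes the internals; a deferred key is still used the same way from the caller's side. A minimal, hypothetical sketch (names invented) showing where the rate limit and the deferred decrement come in:

#include <linux/jiffies.h>
#include <linux/jump_label.h>
#include <linux/jump_label_ratelimit.h>

static struct static_key_deferred demo_key;

static void demo_setup(void)
{
	/* Let decrements coalesce: patch the code at most once per second. */
	jump_label_rate_limit(&demo_key, HZ);
}

static bool demo_feature_active(void)
{
	/* Reader side: a no-op branch patched in/out by the key. */
	return static_key_false(&demo_key.key);
}

static void demo_feature_on(void)
{
	static_key_slow_inc(&demo_key.key);
}

static void demo_feature_off(void)
{
	/*
	 * With static_key_slow_try_dec(), the delayed work is only
	 * scheduled when this decrement would bring the count to zero.
	 */
	static_key_slow_dec_deferred(&demo_key);
}
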
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index d7140447be75..fd5c95ff9251 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c | |||
@@ -1150,7 +1150,7 @@ int kernel_kexec(void) | |||
1150 | error = dpm_suspend_end(PMSG_FREEZE); | 1150 | error = dpm_suspend_end(PMSG_FREEZE); |
1151 | if (error) | 1151 | if (error) |
1152 | goto Resume_devices; | 1152 | goto Resume_devices; |
1153 | error = disable_nonboot_cpus(); | 1153 | error = suspend_disable_secondary_cpus(); |
1154 | if (error) | 1154 | if (error) |
1155 | goto Enable_cpus; | 1155 | goto Enable_cpus; |
1156 | local_irq_disable(); | 1156 | local_irq_disable(); |
@@ -1183,7 +1183,7 @@ int kernel_kexec(void) | |||
1183 | Enable_irqs: | 1183 | Enable_irqs: |
1184 | local_irq_enable(); | 1184 | local_irq_enable(); |
1185 | Enable_cpus: | 1185 | Enable_cpus: |
1186 | enable_nonboot_cpus(); | 1186 | suspend_enable_secondary_cpus(); |
1187 | dpm_resume_start(PMSG_RESTORE); | 1187 | dpm_resume_start(PMSG_RESTORE); |
1188 | Resume_devices: | 1188 | Resume_devices: |
1189 | dpm_resume_end(PMSG_RESTORE); | 1189 | dpm_resume_end(PMSG_RESTORE); |
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index f1d0e00a3971..f7fb8f6a688f 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c | |||
@@ -688,7 +688,6 @@ static int kexec_calculate_store_digests(struct kimage *image) | |||
688 | goto out_free_desc; | 688 | goto out_free_desc; |
689 | 689 | ||
690 | desc->tfm = tfm; | 690 | desc->tfm = tfm; |
691 | desc->flags = 0; | ||
692 | 691 | ||
693 | ret = crypto_shash_init(desc); | 692 | ret = crypto_shash_init(desc); |
694 | if (ret < 0) | 693 | if (ret < 0) |
diff --git a/kernel/kheaders.c b/kernel/kheaders.c new file mode 100644 index 000000000000..70ae6052920d --- /dev/null +++ b/kernel/kheaders.c | |||
@@ -0,0 +1,74 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | /* | ||
3 | * Provide kernel headers useful to build tracing programs | ||
4 | * such as for running eBPF tracing tools. | ||
5 | * | ||
6 | * (Borrowed code from kernel/configs.c) | ||
7 | */ | ||
8 | |||
9 | #include <linux/kernel.h> | ||
10 | #include <linux/module.h> | ||
11 | #include <linux/proc_fs.h> | ||
12 | #include <linux/init.h> | ||
13 | #include <linux/uaccess.h> | ||
14 | |||
15 | /* | ||
16 | * Define kernel_headers_data and kernel_headers_data_end, within which the | ||
17 | * compressed kernel headers are stored. The file is first compressed with xz. | ||
18 | */ | ||
19 | |||
20 | asm ( | ||
21 | " .pushsection .rodata, \"a\" \n" | ||
22 | " .global kernel_headers_data \n" | ||
23 | "kernel_headers_data: \n" | ||
24 | " .incbin \"kernel/kheaders_data.tar.xz\" \n" | ||
25 | " .global kernel_headers_data_end \n" | ||
26 | "kernel_headers_data_end: \n" | ||
27 | " .popsection \n" | ||
28 | ); | ||
29 | |||
30 | extern char kernel_headers_data; | ||
31 | extern char kernel_headers_data_end; | ||
32 | |||
33 | static ssize_t | ||
34 | ikheaders_read_current(struct file *file, char __user *buf, | ||
35 | size_t len, loff_t *offset) | ||
36 | { | ||
37 | return simple_read_from_buffer(buf, len, offset, | ||
38 | &kernel_headers_data, | ||
39 | &kernel_headers_data_end - | ||
40 | &kernel_headers_data); | ||
41 | } | ||
42 | |||
43 | static const struct file_operations ikheaders_file_ops = { | ||
44 | .read = ikheaders_read_current, | ||
45 | .llseek = default_llseek, | ||
46 | }; | ||
47 | |||
48 | static int __init ikheaders_init(void) | ||
49 | { | ||
50 | struct proc_dir_entry *entry; | ||
51 | |||
52 | /* create the current headers file */ | ||
53 | entry = proc_create("kheaders.tar.xz", S_IRUGO, NULL, | ||
54 | &ikheaders_file_ops); | ||
55 | if (!entry) | ||
56 | return -ENOMEM; | ||
57 | |||
58 | proc_set_size(entry, | ||
59 | &kernel_headers_data_end - | ||
60 | &kernel_headers_data); | ||
61 | return 0; | ||
62 | } | ||
63 | |||
64 | static void __exit ikheaders_cleanup(void) | ||
65 | { | ||
66 | remove_proc_entry("kheaders.tar.xz", NULL); | ||
67 | } | ||
68 | |||
69 | module_init(ikheaders_init); | ||
70 | module_exit(ikheaders_cleanup); | ||
71 | |||
72 | MODULE_LICENSE("GPL v2"); | ||
73 | MODULE_AUTHOR("Joel Fernandes"); | ||
74 | MODULE_DESCRIPTION("Echo the kernel header artifacts used to build the kernel"); | ||
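
For completeness, a small userspace sketch of how the new proc file could be consumed: it copies /proc/kheaders.tar.xz (the name registered by proc_create() above) to disk, where it can be unpacked with xz and tar. Only the file name is taken from the patch; error handling is deliberately minimal.

#include <stdio.h>

int main(void)
{
	FILE *in = fopen("/proc/kheaders.tar.xz", "rb");
	FILE *out = fopen("kheaders.tar.xz", "wb");
	char buf[4096];
	size_t n;

	if (!in || !out) {
		perror("fopen");
		return 1;
	}

	/* Stream the compressed archive out of procfs. */
	while ((n = fread(buf, 1, sizeof(buf), in)) > 0)
		fwrite(buf, 1, n, out);

	fclose(in);
	fclose(out);
	return 0;
}
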
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index c83e54727131..b1ea30a5540e 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
@@ -709,7 +709,6 @@ static void unoptimize_kprobe(struct kprobe *p, bool force) | |||
709 | static int reuse_unused_kprobe(struct kprobe *ap) | 709 | static int reuse_unused_kprobe(struct kprobe *ap) |
710 | { | 710 | { |
711 | struct optimized_kprobe *op; | 711 | struct optimized_kprobe *op; |
712 | int ret; | ||
713 | 712 | ||
714 | /* | 713 | /* |
715 | * Unused kprobe MUST be on the way of delayed unoptimizing (means | 714 | * Unused kprobe MUST be on the way of delayed unoptimizing (means |
@@ -720,9 +719,8 @@ static int reuse_unused_kprobe(struct kprobe *ap) | |||
720 | /* Enable the probe again */ | 719 | /* Enable the probe again */ |
721 | ap->flags &= ~KPROBE_FLAG_DISABLED; | 720 | ap->flags &= ~KPROBE_FLAG_DISABLED; |
722 | /* Optimize it again (remove from op->list) */ | 721 | /* Optimize it again (remove from op->list) */ |
723 | ret = kprobe_optready(ap); | 722 | if (!kprobe_optready(ap)) |
724 | if (ret) | 723 | return -EINVAL; |
725 | return ret; | ||
726 | 724 | ||
727 | optimize_kprobe(ap); | 725 | optimize_kprobe(ap); |
728 | return 0; | 726 | return 0; |
diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 96b4179cee6a..99a5b5f46dc5 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c | |||
@@ -120,8 +120,8 @@ account_global_scheduler_latency(struct task_struct *tsk, | |||
120 | break; | 120 | break; |
121 | } | 121 | } |
122 | 122 | ||
123 | /* 0 and ULONG_MAX entries mean end of backtrace: */ | 123 | /* 0 entry marks end of backtrace: */ |
124 | if (record == 0 || record == ULONG_MAX) | 124 | if (!record) |
125 | break; | 125 | break; |
126 | } | 126 | } |
127 | if (same) { | 127 | if (same) { |
@@ -141,20 +141,6 @@ account_global_scheduler_latency(struct task_struct *tsk, | |||
141 | memcpy(&latency_record[i], lat, sizeof(struct latency_record)); | 141 | memcpy(&latency_record[i], lat, sizeof(struct latency_record)); |
142 | } | 142 | } |
143 | 143 | ||
144 | /* | ||
145 | * Iterator to store a backtrace into a latency record entry | ||
146 | */ | ||
147 | static inline void store_stacktrace(struct task_struct *tsk, | ||
148 | struct latency_record *lat) | ||
149 | { | ||
150 | struct stack_trace trace; | ||
151 | |||
152 | memset(&trace, 0, sizeof(trace)); | ||
153 | trace.max_entries = LT_BACKTRACEDEPTH; | ||
154 | trace.entries = &lat->backtrace[0]; | ||
155 | save_stack_trace_tsk(tsk, &trace); | ||
156 | } | ||
157 | |||
158 | /** | 144 | /** |
159 | * __account_scheduler_latency - record an occurred latency | 145 | * __account_scheduler_latency - record an occurred latency |
160 | * @tsk - the task struct of the task hitting the latency | 146 | * @tsk - the task struct of the task hitting the latency |
@@ -191,7 +177,8 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) | |||
191 | lat.count = 1; | 177 | lat.count = 1; |
192 | lat.time = usecs; | 178 | lat.time = usecs; |
193 | lat.max = usecs; | 179 | lat.max = usecs; |
194 | store_stacktrace(tsk, &lat); | 180 | |
181 | stack_trace_save_tsk(tsk, lat.backtrace, LT_BACKTRACEDEPTH, 0); | ||
195 | 182 | ||
196 | raw_spin_lock_irqsave(&latency_lock, flags); | 183 | raw_spin_lock_irqsave(&latency_lock, flags); |
197 | 184 | ||
@@ -210,8 +197,8 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) | |||
210 | break; | 197 | break; |
211 | } | 198 | } |
212 | 199 | ||
213 | /* 0 and ULONG_MAX entries mean end of backtrace: */ | 200 | /* 0 entry is end of backtrace */ |
214 | if (record == 0 || record == ULONG_MAX) | 201 | if (!record) |
215 | break; | 202 | break; |
216 | } | 203 | } |
217 | if (same) { | 204 | if (same) { |
@@ -252,10 +239,10 @@ static int lstats_show(struct seq_file *m, void *v) | |||
252 | lr->count, lr->time, lr->max); | 239 | lr->count, lr->time, lr->max); |
253 | for (q = 0; q < LT_BACKTRACEDEPTH; q++) { | 240 | for (q = 0; q < LT_BACKTRACEDEPTH; q++) { |
254 | unsigned long bt = lr->backtrace[q]; | 241 | unsigned long bt = lr->backtrace[q]; |
242 | |||
255 | if (!bt) | 243 | if (!bt) |
256 | break; | 244 | break; |
257 | if (bt == ULONG_MAX) | 245 | |
258 | break; | ||
259 | seq_printf(m, " %ps", (void *)bt); | 246 | seq_printf(m, " %ps", (void *)bt); |
260 | } | 247 | } |
261 | seq_puts(m, "\n"); | 248 | seq_puts(m, "\n"); |
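
The conversion above relies on the new stack trace API returning the number of saved entries directly, which is why the 0/ULONG_MAX end-marker checks can go away. A hedged sketch of the calling convention (helper name and depth are invented):

#include <linux/printk.h>
#include <linux/sched.h>
#include <linux/stacktrace.h>

#define DEMO_DEPTH	12

static void demo_print_task_stack(struct task_struct *tsk)
{
	unsigned long entries[DEMO_DEPTH];
	unsigned int i, nr;

	/* skipnr = 0: keep all leading entries. */
	nr = stack_trace_save_tsk(tsk, entries, DEMO_DEPTH, 0);

	/* 'nr' bounds the loop; no end-of-trace sentinel is needed. */
	for (i = 0; i < nr; i++)
		pr_info("  %pS\n", (void *)entries[i]);
}
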
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index eb0ee10a1981..f6fbaff10e71 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c | |||
@@ -419,6 +419,7 @@ static struct attribute *klp_patch_attrs[] = { | |||
419 | &force_kobj_attr.attr, | 419 | &force_kobj_attr.attr, |
420 | NULL | 420 | NULL |
421 | }; | 421 | }; |
422 | ATTRIBUTE_GROUPS(klp_patch); | ||
422 | 423 | ||
423 | static void klp_free_object_dynamic(struct klp_object *obj) | 424 | static void klp_free_object_dynamic(struct klp_object *obj) |
424 | { | 425 | { |
@@ -426,7 +427,13 @@ static void klp_free_object_dynamic(struct klp_object *obj) | |||
426 | kfree(obj); | 427 | kfree(obj); |
427 | } | 428 | } |
428 | 429 | ||
429 | static struct klp_object *klp_alloc_object_dynamic(const char *name) | 430 | static void klp_init_func_early(struct klp_object *obj, |
431 | struct klp_func *func); | ||
432 | static void klp_init_object_early(struct klp_patch *patch, | ||
433 | struct klp_object *obj); | ||
434 | |||
435 | static struct klp_object *klp_alloc_object_dynamic(const char *name, | ||
436 | struct klp_patch *patch) | ||
430 | { | 437 | { |
431 | struct klp_object *obj; | 438 | struct klp_object *obj; |
432 | 439 | ||
@@ -442,7 +449,7 @@ static struct klp_object *klp_alloc_object_dynamic(const char *name) | |||
442 | } | 449 | } |
443 | } | 450 | } |
444 | 451 | ||
445 | INIT_LIST_HEAD(&obj->func_list); | 452 | klp_init_object_early(patch, obj); |
446 | obj->dynamic = true; | 453 | obj->dynamic = true; |
447 | 454 | ||
448 | return obj; | 455 | return obj; |
@@ -471,6 +478,7 @@ static struct klp_func *klp_alloc_func_nop(struct klp_func *old_func, | |||
471 | } | 478 | } |
472 | } | 479 | } |
473 | 480 | ||
481 | klp_init_func_early(obj, func); | ||
474 | /* | 482 | /* |
475 | * func->new_func is same as func->old_func. These addresses are | 483 | * func->new_func is same as func->old_func. These addresses are |
476 | * set when the object is loaded, see klp_init_object_loaded(). | 484 | * set when the object is loaded, see klp_init_object_loaded(). |
@@ -490,11 +498,9 @@ static int klp_add_object_nops(struct klp_patch *patch, | |||
490 | obj = klp_find_object(patch, old_obj); | 498 | obj = klp_find_object(patch, old_obj); |
491 | 499 | ||
492 | if (!obj) { | 500 | if (!obj) { |
493 | obj = klp_alloc_object_dynamic(old_obj->name); | 501 | obj = klp_alloc_object_dynamic(old_obj->name, patch); |
494 | if (!obj) | 502 | if (!obj) |
495 | return -ENOMEM; | 503 | return -ENOMEM; |
496 | |||
497 | list_add_tail(&obj->node, &patch->obj_list); | ||
498 | } | 504 | } |
499 | 505 | ||
500 | klp_for_each_func(old_obj, old_func) { | 506 | klp_for_each_func(old_obj, old_func) { |
@@ -505,8 +511,6 @@ static int klp_add_object_nops(struct klp_patch *patch, | |||
505 | func = klp_alloc_func_nop(old_func, obj); | 511 | func = klp_alloc_func_nop(old_func, obj); |
506 | if (!func) | 512 | if (!func) |
507 | return -ENOMEM; | 513 | return -ENOMEM; |
508 | |||
509 | list_add_tail(&func->node, &obj->func_list); | ||
510 | } | 514 | } |
511 | 515 | ||
512 | return 0; | 516 | return 0; |
@@ -546,7 +550,7 @@ static void klp_kobj_release_patch(struct kobject *kobj) | |||
546 | static struct kobj_type klp_ktype_patch = { | 550 | static struct kobj_type klp_ktype_patch = { |
547 | .release = klp_kobj_release_patch, | 551 | .release = klp_kobj_release_patch, |
548 | .sysfs_ops = &kobj_sysfs_ops, | 552 | .sysfs_ops = &kobj_sysfs_ops, |
549 | .default_attrs = klp_patch_attrs, | 553 | .default_groups = klp_patch_groups, |
550 | }; | 554 | }; |
551 | 555 | ||
552 | static void klp_kobj_release_object(struct kobject *kobj) | 556 | static void klp_kobj_release_object(struct kobject *kobj) |
@@ -588,13 +592,7 @@ static void __klp_free_funcs(struct klp_object *obj, bool nops_only) | |||
588 | continue; | 592 | continue; |
589 | 593 | ||
590 | list_del(&func->node); | 594 | list_del(&func->node); |
591 | 595 | kobject_put(&func->kobj); | |
592 | /* Might be called from klp_init_patch() error path. */ | ||
593 | if (func->kobj_added) { | ||
594 | kobject_put(&func->kobj); | ||
595 | } else if (func->nop) { | ||
596 | klp_free_func_nop(func); | ||
597 | } | ||
598 | } | 596 | } |
599 | } | 597 | } |
600 | 598 | ||
@@ -624,13 +622,7 @@ static void __klp_free_objects(struct klp_patch *patch, bool nops_only) | |||
624 | continue; | 622 | continue; |
625 | 623 | ||
626 | list_del(&obj->node); | 624 | list_del(&obj->node); |
627 | 625 | kobject_put(&obj->kobj); | |
628 | /* Might be called from klp_init_patch() error path. */ | ||
629 | if (obj->kobj_added) { | ||
630 | kobject_put(&obj->kobj); | ||
631 | } else if (obj->dynamic) { | ||
632 | klp_free_object_dynamic(obj); | ||
633 | } | ||
634 | } | 626 | } |
635 | } | 627 | } |
636 | 628 | ||
@@ -675,10 +667,8 @@ static void klp_free_patch_finish(struct klp_patch *patch) | |||
675 | * this is called when the patch gets disabled and it | 667 | * this is called when the patch gets disabled and it |
676 | * cannot get enabled again. | 668 | * cannot get enabled again. |
677 | */ | 669 | */ |
678 | if (patch->kobj_added) { | 670 | kobject_put(&patch->kobj); |
679 | kobject_put(&patch->kobj); | 671 | wait_for_completion(&patch->finish); |
680 | wait_for_completion(&patch->finish); | ||
681 | } | ||
682 | 672 | ||
683 | /* Put the module after the last access to struct klp_patch. */ | 673 | /* Put the module after the last access to struct klp_patch. */ |
684 | if (!patch->forced) | 674 | if (!patch->forced) |
@@ -700,8 +690,6 @@ static void klp_free_patch_work_fn(struct work_struct *work) | |||
700 | 690 | ||
701 | static int klp_init_func(struct klp_object *obj, struct klp_func *func) | 691 | static int klp_init_func(struct klp_object *obj, struct klp_func *func) |
702 | { | 692 | { |
703 | int ret; | ||
704 | |||
705 | if (!func->old_name) | 693 | if (!func->old_name) |
706 | return -EINVAL; | 694 | return -EINVAL; |
707 | 695 | ||
@@ -724,13 +712,9 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func) | |||
724 | * object. If the user selects 0 for old_sympos, then 1 will be used | 712 | * object. If the user selects 0 for old_sympos, then 1 will be used |
725 | * since a unique symbol will be the first occurrence. | 713 | * since a unique symbol will be the first occurrence. |
726 | */ | 714 | */ |
727 | ret = kobject_init_and_add(&func->kobj, &klp_ktype_func, | 715 | return kobject_add(&func->kobj, &obj->kobj, "%s,%lu", |
728 | &obj->kobj, "%s,%lu", func->old_name, | 716 | func->old_name, |
729 | func->old_sympos ? func->old_sympos : 1); | 717 | func->old_sympos ? func->old_sympos : 1); |
730 | if (!ret) | ||
731 | func->kobj_added = true; | ||
732 | |||
733 | return ret; | ||
734 | } | 718 | } |
735 | 719 | ||
736 | /* Arches may override this to finish any remaining arch-specific tasks */ | 720 | /* Arches may override this to finish any remaining arch-specific tasks */ |
@@ -801,11 +785,9 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj) | |||
801 | klp_find_object_module(obj); | 785 | klp_find_object_module(obj); |
802 | 786 | ||
803 | name = klp_is_module(obj) ? obj->name : "vmlinux"; | 787 | name = klp_is_module(obj) ? obj->name : "vmlinux"; |
804 | ret = kobject_init_and_add(&obj->kobj, &klp_ktype_object, | 788 | ret = kobject_add(&obj->kobj, &patch->kobj, "%s", name); |
805 | &patch->kobj, "%s", name); | ||
806 | if (ret) | 789 | if (ret) |
807 | return ret; | 790 | return ret; |
808 | obj->kobj_added = true; | ||
809 | 791 | ||
810 | klp_for_each_func(obj, func) { | 792 | klp_for_each_func(obj, func) { |
811 | ret = klp_init_func(obj, func); | 793 | ret = klp_init_func(obj, func); |
@@ -819,6 +801,21 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj) | |||
819 | return ret; | 801 | return ret; |
820 | } | 802 | } |
821 | 803 | ||
804 | static void klp_init_func_early(struct klp_object *obj, | ||
805 | struct klp_func *func) | ||
806 | { | ||
807 | kobject_init(&func->kobj, &klp_ktype_func); | ||
808 | list_add_tail(&func->node, &obj->func_list); | ||
809 | } | ||
810 | |||
811 | static void klp_init_object_early(struct klp_patch *patch, | ||
812 | struct klp_object *obj) | ||
813 | { | ||
814 | INIT_LIST_HEAD(&obj->func_list); | ||
815 | kobject_init(&obj->kobj, &klp_ktype_object); | ||
816 | list_add_tail(&obj->node, &patch->obj_list); | ||
817 | } | ||
818 | |||
822 | static int klp_init_patch_early(struct klp_patch *patch) | 819 | static int klp_init_patch_early(struct klp_patch *patch) |
823 | { | 820 | { |
824 | struct klp_object *obj; | 821 | struct klp_object *obj; |
@@ -829,7 +826,7 @@ static int klp_init_patch_early(struct klp_patch *patch) | |||
829 | 826 | ||
830 | INIT_LIST_HEAD(&patch->list); | 827 | INIT_LIST_HEAD(&patch->list); |
831 | INIT_LIST_HEAD(&patch->obj_list); | 828 | INIT_LIST_HEAD(&patch->obj_list); |
832 | patch->kobj_added = false; | 829 | kobject_init(&patch->kobj, &klp_ktype_patch); |
833 | patch->enabled = false; | 830 | patch->enabled = false; |
834 | patch->forced = false; | 831 | patch->forced = false; |
835 | INIT_WORK(&patch->free_work, klp_free_patch_work_fn); | 832 | INIT_WORK(&patch->free_work, klp_free_patch_work_fn); |
@@ -839,13 +836,10 @@ static int klp_init_patch_early(struct klp_patch *patch) | |||
839 | if (!obj->funcs) | 836 | if (!obj->funcs) |
840 | return -EINVAL; | 837 | return -EINVAL; |
841 | 838 | ||
842 | INIT_LIST_HEAD(&obj->func_list); | 839 | klp_init_object_early(patch, obj); |
843 | obj->kobj_added = false; | ||
844 | list_add_tail(&obj->node, &patch->obj_list); | ||
845 | 840 | ||
846 | klp_for_each_func_static(obj, func) { | 841 | klp_for_each_func_static(obj, func) { |
847 | func->kobj_added = false; | 842 | klp_init_func_early(obj, func); |
848 | list_add_tail(&func->node, &obj->func_list); | ||
849 | } | 843 | } |
850 | } | 844 | } |
851 | 845 | ||
@@ -860,11 +854,9 @@ static int klp_init_patch(struct klp_patch *patch) | |||
860 | struct klp_object *obj; | 854 | struct klp_object *obj; |
861 | int ret; | 855 | int ret; |
862 | 856 | ||
863 | ret = kobject_init_and_add(&patch->kobj, &klp_ktype_patch, | 857 | ret = kobject_add(&patch->kobj, klp_root_kobj, "%s", patch->mod->name); |
864 | klp_root_kobj, "%s", patch->mod->name); | ||
865 | if (ret) | 858 | if (ret) |
866 | return ret; | 859 | return ret; |
867 | patch->kobj_added = true; | ||
868 | 860 | ||
869 | if (patch->replace) { | 861 | if (patch->replace) { |
870 | ret = klp_add_nops(patch); | 862 | ret = klp_add_nops(patch); |
@@ -926,9 +918,6 @@ static int __klp_enable_patch(struct klp_patch *patch) | |||
926 | if (WARN_ON(patch->enabled)) | 918 | if (WARN_ON(patch->enabled)) |
927 | return -EINVAL; | 919 | return -EINVAL; |
928 | 920 | ||
929 | if (!patch->kobj_added) | ||
930 | return -EINVAL; | ||
931 | |||
932 | pr_notice("enabling patch '%s'\n", patch->mod->name); | 921 | pr_notice("enabling patch '%s'\n", patch->mod->name); |
933 | 922 | ||
934 | klp_init_transition(patch, KLP_PATCHED); | 923 | klp_init_transition(patch, KLP_PATCHED); |
@@ -1003,11 +992,10 @@ int klp_enable_patch(struct klp_patch *patch) | |||
1003 | return -ENODEV; | 992 | return -ENODEV; |
1004 | 993 | ||
1005 | if (!klp_have_reliable_stack()) { | 994 | if (!klp_have_reliable_stack()) { |
1006 | pr_err("This architecture doesn't have support for the livepatch consistency model.\n"); | 995 | pr_warn("This architecture doesn't have support for the livepatch consistency model.\n"); |
1007 | return -EOPNOTSUPP; | 996 | pr_warn("The livepatch transition may never complete.\n"); |
1008 | } | 997 | } |
1009 | 998 | ||
1010 | |||
1011 | mutex_lock(&klp_mutex); | 999 | mutex_lock(&klp_mutex); |
1012 | 1000 | ||
1013 | ret = klp_init_patch_early(patch); | 1001 | ret = klp_init_patch_early(patch); |
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index 9c89ae8b337a..c53370d596be 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c | |||
@@ -202,15 +202,15 @@ void klp_update_patch_state(struct task_struct *task) | |||
202 | * Determine whether the given stack trace includes any references to a | 202 | * Determine whether the given stack trace includes any references to a |
203 | * to-be-patched or to-be-unpatched function. | 203 | * to-be-patched or to-be-unpatched function. |
204 | */ | 204 | */ |
205 | static int klp_check_stack_func(struct klp_func *func, | 205 | static int klp_check_stack_func(struct klp_func *func, unsigned long *entries, |
206 | struct stack_trace *trace) | 206 | unsigned int nr_entries) |
207 | { | 207 | { |
208 | unsigned long func_addr, func_size, address; | 208 | unsigned long func_addr, func_size, address; |
209 | struct klp_ops *ops; | 209 | struct klp_ops *ops; |
210 | int i; | 210 | int i; |
211 | 211 | ||
212 | for (i = 0; i < trace->nr_entries; i++) { | 212 | for (i = 0; i < nr_entries; i++) { |
213 | address = trace->entries[i]; | 213 | address = entries[i]; |
214 | 214 | ||
215 | if (klp_target_state == KLP_UNPATCHED) { | 215 | if (klp_target_state == KLP_UNPATCHED) { |
216 | /* | 216 | /* |
@@ -254,29 +254,25 @@ static int klp_check_stack_func(struct klp_func *func, | |||
254 | static int klp_check_stack(struct task_struct *task, char *err_buf) | 254 | static int klp_check_stack(struct task_struct *task, char *err_buf) |
255 | { | 255 | { |
256 | static unsigned long entries[MAX_STACK_ENTRIES]; | 256 | static unsigned long entries[MAX_STACK_ENTRIES]; |
257 | struct stack_trace trace; | ||
258 | struct klp_object *obj; | 257 | struct klp_object *obj; |
259 | struct klp_func *func; | 258 | struct klp_func *func; |
260 | int ret; | 259 | int ret, nr_entries; |
261 | 260 | ||
262 | trace.skip = 0; | 261 | ret = stack_trace_save_tsk_reliable(task, entries, ARRAY_SIZE(entries)); |
263 | trace.nr_entries = 0; | ||
264 | trace.max_entries = MAX_STACK_ENTRIES; | ||
265 | trace.entries = entries; | ||
266 | ret = save_stack_trace_tsk_reliable(task, &trace); | ||
267 | WARN_ON_ONCE(ret == -ENOSYS); | 262 | WARN_ON_ONCE(ret == -ENOSYS); |
268 | if (ret) { | 263 | if (ret < 0) { |
269 | snprintf(err_buf, STACK_ERR_BUF_SIZE, | 264 | snprintf(err_buf, STACK_ERR_BUF_SIZE, |
270 | "%s: %s:%d has an unreliable stack\n", | 265 | "%s: %s:%d has an unreliable stack\n", |
271 | __func__, task->comm, task->pid); | 266 | __func__, task->comm, task->pid); |
272 | return ret; | 267 | return ret; |
273 | } | 268 | } |
269 | nr_entries = ret; | ||
274 | 270 | ||
275 | klp_for_each_object(klp_transition_patch, obj) { | 271 | klp_for_each_object(klp_transition_patch, obj) { |
276 | if (!obj->patched) | 272 | if (!obj->patched) |
277 | continue; | 273 | continue; |
278 | klp_for_each_func(obj, func) { | 274 | klp_for_each_func(obj, func) { |
279 | ret = klp_check_stack_func(func, &trace); | 275 | ret = klp_check_stack_func(func, entries, nr_entries); |
280 | if (ret) { | 276 | if (ret) { |
281 | snprintf(err_buf, STACK_ERR_BUF_SIZE, | 277 | snprintf(err_buf, STACK_ERR_BUF_SIZE, |
282 | "%s: %s:%d is sleeping on function %s\n", | 278 | "%s: %s:%d is sleeping on function %s\n", |
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 392c7f23af76..6fe2f333aecb 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile | |||
@@ -3,7 +3,7 @@ | |||
3 | # and is generally not a function of system call inputs. | 3 | # and is generally not a function of system call inputs. |
4 | KCOV_INSTRUMENT := n | 4 | KCOV_INSTRUMENT := n |
5 | 5 | ||
6 | obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o | 6 | obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o rwsem-xadd.o |
7 | 7 | ||
8 | ifdef CONFIG_FUNCTION_TRACER | 8 | ifdef CONFIG_FUNCTION_TRACER |
9 | CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE) | 9 | CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE) |
@@ -25,8 +25,7 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o | |||
25 | obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o | 25 | obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o |
26 | obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o | 26 | obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o |
27 | obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o | 27 | obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o |
28 | obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o | ||
29 | obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o | ||
30 | obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o | 28 | obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o |
31 | obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o | 29 | obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o |
32 | obj-$(CONFIG_WW_MUTEX_SELFTEST) += test-ww_mutex.o | 30 | obj-$(CONFIG_WW_MUTEX_SELFTEST) += test-ww_mutex.o |
31 | obj-$(CONFIG_LOCK_EVENT_COUNTS) += lock_events.o | ||
diff --git a/kernel/locking/lock_events.c b/kernel/locking/lock_events.c new file mode 100644 index 000000000000..fa2c2f951c6b --- /dev/null +++ b/kernel/locking/lock_events.c | |||
@@ -0,0 +1,179 @@ | |||
1 | /* SPDX-License-Identifier: GPL-2.0 */ | ||
2 | /* | ||
3 | * This program is free software; you can redistribute it and/or modify | ||
4 | * it under the terms of the GNU General Public License as published by | ||
5 | * the Free Software Foundation; either version 2 of the License, or | ||
6 | * (at your option) any later version. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
11 | * GNU General Public License for more details. | ||
12 | * | ||
13 | * Authors: Waiman Long <waiman.long@hpe.com> | ||
14 | */ | ||
15 | |||
16 | /* | ||
17 | * Collect locking event counts | ||
18 | */ | ||
19 | #include <linux/debugfs.h> | ||
20 | #include <linux/sched.h> | ||
21 | #include <linux/sched/clock.h> | ||
22 | #include <linux/fs.h> | ||
23 | |||
24 | #include "lock_events.h" | ||
25 | |||
26 | #undef LOCK_EVENT | ||
27 | #define LOCK_EVENT(name) [LOCKEVENT_ ## name] = #name, | ||
28 | |||
29 | #define LOCK_EVENTS_DIR "lock_event_counts" | ||
30 | |||
31 | /* | ||
32 | * When CONFIG_LOCK_EVENT_COUNTS is enabled, event counts of different | ||
33 | * types of locks will be reported under the <debugfs>/lock_event_counts/ | ||
34 | * directory. See lock_events_list.h for the list of available locking | ||
35 | * events. | ||
36 | * | ||
37 | * Writing to the special ".reset_counts" file will reset all the above | ||
38 | * locking event counts. This is a very slow operation and so should not | ||
39 | * be done frequently. | ||
40 | * | ||
41 | * These event counts are implemented as per-cpu variables which are | ||
42 | * summed up whenever the corresponding debugfs files are read. This | ||
43 | * minimizes the added overhead, making the counts usable even in a production | ||
44 | * environment. | ||
45 | */ | ||
46 | static const char * const lockevent_names[lockevent_num + 1] = { | ||
47 | |||
48 | #include "lock_events_list.h" | ||
49 | |||
50 | [LOCKEVENT_reset_cnts] = ".reset_counts", | ||
51 | }; | ||
52 | |||
53 | /* | ||
54 | * Per-cpu counts | ||
55 | */ | ||
56 | DEFINE_PER_CPU(unsigned long, lockevents[lockevent_num]); | ||
57 | |||
58 | /* | ||
59 | * The lockevent_read() function can be overridden. | ||
60 | */ | ||
61 | ssize_t __weak lockevent_read(struct file *file, char __user *user_buf, | ||
62 | size_t count, loff_t *ppos) | ||
63 | { | ||
64 | char buf[64]; | ||
65 | int cpu, id, len; | ||
66 | u64 sum = 0; | ||
67 | |||
68 | /* | ||
69 | * Get the counter ID stored in file->f_inode->i_private | ||
70 | */ | ||
71 | id = (long)file_inode(file)->i_private; | ||
72 | |||
73 | if (id >= lockevent_num) | ||
74 | return -EBADF; | ||
75 | |||
76 | for_each_possible_cpu(cpu) | ||
77 | sum += per_cpu(lockevents[id], cpu); | ||
78 | len = snprintf(buf, sizeof(buf) - 1, "%llu\n", sum); | ||
79 | |||
80 | return simple_read_from_buffer(user_buf, count, ppos, buf, len); | ||
81 | } | ||
82 | |||
83 | /* | ||
84 | * Function to handle write request | ||
85 | * | ||
86 | * When idx = reset_cnts, reset all the counts. | ||
87 | */ | ||
88 | static ssize_t lockevent_write(struct file *file, const char __user *user_buf, | ||
89 | size_t count, loff_t *ppos) | ||
90 | { | ||
91 | int cpu; | ||
92 | |||
93 | /* | ||
94 | * Get the counter ID stored in file->f_inode->i_private | ||
95 | */ | ||
96 | if ((long)file_inode(file)->i_private != LOCKEVENT_reset_cnts) | ||
97 | return count; | ||
98 | |||
99 | for_each_possible_cpu(cpu) { | ||
100 | int i; | ||
101 | unsigned long *ptr = per_cpu_ptr(lockevents, cpu); | ||
102 | |||
103 | for (i = 0 ; i < lockevent_num; i++) | ||
104 | WRITE_ONCE(ptr[i], 0); | ||
105 | } | ||
106 | return count; | ||
107 | } | ||
108 | |||
109 | /* | ||
110 | * Debugfs data structures | ||
111 | */ | ||
112 | static const struct file_operations fops_lockevent = { | ||
113 | .read = lockevent_read, | ||
114 | .write = lockevent_write, | ||
115 | .llseek = default_llseek, | ||
116 | }; | ||
117 | |||
118 | #ifdef CONFIG_PARAVIRT_SPINLOCKS | ||
119 | #include <asm/paravirt.h> | ||
120 | |||
121 | static bool __init skip_lockevent(const char *name) | ||
122 | { | ||
123 | static int pv_on __initdata = -1; | ||
124 | |||
125 | if (pv_on < 0) | ||
126 | pv_on = !pv_is_native_spin_unlock(); | ||
127 | /* | ||
128 | * Skip PV qspinlock events on bare metal. | ||
129 | */ | ||
130 | if (!pv_on && !memcmp(name, "pv_", 3)) | ||
131 | return true; | ||
132 | return false; | ||
133 | } | ||
134 | #else | ||
135 | static inline bool skip_lockevent(const char *name) | ||
136 | { | ||
137 | return false; | ||
138 | } | ||
139 | #endif | ||
140 | |||
141 | /* | ||
142 | * Initialize debugfs for the locking event counts. | ||
143 | */ | ||
144 | static int __init init_lockevent_counts(void) | ||
145 | { | ||
146 | struct dentry *d_counts = debugfs_create_dir(LOCK_EVENTS_DIR, NULL); | ||
147 | int i; | ||
148 | |||
149 | if (!d_counts) | ||
150 | goto out; | ||
151 | |||
152 | /* | ||
153 | * Create the debugfs files | ||
154 | * | ||
155 | * As reading from and writing to the stat files can be slow, only | ||
156 | * root is allowed to do the read/write to limit impact to system | ||
157 | * performance. | ||
158 | */ | ||
159 | for (i = 0; i < lockevent_num; i++) { | ||
160 | if (skip_lockevent(lockevent_names[i])) | ||
161 | continue; | ||
162 | if (!debugfs_create_file(lockevent_names[i], 0400, d_counts, | ||
163 | (void *)(long)i, &fops_lockevent)) | ||
164 | goto fail_undo; | ||
165 | } | ||
166 | |||
167 | if (!debugfs_create_file(lockevent_names[LOCKEVENT_reset_cnts], 0200, | ||
168 | d_counts, (void *)(long)LOCKEVENT_reset_cnts, | ||
169 | &fops_lockevent)) | ||
170 | goto fail_undo; | ||
171 | |||
172 | return 0; | ||
173 | fail_undo: | ||
174 | debugfs_remove_recursive(d_counts); | ||
175 | out: | ||
176 | pr_warn("Could not create '%s' debugfs entries\n", LOCK_EVENTS_DIR); | ||
177 | return -ENOMEM; | ||
178 | } | ||
179 | fs_initcall(init_lockevent_counts); | ||
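
The new lock_events.c above exposes one read-only debugfs file per event plus a write-only ".reset_counts" file, all restricted to root. A minimal user-space sketch of how the interface could be exercised, assuming debugfs is mounted at /sys/kernel/debug and that LOCK_EVENTS_DIR expands to "lock_event_counts" as the header comment suggests (both are assumptions, not confirmed by this hunk):

	/*
	 * Hedged sketch: read one lock event counter, then reset them all.
	 * Assumes /sys/kernel/debug/lock_event_counts/ exists and that the
	 * caller is root (the files are created 0400/0200 above).
	 */
	#include <fcntl.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <unistd.h>

	#define EVENT_DIR "/sys/kernel/debug/lock_event_counts"

	static long long read_event(const char *name)
	{
		char path[256], buf[64];
		ssize_t n;
		int fd;

		snprintf(path, sizeof(path), "%s/%s", EVENT_DIR, name);
		fd = open(path, O_RDONLY);
		if (fd < 0)
			return -1;
		n = read(fd, buf, sizeof(buf) - 1);
		close(fd);
		if (n <= 0)
			return -1;
		buf[n] = '\0';
		return atoll(buf);
	}

	static int reset_events(void)
	{
		int fd = open(EVENT_DIR "/.reset_counts", O_WRONLY);

		if (fd < 0)
			return -1;
		/* Any write triggers the reset path in lockevent_write(). */
		if (write(fd, "1", 1) != 1) {
			close(fd);
			return -1;
		}
		close(fd);
		return 0;
	}

	int main(void)
	{
		printf("lock_pending = %lld\n", read_event("lock_pending"));
		return reset_events() ? 1 : 0;
	}
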
diff --git a/kernel/locking/lock_events.h b/kernel/locking/lock_events.h new file mode 100644 index 000000000000..feb1acc54611 --- /dev/null +++ b/kernel/locking/lock_events.h | |||
@@ -0,0 +1,59 @@ | |||
1 | /* SPDX-License-Identifier: GPL-2.0 */ | ||
2 | /* | ||
3 | * This program is free software; you can redistribute it and/or modify | ||
4 | * it under the terms of the GNU General Public License as published by | ||
5 | * the Free Software Foundation; either version 2 of the License, or | ||
6 | * (at your option) any later version. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
11 | * GNU General Public License for more details. | ||
12 | * | ||
13 | * Authors: Waiman Long <longman@redhat.com> | ||
14 | */ | ||
15 | |||
16 | #ifndef __LOCKING_LOCK_EVENTS_H | ||
17 | #define __LOCKING_LOCK_EVENTS_H | ||
18 | |||
19 | enum lock_events { | ||
20 | |||
21 | #include "lock_events_list.h" | ||
22 | |||
23 | lockevent_num, /* Total number of lock event counts */ | ||
24 | LOCKEVENT_reset_cnts = lockevent_num, | ||
25 | }; | ||
26 | |||
27 | #ifdef CONFIG_LOCK_EVENT_COUNTS | ||
28 | /* | ||
29 | * Per-cpu counters | ||
30 | */ | ||
31 | DECLARE_PER_CPU(unsigned long, lockevents[lockevent_num]); | ||
32 | |||
33 | /* | ||
34 | * Increment the per-cpu lock event counts | ||
35 | */ | ||
36 | static inline void __lockevent_inc(enum lock_events event, bool cond) | ||
37 | { | ||
38 | if (cond) | ||
39 | __this_cpu_inc(lockevents[event]); | ||
40 | } | ||
41 | |||
42 | #define lockevent_inc(ev) __lockevent_inc(LOCKEVENT_ ##ev, true) | ||
43 | #define lockevent_cond_inc(ev, c) __lockevent_inc(LOCKEVENT_ ##ev, c) | ||
44 | |||
45 | static inline void __lockevent_add(enum lock_events event, int inc) | ||
46 | { | ||
47 | __this_cpu_add(lockevents[event], inc); | ||
48 | } | ||
49 | |||
50 | #define lockevent_add(ev, c) __lockevent_add(LOCKEVENT_ ##ev, c) | ||
51 | |||
52 | #else /* CONFIG_LOCK_EVENT_COUNTS */ | ||
53 | |||
54 | #define lockevent_inc(ev) | ||
55 | #define lockevent_add(ev, c) | ||
56 | #define lockevent_cond_inc(ev, c) | ||
57 | |||
58 | #endif /* CONFIG_LOCK_EVENT_COUNTS */ | ||
59 | #endif /* __LOCKING_LOCK_EVENTS_H */ | ||
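
The point of the #else branch above is that instrumentation sites never need an #ifdef of their own: with CONFIG_LOCK_EVENT_COUNTS disabled, lockevent_inc()/lockevent_cond_inc()/lockevent_add() expand to nothing and the counters cost no code. A stand-alone sketch of the same pattern with invented names (an illustration of the idiom, not kernel API):

	/*
	 * Illustration of the "counters compile away" idiom used by
	 * lock_events.h; EVENT_COUNTS stands in for CONFIG_LOCK_EVENT_COUNTS
	 * and every identifier here is made up.
	 */
	#include <stdio.h>

	enum events { EV_slowpath, EV_pending, ev_num };

	#ifdef EVENT_COUNTS
	static unsigned long counts[ev_num];
	#define event_inc(ev)		counts[EV_ ## ev]++
	#define event_cond_inc(ev, c)	do { if (c) counts[EV_ ## ev]++; } while (0)
	#else
	/* Disabled: call sites still compile but generate no code. */
	#define event_inc(ev)		do { } while (0)
	#define event_cond_inc(ev, c)	do { (void)(c); } while (0)
	#endif

	static void lock_slowpath(int contended)
	{
		event_inc(slowpath);		/* no #ifdef at the call site */
		event_cond_inc(pending, contended);
	}

	int main(void)
	{
		lock_slowpath(1);
	#ifdef EVENT_COUNTS
		printf("slowpath=%lu pending=%lu\n",
		       counts[EV_slowpath], counts[EV_pending]);
	#endif
		return 0;
	}

Note that the disabled kernel macros above expand to nothing at all, so a conditional argument is not even evaluated; the sketch evaluates it only to avoid unused-variable warnings.
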
diff --git a/kernel/locking/lock_events_list.h b/kernel/locking/lock_events_list.h new file mode 100644 index 000000000000..ad7668cfc9da --- /dev/null +++ b/kernel/locking/lock_events_list.h | |||
@@ -0,0 +1,67 @@ | |||
1 | /* SPDX-License-Identifier: GPL-2.0 */ | ||
2 | /* | ||
3 | * This program is free software; you can redistribute it and/or modify | ||
4 | * it under the terms of the GNU General Public License as published by | ||
5 | * the Free Software Foundation; either version 2 of the License, or | ||
6 | * (at your option) any later version. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
11 | * GNU General Public License for more details. | ||
12 | * | ||
13 | * Authors: Waiman Long <longman@redhat.com> | ||
14 | */ | ||
15 | |||
16 | #ifndef LOCK_EVENT | ||
17 | #define LOCK_EVENT(name) LOCKEVENT_ ## name, | ||
18 | #endif | ||
19 | |||
20 | #ifdef CONFIG_QUEUED_SPINLOCKS | ||
21 | #ifdef CONFIG_PARAVIRT_SPINLOCKS | ||
22 | /* | ||
23 | * Locking events for PV qspinlock. | ||
24 | */ | ||
25 | LOCK_EVENT(pv_hash_hops) /* Average # of hops per hashing operation */ | ||
26 | LOCK_EVENT(pv_kick_unlock) /* # of vCPU kicks issued at unlock time */ | ||
27 | LOCK_EVENT(pv_kick_wake) /* # of vCPU kicks for pv_latency_wake */ | ||
28 | LOCK_EVENT(pv_latency_kick) /* Average latency (ns) of vCPU kick */ | ||
29 | LOCK_EVENT(pv_latency_wake) /* Average latency (ns) of kick-to-wakeup */ | ||
30 | LOCK_EVENT(pv_lock_stealing) /* # of lock stealing operations */ | ||
31 | LOCK_EVENT(pv_spurious_wakeup) /* # of spurious wakeups in non-head vCPUs */ | ||
32 | LOCK_EVENT(pv_wait_again) /* # of wait's after queue head vCPU kick */ | ||
33 | LOCK_EVENT(pv_wait_early) /* # of early vCPU wait's */ | ||
34 | LOCK_EVENT(pv_wait_head) /* # of vCPU wait's at the queue head */ | ||
35 | LOCK_EVENT(pv_wait_node) /* # of vCPU wait's at non-head queue node */ | ||
36 | #endif /* CONFIG_PARAVIRT_SPINLOCKS */ | ||
37 | |||
38 | /* | ||
39 | * Locking events for qspinlock | ||
40 | * | ||
41 | * Subtracting lock_use_node[234] from lock_slowpath will give you | ||
42 | * lock_use_node1. | ||
43 | */ | ||
44 | LOCK_EVENT(lock_pending) /* # of locking ops via pending code */ | ||
45 | LOCK_EVENT(lock_slowpath) /* # of locking ops via MCS lock queue */ | ||
46 | LOCK_EVENT(lock_use_node2) /* # of locking ops that use 2nd percpu node */ | ||
47 | LOCK_EVENT(lock_use_node3) /* # of locking ops that use 3rd percpu node */ | ||
48 | LOCK_EVENT(lock_use_node4) /* # of locking ops that use 4th percpu node */ | ||
49 | LOCK_EVENT(lock_no_node) /* # of locking ops w/o using percpu node */ | ||
50 | #endif /* CONFIG_QUEUED_SPINLOCKS */ | ||
51 | |||
52 | /* | ||
53 | * Locking events for rwsem | ||
54 | */ | ||
55 | LOCK_EVENT(rwsem_sleep_reader) /* # of reader sleeps */ | ||
56 | LOCK_EVENT(rwsem_sleep_writer) /* # of writer sleeps */ | ||
57 | LOCK_EVENT(rwsem_wake_reader) /* # of reader wakeups */ | ||
58 | LOCK_EVENT(rwsem_wake_writer) /* # of writer wakeups */ | ||
59 | LOCK_EVENT(rwsem_opt_wlock) /* # of write locks opt-spin acquired */ | ||
60 | LOCK_EVENT(rwsem_opt_fail) /* # of failed opt-spinnings */ | ||
61 | LOCK_EVENT(rwsem_rlock) /* # of read locks acquired */ | ||
62 | LOCK_EVENT(rwsem_rlock_fast) /* # of fast read locks acquired */ | ||
63 | LOCK_EVENT(rwsem_rlock_fail) /* # of failed read lock acquisitions */ | ||
64 | LOCK_EVENT(rwsem_rtrylock) /* # of read trylock calls */ | ||
65 | LOCK_EVENT(rwsem_wlock) /* # of write locks acquired */ | ||
66 | LOCK_EVENT(rwsem_wlock_fail) /* # of failed write lock acquisitions */ | ||
67 | LOCK_EVENT(rwsem_wtrylock) /* # of write trylock calls */ | ||
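
lock_events_list.h is consumed as an "X macro" list: lock_events.h includes it with the default LOCK_EVENT(name) definition above, turning every line into an enum member, while lock_events.c presumably redefines LOCK_EVENT to emit a string entry before including it again (that part of the file sits above this excerpt), so the enum and lockevent_names[] can never drift apart. A minimal stand-alone sketch of the technique; it folds the list into a single macro parameter for brevity, and every identifier is invented:

	/*
	 * X-macro sketch: one list, expanded twice, keeps the enum and the
	 * name table in lockstep. Illustrative names only.
	 */
	#include <stdio.h>

	/* Stand-in for lock_events_list.h: one X(name) per event. */
	#define MY_EVENT_LIST(X)	\
		X(lock_pending)		\
		X(lock_slowpath)	\
		X(rwsem_wake_reader)

	/* First expansion: the enum. */
	#define AS_ENUM(name)	MYEVENT_ ## name,
	enum my_events {
		MY_EVENT_LIST(AS_ENUM)
		myevent_num,
	};

	/* Second expansion: the matching name table. */
	#define AS_NAME(name)	[MYEVENT_ ## name] = #name,
	static const char * const myevent_names[myevent_num] = {
		MY_EVENT_LIST(AS_NAME)
	};

	int main(void)
	{
		int i;

		for (i = 0; i < myevent_num; i++)
			printf("%d: %s\n", i, myevent_names[i]);
		return 0;
	}
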
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 34cdcbedda49..d06190fa5082 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c | |||
@@ -434,29 +434,14 @@ static void print_lockdep_off(const char *bug_msg) | |||
434 | #endif | 434 | #endif |
435 | } | 435 | } |
436 | 436 | ||
437 | static int save_trace(struct stack_trace *trace) | 437 | static int save_trace(struct lock_trace *trace) |
438 | { | 438 | { |
439 | trace->nr_entries = 0; | 439 | unsigned long *entries = stack_trace + nr_stack_trace_entries; |
440 | trace->max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries; | 440 | unsigned int max_entries; |
441 | trace->entries = stack_trace + nr_stack_trace_entries; | ||
442 | |||
443 | trace->skip = 3; | ||
444 | |||
445 | save_stack_trace(trace); | ||
446 | |||
447 | /* | ||
448 | * Some daft arches put -1 at the end to indicate its a full trace. | ||
449 | * | ||
450 | * <rant> this is buggy anyway, since it takes a whole extra entry so a | ||
451 | * complete trace that maxes out the entries provided will be reported | ||
452 | * as incomplete, friggin useless </rant> | ||
453 | */ | ||
454 | if (trace->nr_entries != 0 && | ||
455 | trace->entries[trace->nr_entries-1] == ULONG_MAX) | ||
456 | trace->nr_entries--; | ||
457 | |||
458 | trace->max_entries = trace->nr_entries; | ||
459 | 441 | ||
442 | trace->offset = nr_stack_trace_entries; | ||
443 | max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries; | ||
444 | trace->nr_entries = stack_trace_save(entries, max_entries, 3); | ||
460 | nr_stack_trace_entries += trace->nr_entries; | 445 | nr_stack_trace_entries += trace->nr_entries; |
461 | 446 | ||
462 | if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) { | 447 | if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) { |
@@ -516,11 +501,11 @@ static char get_usage_char(struct lock_class *class, enum lock_usage_bit bit) | |||
516 | { | 501 | { |
517 | char c = '.'; | 502 | char c = '.'; |
518 | 503 | ||
519 | if (class->usage_mask & lock_flag(bit + 2)) | 504 | if (class->usage_mask & lock_flag(bit + LOCK_USAGE_DIR_MASK)) |
520 | c = '+'; | 505 | c = '+'; |
521 | if (class->usage_mask & lock_flag(bit)) { | 506 | if (class->usage_mask & lock_flag(bit)) { |
522 | c = '-'; | 507 | c = '-'; |
523 | if (class->usage_mask & lock_flag(bit + 2)) | 508 | if (class->usage_mask & lock_flag(bit + LOCK_USAGE_DIR_MASK)) |
524 | c = '?'; | 509 | c = '?'; |
525 | } | 510 | } |
526 | 511 | ||
@@ -649,6 +634,9 @@ static int static_obj(const void *obj) | |||
649 | end = (unsigned long) &_end, | 634 | end = (unsigned long) &_end, |
650 | addr = (unsigned long) obj; | 635 | addr = (unsigned long) obj; |
651 | 636 | ||
637 | if (arch_is_kernel_initmem_freed(addr)) | ||
638 | return 0; | ||
639 | |||
652 | /* | 640 | /* |
653 | * static variable? | 641 | * static variable? |
654 | */ | 642 | */ |
@@ -1207,7 +1195,7 @@ static struct lock_list *alloc_list_entry(void) | |||
1207 | static int add_lock_to_list(struct lock_class *this, | 1195 | static int add_lock_to_list(struct lock_class *this, |
1208 | struct lock_class *links_to, struct list_head *head, | 1196 | struct lock_class *links_to, struct list_head *head, |
1209 | unsigned long ip, int distance, | 1197 | unsigned long ip, int distance, |
1210 | struct stack_trace *trace) | 1198 | struct lock_trace *trace) |
1211 | { | 1199 | { |
1212 | struct lock_list *entry; | 1200 | struct lock_list *entry; |
1213 | /* | 1201 | /* |
@@ -1426,6 +1414,13 @@ static inline int __bfs_backwards(struct lock_list *src_entry, | |||
1426 | * checking. | 1414 | * checking. |
1427 | */ | 1415 | */ |
1428 | 1416 | ||
1417 | static void print_lock_trace(struct lock_trace *trace, unsigned int spaces) | ||
1418 | { | ||
1419 | unsigned long *entries = stack_trace + trace->offset; | ||
1420 | |||
1421 | stack_trace_print(entries, trace->nr_entries, spaces); | ||
1422 | } | ||
1423 | |||
1429 | /* | 1424 | /* |
1430 | * Print a dependency chain entry (this is only done when a deadlock | 1425 | * Print a dependency chain entry (this is only done when a deadlock |
1431 | * has been detected): | 1426 | * has been detected): |
@@ -1438,8 +1433,7 @@ print_circular_bug_entry(struct lock_list *target, int depth) | |||
1438 | printk("\n-> #%u", depth); | 1433 | printk("\n-> #%u", depth); |
1439 | print_lock_name(target->class); | 1434 | print_lock_name(target->class); |
1440 | printk(KERN_CONT ":\n"); | 1435 | printk(KERN_CONT ":\n"); |
1441 | print_stack_trace(&target->trace, 6); | 1436 | print_lock_trace(&target->trace, 6); |
1442 | |||
1443 | return 0; | 1437 | return 0; |
1444 | } | 1438 | } |
1445 | 1439 | ||
@@ -1533,10 +1527,9 @@ static inline int class_equal(struct lock_list *entry, void *data) | |||
1533 | } | 1527 | } |
1534 | 1528 | ||
1535 | static noinline int print_circular_bug(struct lock_list *this, | 1529 | static noinline int print_circular_bug(struct lock_list *this, |
1536 | struct lock_list *target, | 1530 | struct lock_list *target, |
1537 | struct held_lock *check_src, | 1531 | struct held_lock *check_src, |
1538 | struct held_lock *check_tgt, | 1532 | struct held_lock *check_tgt) |
1539 | struct stack_trace *trace) | ||
1540 | { | 1533 | { |
1541 | struct task_struct *curr = current; | 1534 | struct task_struct *curr = current; |
1542 | struct lock_list *parent; | 1535 | struct lock_list *parent; |
@@ -1676,19 +1669,25 @@ check_redundant(struct lock_list *root, struct lock_class *target, | |||
1676 | } | 1669 | } |
1677 | 1670 | ||
1678 | #if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) | 1671 | #if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) |
1672 | |||
1673 | static inline int usage_accumulate(struct lock_list *entry, void *mask) | ||
1674 | { | ||
1675 | *(unsigned long *)mask |= entry->class->usage_mask; | ||
1676 | |||
1677 | return 0; | ||
1678 | } | ||
1679 | |||
1679 | /* | 1680 | /* |
1680 | * Forwards and backwards subgraph searching, for the purposes of | 1681 | * Forwards and backwards subgraph searching, for the purposes of |
1681 | * proving that two subgraphs can be connected by a new dependency | 1682 | * proving that two subgraphs can be connected by a new dependency |
1682 | * without creating any illegal irq-safe -> irq-unsafe lock dependency. | 1683 | * without creating any illegal irq-safe -> irq-unsafe lock dependency. |
1683 | */ | 1684 | */ |
1684 | 1685 | ||
1685 | static inline int usage_match(struct lock_list *entry, void *bit) | 1686 | static inline int usage_match(struct lock_list *entry, void *mask) |
1686 | { | 1687 | { |
1687 | return entry->class->usage_mask & (1 << (enum lock_usage_bit)bit); | 1688 | return entry->class->usage_mask & *(unsigned long *)mask; |
1688 | } | 1689 | } |
1689 | 1690 | ||
1690 | |||
1691 | |||
1692 | /* | 1691 | /* |
1693 | * Find a node in the forwards-direction dependency sub-graph starting | 1692 | * Find a node in the forwards-direction dependency sub-graph starting |
1694 | * at @root->class that matches @bit. | 1693 | * at @root->class that matches @bit. |
@@ -1700,14 +1699,14 @@ static inline int usage_match(struct lock_list *entry, void *bit) | |||
1700 | * Return <0 on error. | 1699 | * Return <0 on error. |
1701 | */ | 1700 | */ |
1702 | static int | 1701 | static int |
1703 | find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit, | 1702 | find_usage_forwards(struct lock_list *root, unsigned long usage_mask, |
1704 | struct lock_list **target_entry) | 1703 | struct lock_list **target_entry) |
1705 | { | 1704 | { |
1706 | int result; | 1705 | int result; |
1707 | 1706 | ||
1708 | debug_atomic_inc(nr_find_usage_forwards_checks); | 1707 | debug_atomic_inc(nr_find_usage_forwards_checks); |
1709 | 1708 | ||
1710 | result = __bfs_forwards(root, (void *)bit, usage_match, target_entry); | 1709 | result = __bfs_forwards(root, &usage_mask, usage_match, target_entry); |
1711 | 1710 | ||
1712 | return result; | 1711 | return result; |
1713 | } | 1712 | } |
@@ -1723,14 +1722,14 @@ find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit, | |||
1723 | * Return <0 on error. | 1722 | * Return <0 on error. |
1724 | */ | 1723 | */ |
1725 | static int | 1724 | static int |
1726 | find_usage_backwards(struct lock_list *root, enum lock_usage_bit bit, | 1725 | find_usage_backwards(struct lock_list *root, unsigned long usage_mask, |
1727 | struct lock_list **target_entry) | 1726 | struct lock_list **target_entry) |
1728 | { | 1727 | { |
1729 | int result; | 1728 | int result; |
1730 | 1729 | ||
1731 | debug_atomic_inc(nr_find_usage_backwards_checks); | 1730 | debug_atomic_inc(nr_find_usage_backwards_checks); |
1732 | 1731 | ||
1733 | result = __bfs_backwards(root, (void *)bit, usage_match, target_entry); | 1732 | result = __bfs_backwards(root, &usage_mask, usage_match, target_entry); |
1734 | 1733 | ||
1735 | return result; | 1734 | return result; |
1736 | } | 1735 | } |
@@ -1752,7 +1751,7 @@ static void print_lock_class_header(struct lock_class *class, int depth) | |||
1752 | 1751 | ||
1753 | len += printk("%*s %s", depth, "", usage_str[bit]); | 1752 | len += printk("%*s %s", depth, "", usage_str[bit]); |
1754 | len += printk(KERN_CONT " at:\n"); | 1753 | len += printk(KERN_CONT " at:\n"); |
1755 | print_stack_trace(class->usage_traces + bit, len); | 1754 | print_lock_trace(class->usage_traces + bit, len); |
1756 | } | 1755 | } |
1757 | } | 1756 | } |
1758 | printk("%*s }\n", depth, ""); | 1757 | printk("%*s }\n", depth, ""); |
@@ -1777,7 +1776,7 @@ print_shortest_lock_dependencies(struct lock_list *leaf, | |||
1777 | do { | 1776 | do { |
1778 | print_lock_class_header(entry->class, depth); | 1777 | print_lock_class_header(entry->class, depth); |
1779 | printk("%*s ... acquired at:\n", depth, ""); | 1778 | printk("%*s ... acquired at:\n", depth, ""); |
1780 | print_stack_trace(&entry->trace, 2); | 1779 | print_lock_trace(&entry->trace, 2); |
1781 | printk("\n"); | 1780 | printk("\n"); |
1782 | 1781 | ||
1783 | if (depth == 0 && (entry != root)) { | 1782 | if (depth == 0 && (entry != root)) { |
@@ -1890,14 +1889,14 @@ print_bad_irq_dependency(struct task_struct *curr, | |||
1890 | print_lock_name(backwards_entry->class); | 1889 | print_lock_name(backwards_entry->class); |
1891 | pr_warn("\n... which became %s-irq-safe at:\n", irqclass); | 1890 | pr_warn("\n... which became %s-irq-safe at:\n", irqclass); |
1892 | 1891 | ||
1893 | print_stack_trace(backwards_entry->class->usage_traces + bit1, 1); | 1892 | print_lock_trace(backwards_entry->class->usage_traces + bit1, 1); |
1894 | 1893 | ||
1895 | pr_warn("\nto a %s-irq-unsafe lock:\n", irqclass); | 1894 | pr_warn("\nto a %s-irq-unsafe lock:\n", irqclass); |
1896 | print_lock_name(forwards_entry->class); | 1895 | print_lock_name(forwards_entry->class); |
1897 | pr_warn("\n... which became %s-irq-unsafe at:\n", irqclass); | 1896 | pr_warn("\n... which became %s-irq-unsafe at:\n", irqclass); |
1898 | pr_warn("..."); | 1897 | pr_warn("..."); |
1899 | 1898 | ||
1900 | print_stack_trace(forwards_entry->class->usage_traces + bit2, 1); | 1899 | print_lock_trace(forwards_entry->class->usage_traces + bit2, 1); |
1901 | 1900 | ||
1902 | pr_warn("\nother info that might help us debug this:\n\n"); | 1901 | pr_warn("\nother info that might help us debug this:\n\n"); |
1903 | print_irq_lock_scenario(backwards_entry, forwards_entry, | 1902 | print_irq_lock_scenario(backwards_entry, forwards_entry, |
@@ -1922,39 +1921,6 @@ print_bad_irq_dependency(struct task_struct *curr, | |||
1922 | return 0; | 1921 | return 0; |
1923 | } | 1922 | } |
1924 | 1923 | ||
1925 | static int | ||
1926 | check_usage(struct task_struct *curr, struct held_lock *prev, | ||
1927 | struct held_lock *next, enum lock_usage_bit bit_backwards, | ||
1928 | enum lock_usage_bit bit_forwards, const char *irqclass) | ||
1929 | { | ||
1930 | int ret; | ||
1931 | struct lock_list this, that; | ||
1932 | struct lock_list *uninitialized_var(target_entry); | ||
1933 | struct lock_list *uninitialized_var(target_entry1); | ||
1934 | |||
1935 | this.parent = NULL; | ||
1936 | |||
1937 | this.class = hlock_class(prev); | ||
1938 | ret = find_usage_backwards(&this, bit_backwards, &target_entry); | ||
1939 | if (ret < 0) | ||
1940 | return print_bfs_bug(ret); | ||
1941 | if (ret == 1) | ||
1942 | return ret; | ||
1943 | |||
1944 | that.parent = NULL; | ||
1945 | that.class = hlock_class(next); | ||
1946 | ret = find_usage_forwards(&that, bit_forwards, &target_entry1); | ||
1947 | if (ret < 0) | ||
1948 | return print_bfs_bug(ret); | ||
1949 | if (ret == 1) | ||
1950 | return ret; | ||
1951 | |||
1952 | return print_bad_irq_dependency(curr, &this, &that, | ||
1953 | target_entry, target_entry1, | ||
1954 | prev, next, | ||
1955 | bit_backwards, bit_forwards, irqclass); | ||
1956 | } | ||
1957 | |||
1958 | static const char *state_names[] = { | 1924 | static const char *state_names[] = { |
1959 | #define LOCKDEP_STATE(__STATE) \ | 1925 | #define LOCKDEP_STATE(__STATE) \ |
1960 | __stringify(__STATE), | 1926 | __stringify(__STATE), |
@@ -1971,9 +1937,19 @@ static const char *state_rnames[] = { | |||
1971 | 1937 | ||
1972 | static inline const char *state_name(enum lock_usage_bit bit) | 1938 | static inline const char *state_name(enum lock_usage_bit bit) |
1973 | { | 1939 | { |
1974 | return (bit & LOCK_USAGE_READ_MASK) ? state_rnames[bit >> 2] : state_names[bit >> 2]; | 1940 | if (bit & LOCK_USAGE_READ_MASK) |
1941 | return state_rnames[bit >> LOCK_USAGE_DIR_MASK]; | ||
1942 | else | ||
1943 | return state_names[bit >> LOCK_USAGE_DIR_MASK]; | ||
1975 | } | 1944 | } |
1976 | 1945 | ||
1946 | /* | ||
1947 | * The bit number is encoded like: | ||
1948 | * | ||
1949 | * bit0: 0 exclusive, 1 read lock | ||
1950 | * bit1: 0 used in irq, 1 irq enabled | ||
1951 | * bit2-n: state | ||
1952 | */ | ||
1977 | static int exclusive_bit(int new_bit) | 1953 | static int exclusive_bit(int new_bit) |
1978 | { | 1954 | { |
1979 | int state = new_bit & LOCK_USAGE_STATE_MASK; | 1955 | int state = new_bit & LOCK_USAGE_STATE_MASK; |
@@ -1985,45 +1961,160 @@ static int exclusive_bit(int new_bit) | |||
1985 | return state | (dir ^ LOCK_USAGE_DIR_MASK); | 1961 | return state | (dir ^ LOCK_USAGE_DIR_MASK); |
1986 | } | 1962 | } |
1987 | 1963 | ||
1964 | /* | ||
1965 | * Observe that when given a bitmask where each bitnr is encoded as above, a | ||
1966 | * right shift of the mask transforms the individual bitnrs as -1 and | ||
1967 | * conversely, a left shift transforms into +1 for the individual bitnrs. | ||
1968 | * | ||
1969 | * So for all bits whose number have LOCK_ENABLED_* set (bitnr1 == 1), we can | ||
1970 | * create the mask with those bit numbers using LOCK_USED_IN_* (bitnr1 == 0) | ||
1971 | * instead by subtracting the bit number by 2, or shifting the mask right by 2. | ||
1972 | * | ||
1973 | * Similarly, bitnr1 == 0 becomes bitnr1 == 1 by adding 2, or shifting left 2. | ||
1974 | * | ||
1975 | * So split the mask (note that LOCKF_ENABLED_IRQ_ALL|LOCKF_USED_IN_IRQ_ALL is | ||
1976 | * all bits set) and recompose with bitnr1 flipped. | ||
1977 | */ | ||
1978 | static unsigned long invert_dir_mask(unsigned long mask) | ||
1979 | { | ||
1980 | unsigned long excl = 0; | ||
1981 | |||
1982 | /* Invert dir */ | ||
1983 | excl |= (mask & LOCKF_ENABLED_IRQ_ALL) >> LOCK_USAGE_DIR_MASK; | ||
1984 | excl |= (mask & LOCKF_USED_IN_IRQ_ALL) << LOCK_USAGE_DIR_MASK; | ||
1985 | |||
1986 | return excl; | ||
1987 | } | ||
1988 | |||
1989 | /* | ||
1990 | * As above, we clear bitnr0 (LOCK_*_READ off) with bitmask ops. First, for all | ||
1991 | * bits with bitnr0 set (LOCK_*_READ), add those with bitnr0 cleared (LOCK_*). | ||
1992 | * And then mask out all bitnr0. | ||
1993 | */ | ||
1994 | static unsigned long exclusive_mask(unsigned long mask) | ||
1995 | { | ||
1996 | unsigned long excl = invert_dir_mask(mask); | ||
1997 | |||
1998 | /* Strip read */ | ||
1999 | excl |= (excl & LOCKF_IRQ_READ) >> LOCK_USAGE_READ_MASK; | ||
2000 | excl &= ~LOCKF_IRQ_READ; | ||
2001 | |||
2002 | return excl; | ||
2003 | } | ||
2004 | |||
2005 | /* | ||
2006 | * Retrieve the _possible_ original mask to which @mask is | ||
2007 | * exclusive. Ie: this is the opposite of exclusive_mask(). | ||
2008 | * Note that 2 possible original bits can match an exclusive | ||
2009 | * bit: one has LOCK_USAGE_READ_MASK set, the other has it | ||
2010 | * cleared. So both are returned for each exclusive bit. | ||
2011 | */ | ||
2012 | static unsigned long original_mask(unsigned long mask) | ||
2013 | { | ||
2014 | unsigned long excl = invert_dir_mask(mask); | ||
2015 | |||
2016 | /* Include read in existing usages */ | ||
2017 | excl |= (excl & LOCKF_IRQ) << LOCK_USAGE_READ_MASK; | ||
2018 | |||
2019 | return excl; | ||
2020 | } | ||
2021 | |||
2022 | /* | ||
2023 | * Find the first pair of bit match between an original | ||
2024 | * usage mask and an exclusive usage mask. | ||
2025 | */ | ||
2026 | static int find_exclusive_match(unsigned long mask, | ||
2027 | unsigned long excl_mask, | ||
2028 | enum lock_usage_bit *bitp, | ||
2029 | enum lock_usage_bit *excl_bitp) | ||
2030 | { | ||
2031 | int bit, excl; | ||
2032 | |||
2033 | for_each_set_bit(bit, &mask, LOCK_USED) { | ||
2034 | excl = exclusive_bit(bit); | ||
2035 | if (excl_mask & lock_flag(excl)) { | ||
2036 | *bitp = bit; | ||
2037 | *excl_bitp = excl; | ||
2038 | return 0; | ||
2039 | } | ||
2040 | } | ||
2041 | return -1; | ||
2042 | } | ||
2043 | |||
2044 | /* | ||
2045 | * Prove that the new dependency does not connect a hardirq-safe(-read) | ||
2046 | * lock with a hardirq-unsafe lock - to achieve this we search | ||
2047 | * the backwards-subgraph starting at <prev>, and the | ||
2048 | * forwards-subgraph starting at <next>: | ||
2049 | */ | ||
1988 | static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, | 2050 | static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, |
1989 | struct held_lock *next, enum lock_usage_bit bit) | 2051 | struct held_lock *next) |
1990 | { | 2052 | { |
2053 | unsigned long usage_mask = 0, forward_mask, backward_mask; | ||
2054 | enum lock_usage_bit forward_bit = 0, backward_bit = 0; | ||
2055 | struct lock_list *uninitialized_var(target_entry1); | ||
2056 | struct lock_list *uninitialized_var(target_entry); | ||
2057 | struct lock_list this, that; | ||
2058 | int ret; | ||
2059 | |||
1991 | /* | 2060 | /* |
1992 | * Prove that the new dependency does not connect a hardirq-safe | 2061 | * Step 1: gather all hard/soft IRQs usages backward in an |
1993 | * lock with a hardirq-unsafe lock - to achieve this we search | 2062 | * accumulated usage mask. |
1994 | * the backwards-subgraph starting at <prev>, and the | ||
1995 | * forwards-subgraph starting at <next>: | ||
1996 | */ | 2063 | */ |
1997 | if (!check_usage(curr, prev, next, bit, | 2064 | this.parent = NULL; |
1998 | exclusive_bit(bit), state_name(bit))) | 2065 | this.class = hlock_class(prev); |
1999 | return 0; | 2066 | |
2067 | ret = __bfs_backwards(&this, &usage_mask, usage_accumulate, NULL); | ||
2068 | if (ret < 0) | ||
2069 | return print_bfs_bug(ret); | ||
2000 | 2070 | ||
2001 | bit++; /* _READ */ | 2071 | usage_mask &= LOCKF_USED_IN_IRQ_ALL; |
2072 | if (!usage_mask) | ||
2073 | return 1; | ||
2002 | 2074 | ||
2003 | /* | 2075 | /* |
2004 | * Prove that the new dependency does not connect a hardirq-safe-read | 2076 | * Step 2: find exclusive uses forward that match the previous |
2005 | * lock with a hardirq-unsafe lock - to achieve this we search | 2077 | * backward accumulated mask. |
2006 | * the backwards-subgraph starting at <prev>, and the | ||
2007 | * forwards-subgraph starting at <next>: | ||
2008 | */ | 2078 | */ |
2009 | if (!check_usage(curr, prev, next, bit, | 2079 | forward_mask = exclusive_mask(usage_mask); |
2010 | exclusive_bit(bit), state_name(bit))) | ||
2011 | return 0; | ||
2012 | 2080 | ||
2013 | return 1; | 2081 | that.parent = NULL; |
2014 | } | 2082 | that.class = hlock_class(next); |
2015 | 2083 | ||
2016 | static int | 2084 | ret = find_usage_forwards(&that, forward_mask, &target_entry1); |
2017 | check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, | 2085 | if (ret < 0) |
2018 | struct held_lock *next) | 2086 | return print_bfs_bug(ret); |
2019 | { | 2087 | if (ret == 1) |
2020 | #define LOCKDEP_STATE(__STATE) \ | 2088 | return ret; |
2021 | if (!check_irq_usage(curr, prev, next, LOCK_USED_IN_##__STATE)) \ | ||
2022 | return 0; | ||
2023 | #include "lockdep_states.h" | ||
2024 | #undef LOCKDEP_STATE | ||
2025 | 2089 | ||
2026 | return 1; | 2090 | /* |
2091 | * Step 3: we found a bad match! Now retrieve a lock from the backward | ||
2092 | * list whose usage mask matches the exclusive usage mask from the | ||
2093 | * lock found on the forward list. | ||
2094 | */ | ||
2095 | backward_mask = original_mask(target_entry1->class->usage_mask); | ||
2096 | |||
2097 | ret = find_usage_backwards(&this, backward_mask, &target_entry); | ||
2098 | if (ret < 0) | ||
2099 | return print_bfs_bug(ret); | ||
2100 | if (DEBUG_LOCKS_WARN_ON(ret == 1)) | ||
2101 | return 1; | ||
2102 | |||
2103 | /* | ||
2104 | * Step 4: narrow down to a pair of incompatible usage bits | ||
2105 | * and report it. | ||
2106 | */ | ||
2107 | ret = find_exclusive_match(target_entry->class->usage_mask, | ||
2108 | target_entry1->class->usage_mask, | ||
2109 | &backward_bit, &forward_bit); | ||
2110 | if (DEBUG_LOCKS_WARN_ON(ret == -1)) | ||
2111 | return 1; | ||
2112 | |||
2113 | return print_bad_irq_dependency(curr, &this, &that, | ||
2114 | target_entry, target_entry1, | ||
2115 | prev, next, | ||
2116 | backward_bit, forward_bit, | ||
2117 | state_name(backward_bit)); | ||
2027 | } | 2118 | } |
2028 | 2119 | ||
2029 | static void inc_chains(void) | 2120 | static void inc_chains(void) |
@@ -2040,9 +2131,8 @@ static void inc_chains(void) | |||
2040 | 2131 | ||
2041 | #else | 2132 | #else |
2042 | 2133 | ||
2043 | static inline int | 2134 | static inline int check_irq_usage(struct task_struct *curr, |
2044 | check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, | 2135 | struct held_lock *prev, struct held_lock *next) |
2045 | struct held_lock *next) | ||
2046 | { | 2136 | { |
2047 | return 1; | 2137 | return 1; |
2048 | } | 2138 | } |
@@ -2170,8 +2260,7 @@ check_deadlock(struct task_struct *curr, struct held_lock *next, | |||
2170 | */ | 2260 | */ |
2171 | static int | 2261 | static int |
2172 | check_prev_add(struct task_struct *curr, struct held_lock *prev, | 2262 | check_prev_add(struct task_struct *curr, struct held_lock *prev, |
2173 | struct held_lock *next, int distance, struct stack_trace *trace, | 2263 | struct held_lock *next, int distance, struct lock_trace *trace) |
2174 | int (*save)(struct stack_trace *trace)) | ||
2175 | { | 2264 | { |
2176 | struct lock_list *uninitialized_var(target_entry); | 2265 | struct lock_list *uninitialized_var(target_entry); |
2177 | struct lock_list *entry; | 2266 | struct lock_list *entry; |
@@ -2209,20 +2298,20 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, | |||
2209 | this.parent = NULL; | 2298 | this.parent = NULL; |
2210 | ret = check_noncircular(&this, hlock_class(prev), &target_entry); | 2299 | ret = check_noncircular(&this, hlock_class(prev), &target_entry); |
2211 | if (unlikely(!ret)) { | 2300 | if (unlikely(!ret)) { |
2212 | if (!trace->entries) { | 2301 | if (!trace->nr_entries) { |
2213 | /* | 2302 | /* |
2214 | * If @save fails here, the printing might trigger | 2303 | * If save_trace fails here, the printing might |
2215 | * a WARN but because of the !nr_entries it should | 2304 | * trigger a WARN but because of the !nr_entries it |
2216 | * not do bad things. | 2305 | * should not do bad things. |
2217 | */ | 2306 | */ |
2218 | save(trace); | 2307 | save_trace(trace); |
2219 | } | 2308 | } |
2220 | return print_circular_bug(&this, target_entry, next, prev, trace); | 2309 | return print_circular_bug(&this, target_entry, next, prev); |
2221 | } | 2310 | } |
2222 | else if (unlikely(ret < 0)) | 2311 | else if (unlikely(ret < 0)) |
2223 | return print_bfs_bug(ret); | 2312 | return print_bfs_bug(ret); |
2224 | 2313 | ||
2225 | if (!check_prev_add_irq(curr, prev, next)) | 2314 | if (!check_irq_usage(curr, prev, next)) |
2226 | return 0; | 2315 | return 0; |
2227 | 2316 | ||
2228 | /* | 2317 | /* |
@@ -2265,7 +2354,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, | |||
2265 | return print_bfs_bug(ret); | 2354 | return print_bfs_bug(ret); |
2266 | 2355 | ||
2267 | 2356 | ||
2268 | if (!trace->entries && !save(trace)) | 2357 | if (!trace->nr_entries && !save_trace(trace)) |
2269 | return 0; | 2358 | return 0; |
2270 | 2359 | ||
2271 | /* | 2360 | /* |
@@ -2297,14 +2386,9 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, | |||
2297 | static int | 2386 | static int |
2298 | check_prevs_add(struct task_struct *curr, struct held_lock *next) | 2387 | check_prevs_add(struct task_struct *curr, struct held_lock *next) |
2299 | { | 2388 | { |
2389 | struct lock_trace trace = { .nr_entries = 0 }; | ||
2300 | int depth = curr->lockdep_depth; | 2390 | int depth = curr->lockdep_depth; |
2301 | struct held_lock *hlock; | 2391 | struct held_lock *hlock; |
2302 | struct stack_trace trace = { | ||
2303 | .nr_entries = 0, | ||
2304 | .max_entries = 0, | ||
2305 | .entries = NULL, | ||
2306 | .skip = 0, | ||
2307 | }; | ||
2308 | 2392 | ||
2309 | /* | 2393 | /* |
2310 | * Debugging checks. | 2394 | * Debugging checks. |
@@ -2330,7 +2414,8 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) | |||
2330 | * added: | 2414 | * added: |
2331 | */ | 2415 | */ |
2332 | if (hlock->read != 2 && hlock->check) { | 2416 | if (hlock->read != 2 && hlock->check) { |
2333 | int ret = check_prev_add(curr, hlock, next, distance, &trace, save_trace); | 2417 | int ret = check_prev_add(curr, hlock, next, distance, |
2418 | &trace); | ||
2334 | if (!ret) | 2419 | if (!ret) |
2335 | return 0; | 2420 | return 0; |
2336 | 2421 | ||
@@ -2731,6 +2816,10 @@ static inline int validate_chain(struct task_struct *curr, | |||
2731 | { | 2816 | { |
2732 | return 1; | 2817 | return 1; |
2733 | } | 2818 | } |
2819 | |||
2820 | static void print_lock_trace(struct lock_trace *trace, unsigned int spaces) | ||
2821 | { | ||
2822 | } | ||
2734 | #endif | 2823 | #endif |
2735 | 2824 | ||
2736 | /* | 2825 | /* |
@@ -2784,6 +2873,12 @@ static void check_chain_key(struct task_struct *curr) | |||
2784 | #endif | 2873 | #endif |
2785 | } | 2874 | } |
2786 | 2875 | ||
2876 | static int mark_lock(struct task_struct *curr, struct held_lock *this, | ||
2877 | enum lock_usage_bit new_bit); | ||
2878 | |||
2879 | #if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) | ||
2880 | |||
2881 | |||
2787 | static void | 2882 | static void |
2788 | print_usage_bug_scenario(struct held_lock *lock) | 2883 | print_usage_bug_scenario(struct held_lock *lock) |
2789 | { | 2884 | { |
@@ -2827,7 +2922,7 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, | |||
2827 | print_lock(this); | 2922 | print_lock(this); |
2828 | 2923 | ||
2829 | pr_warn("{%s} state was registered at:\n", usage_str[prev_bit]); | 2924 | pr_warn("{%s} state was registered at:\n", usage_str[prev_bit]); |
2830 | print_stack_trace(hlock_class(this)->usage_traces + prev_bit, 1); | 2925 | print_lock_trace(hlock_class(this)->usage_traces + prev_bit, 1); |
2831 | 2926 | ||
2832 | print_irqtrace_events(curr); | 2927 | print_irqtrace_events(curr); |
2833 | pr_warn("\nother info that might help us debug this:\n"); | 2928 | pr_warn("\nother info that might help us debug this:\n"); |
@@ -2853,10 +2948,6 @@ valid_state(struct task_struct *curr, struct held_lock *this, | |||
2853 | return 1; | 2948 | return 1; |
2854 | } | 2949 | } |
2855 | 2950 | ||
2856 | static int mark_lock(struct task_struct *curr, struct held_lock *this, | ||
2857 | enum lock_usage_bit new_bit); | ||
2858 | |||
2859 | #if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) | ||
2860 | 2951 | ||
2861 | /* | 2952 | /* |
2862 | * print irq inversion bug: | 2953 | * print irq inversion bug: |
@@ -2936,7 +3027,7 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this, | |||
2936 | 3027 | ||
2937 | root.parent = NULL; | 3028 | root.parent = NULL; |
2938 | root.class = hlock_class(this); | 3029 | root.class = hlock_class(this); |
2939 | ret = find_usage_forwards(&root, bit, &target_entry); | 3030 | ret = find_usage_forwards(&root, lock_flag(bit), &target_entry); |
2940 | if (ret < 0) | 3031 | if (ret < 0) |
2941 | return print_bfs_bug(ret); | 3032 | return print_bfs_bug(ret); |
2942 | if (ret == 1) | 3033 | if (ret == 1) |
@@ -2960,7 +3051,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this, | |||
2960 | 3051 | ||
2961 | root.parent = NULL; | 3052 | root.parent = NULL; |
2962 | root.class = hlock_class(this); | 3053 | root.class = hlock_class(this); |
2963 | ret = find_usage_backwards(&root, bit, &target_entry); | 3054 | ret = find_usage_backwards(&root, lock_flag(bit), &target_entry); |
2964 | if (ret < 0) | 3055 | if (ret < 0) |
2965 | return print_bfs_bug(ret); | 3056 | return print_bfs_bug(ret); |
2966 | if (ret == 1) | 3057 | if (ret == 1) |
@@ -3015,7 +3106,7 @@ static int (*state_verbose_f[])(struct lock_class *class) = { | |||
3015 | static inline int state_verbose(enum lock_usage_bit bit, | 3106 | static inline int state_verbose(enum lock_usage_bit bit, |
3016 | struct lock_class *class) | 3107 | struct lock_class *class) |
3017 | { | 3108 | { |
3018 | return state_verbose_f[bit >> 2](class); | 3109 | return state_verbose_f[bit >> LOCK_USAGE_DIR_MASK](class); |
3019 | } | 3110 | } |
3020 | 3111 | ||
3021 | typedef int (*check_usage_f)(struct task_struct *, struct held_lock *, | 3112 | typedef int (*check_usage_f)(struct task_struct *, struct held_lock *, |
@@ -3157,7 +3248,7 @@ void lockdep_hardirqs_on(unsigned long ip) | |||
3157 | /* | 3248 | /* |
3158 | * See the fine text that goes along with this variable definition. | 3249 | * See the fine text that goes along with this variable definition. |
3159 | */ | 3250 | */ |
3160 | if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled))) | 3251 | if (DEBUG_LOCKS_WARN_ON(early_boot_irqs_disabled)) |
3161 | return; | 3252 | return; |
3162 | 3253 | ||
3163 | /* | 3254 | /* |
@@ -4689,8 +4780,8 @@ static void free_zapped_rcu(struct rcu_head *ch) | |||
4689 | return; | 4780 | return; |
4690 | 4781 | ||
4691 | raw_local_irq_save(flags); | 4782 | raw_local_irq_save(flags); |
4692 | if (!graph_lock()) | 4783 | arch_spin_lock(&lockdep_lock); |
4693 | goto out_irq; | 4784 | current->lockdep_recursion = 1; |
4694 | 4785 | ||
4695 | /* closed head */ | 4786 | /* closed head */ |
4696 | pf = delayed_free.pf + (delayed_free.index ^ 1); | 4787 | pf = delayed_free.pf + (delayed_free.index ^ 1); |
@@ -4702,8 +4793,8 @@ static void free_zapped_rcu(struct rcu_head *ch) | |||
4702 | */ | 4793 | */ |
4703 | call_rcu_zapped(delayed_free.pf + delayed_free.index); | 4794 | call_rcu_zapped(delayed_free.pf + delayed_free.index); |
4704 | 4795 | ||
4705 | graph_unlock(); | 4796 | current->lockdep_recursion = 0; |
4706 | out_irq: | 4797 | arch_spin_unlock(&lockdep_lock); |
4707 | raw_local_irq_restore(flags); | 4798 | raw_local_irq_restore(flags); |
4708 | } | 4799 | } |
4709 | 4800 | ||
@@ -4744,21 +4835,17 @@ static void lockdep_free_key_range_reg(void *start, unsigned long size) | |||
4744 | { | 4835 | { |
4745 | struct pending_free *pf; | 4836 | struct pending_free *pf; |
4746 | unsigned long flags; | 4837 | unsigned long flags; |
4747 | int locked; | ||
4748 | 4838 | ||
4749 | init_data_structures_once(); | 4839 | init_data_structures_once(); |
4750 | 4840 | ||
4751 | raw_local_irq_save(flags); | 4841 | raw_local_irq_save(flags); |
4752 | locked = graph_lock(); | 4842 | arch_spin_lock(&lockdep_lock); |
4753 | if (!locked) | 4843 | current->lockdep_recursion = 1; |
4754 | goto out_irq; | ||
4755 | |||
4756 | pf = get_pending_free(); | 4844 | pf = get_pending_free(); |
4757 | __lockdep_free_key_range(pf, start, size); | 4845 | __lockdep_free_key_range(pf, start, size); |
4758 | call_rcu_zapped(pf); | 4846 | call_rcu_zapped(pf); |
4759 | 4847 | current->lockdep_recursion = 0; | |
4760 | graph_unlock(); | 4848 | arch_spin_unlock(&lockdep_lock); |
4761 | out_irq: | ||
4762 | raw_local_irq_restore(flags); | 4849 | raw_local_irq_restore(flags); |
4763 | 4850 | ||
4764 | /* | 4851 | /* |
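
The lockdep.c changes above replace the per-state check_usage()/check_prev_add_irq() loop with a single check_irq_usage() that works on accumulated usage masks: gather backward usages, build the exclusive forward mask, then narrow a bad match back down to one pair of bits. The mask algebra relies on the bit layout documented in the new comment (bit0 is the read flag, bit1 the USED_IN/ENABLED direction, the remaining bits the state), which is why shifting a whole mask by LOCK_USAGE_DIR_MASK flips the direction of every state at once. A stand-alone sketch of that encoding, assuming LOCK_USAGE_READ_MASK == 1 and LOCK_USAGE_DIR_MASK == 2 (those definitions live in lockdep_internals.h and are not part of this hunk):

	/*
	 * Sketch of the lockdep usage-bit encoding described above.
	 * ASSUMED: LOCK_USAGE_READ_MASK == 1, LOCK_USAGE_DIR_MASK == 2,
	 * matching the "bit0 = read, bit1 = direction" comment.
	 */
	#include <stdio.h>

	#define LOCK_USAGE_READ_MASK	1
	#define LOCK_USAGE_DIR_MASK	2
	#define LOCK_USAGE_STATE_MASK	(~(LOCK_USAGE_READ_MASK | LOCK_USAGE_DIR_MASK))

	/* Same computation as exclusive_bit() above: keep the state bits,
	 * flip the direction bit, drop the read bit. */
	static int exclusive_bit(int new_bit)
	{
		int state = new_bit & LOCK_USAGE_STATE_MASK;
		int dir = new_bit & LOCK_USAGE_DIR_MASK;

		return state | (dir ^ LOCK_USAGE_DIR_MASK);
	}

	int main(void)
	{
		/*
		 * Bit numbers for one state (state value 0):
		 * 0 = USED_IN, 1 = USED_IN_READ, 2 = ENABLED, 3 = ENABLED_READ.
		 */
		int bit;

		for (bit = 0; bit < 4; bit++)
			printf("bit %d (read=%d, enabled-dir=%d) -> exclusive bit %d\n",
			       bit, bit & LOCK_USAGE_READ_MASK,
			       !!(bit & LOCK_USAGE_DIR_MASK), exclusive_bit(bit));
		return 0;
	}

The same plus/minus-2 relationship between USED_IN and ENABLED bit numbers is what lets invert_dir_mask() convert whole usage masks with two shifts instead of iterating over states.
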
diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index d4c197425f68..150ec3f0c5b5 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h | |||
@@ -42,13 +42,35 @@ enum { | |||
42 | __LOCKF(USED) | 42 | __LOCKF(USED) |
43 | }; | 43 | }; |
44 | 44 | ||
45 | #define LOCKF_ENABLED_IRQ (LOCKF_ENABLED_HARDIRQ | LOCKF_ENABLED_SOFTIRQ) | 45 | #define LOCKDEP_STATE(__STATE) LOCKF_ENABLED_##__STATE | |
46 | #define LOCKF_USED_IN_IRQ (LOCKF_USED_IN_HARDIRQ | LOCKF_USED_IN_SOFTIRQ) | 46 | static const unsigned long LOCKF_ENABLED_IRQ = |
47 | #include "lockdep_states.h" | ||
48 | 0; | ||
49 | #undef LOCKDEP_STATE | ||
50 | |||
51 | #define LOCKDEP_STATE(__STATE) LOCKF_USED_IN_##__STATE | | ||
52 | static const unsigned long LOCKF_USED_IN_IRQ = | ||
53 | #include "lockdep_states.h" | ||
54 | 0; | ||
55 | #undef LOCKDEP_STATE | ||
56 | |||
57 | #define LOCKDEP_STATE(__STATE) LOCKF_ENABLED_##__STATE##_READ | | ||
58 | static const unsigned long LOCKF_ENABLED_IRQ_READ = | ||
59 | #include "lockdep_states.h" | ||
60 | 0; | ||
61 | #undef LOCKDEP_STATE | ||
62 | |||
63 | #define LOCKDEP_STATE(__STATE) LOCKF_USED_IN_##__STATE##_READ | | ||
64 | static const unsigned long LOCKF_USED_IN_IRQ_READ = | ||
65 | #include "lockdep_states.h" | ||
66 | 0; | ||
67 | #undef LOCKDEP_STATE | ||
68 | |||
69 | #define LOCKF_ENABLED_IRQ_ALL (LOCKF_ENABLED_IRQ | LOCKF_ENABLED_IRQ_READ) | ||
70 | #define LOCKF_USED_IN_IRQ_ALL (LOCKF_USED_IN_IRQ | LOCKF_USED_IN_IRQ_READ) | ||
47 | 71 | ||
48 | #define LOCKF_ENABLED_IRQ_READ \ | 72 | #define LOCKF_IRQ (LOCKF_ENABLED_IRQ | LOCKF_USED_IN_IRQ) |
49 | (LOCKF_ENABLED_HARDIRQ_READ | LOCKF_ENABLED_SOFTIRQ_READ) | 73 | #define LOCKF_IRQ_READ (LOCKF_ENABLED_IRQ_READ | LOCKF_USED_IN_IRQ_READ) |
50 | #define LOCKF_USED_IN_IRQ_READ \ | ||
51 | (LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ) | ||
52 | 74 | ||
53 | /* | 75 | /* |
54 | * CONFIG_LOCKDEP_SMALL is defined for sparc. Sparc requires .text, | 76 | * CONFIG_LOCKDEP_SMALL is defined for sparc. Sparc requires .text, |
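
The lockdep_internals.h hunk replaces the hand-written LOCKF_*_IRQ masks with ones generated from lockdep_states.h, so adding a state automatically extends every mask. Each inclusion expands LOCKDEP_STATE() to a "flag |" term, and the trailing "0;" closes the OR chain. A tiny stand-alone sketch of that construction with invented names and flag values:

	/*
	 * Sketch of the "OR-chain terminated by 0" trick used above for the
	 * LOCKF_*_IRQ masks; the state list and flag values are made up.
	 */
	#include <stdio.h>

	/* Stand-in for lockdep_states.h: one X(state) per line. */
	#define STATE_LIST(X)	\
		X(HARDIRQ)	\
		X(SOFTIRQ)

	#define FLAG_USED_IN_HARDIRQ	0x01
	#define FLAG_USED_IN_SOFTIRQ	0x04

	/* Each expansion contributes "FLAG_USED_IN_<state> |"; the final 0
	 * terminates the expression, like the "#include ... 0;" pattern. */
	#define OR_USED_IN(s)	FLAG_USED_IN_ ## s |
	static const unsigned long FLAGS_USED_IN_IRQ = STATE_LIST(OR_USED_IN) 0;

	int main(void)
	{
		printf("FLAGS_USED_IN_IRQ = 0x%lx\n", FLAGS_USED_IN_IRQ);	/* 0x5 */
		return 0;
	}
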
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index ad40a2617063..80a463d31a8d 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c | |||
@@ -829,7 +829,9 @@ static void lock_torture_cleanup(void) | |||
829 | "End of test: SUCCESS"); | 829 | "End of test: SUCCESS"); |
830 | 830 | ||
831 | kfree(cxt.lwsa); | 831 | kfree(cxt.lwsa); |
832 | cxt.lwsa = NULL; | ||
832 | kfree(cxt.lrsa); | 833 | kfree(cxt.lrsa); |
834 | cxt.lrsa = NULL; | ||
833 | 835 | ||
834 | end: | 836 | end: |
835 | torture_cleanup_end(); | 837 | torture_cleanup_end(); |
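
The locktorture hunk only clears cxt.lwsa and cxt.lrsa after freeing them, presumably so that a later cleanup or failure path reaching the same kfree() calls again sees NULL (which kfree ignores) rather than a stale pointer. A minimal user-space analogue of the idiom, with invented names; free(NULL) is likewise defined to do nothing:

	/*
	 * User-space analogue of the kfree-then-NULL idiom added above:
	 * clearing the pointer turns a second cleanup pass into a no-op
	 * instead of a double free. Illustrative only.
	 */
	#include <stdlib.h>

	struct ctx {
		long *wsa;
		long *rsa;
	};

	static void ctx_cleanup(struct ctx *c)
	{
		free(c->wsa);		/* free(NULL) does nothing */
		c->wsa = NULL;
		free(c->rsa);
		c->rsa = NULL;
	}

	int main(void)
	{
		struct ctx c = {
			.wsa = calloc(4, sizeof(long)),
			.rsa = calloc(4, sizeof(long)),
		};

		ctx_cleanup(&c);
		ctx_cleanup(&c);	/* safe: pointers were cleared on the first pass */
		return 0;
	}
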
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index 883cf1b92d90..f17dad99eec8 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c | |||
@@ -7,6 +7,8 @@ | |||
7 | #include <linux/sched.h> | 7 | #include <linux/sched.h> |
8 | #include <linux/errno.h> | 8 | #include <linux/errno.h> |
9 | 9 | ||
10 | #include "rwsem.h" | ||
11 | |||
10 | int __percpu_init_rwsem(struct percpu_rw_semaphore *sem, | 12 | int __percpu_init_rwsem(struct percpu_rw_semaphore *sem, |
11 | const char *name, struct lock_class_key *rwsem_key) | 13 | const char *name, struct lock_class_key *rwsem_key) |
12 | { | 14 | { |
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 5e9247dc2515..e14b32c69639 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c | |||
@@ -395,7 +395,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) | |||
395 | * 0,1,0 -> 0,0,1 | 395 | * 0,1,0 -> 0,0,1 |
396 | */ | 396 | */ |
397 | clear_pending_set_locked(lock); | 397 | clear_pending_set_locked(lock); |
398 | qstat_inc(qstat_lock_pending, true); | 398 | lockevent_inc(lock_pending); |
399 | return; | 399 | return; |
400 | 400 | ||
401 | /* | 401 | /* |
@@ -403,7 +403,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) | |||
403 | * queuing. | 403 | * queuing. |
404 | */ | 404 | */ |
405 | queue: | 405 | queue: |
406 | qstat_inc(qstat_lock_slowpath, true); | 406 | lockevent_inc(lock_slowpath); |
407 | pv_queue: | 407 | pv_queue: |
408 | node = this_cpu_ptr(&qnodes[0].mcs); | 408 | node = this_cpu_ptr(&qnodes[0].mcs); |
409 | idx = node->count++; | 409 | idx = node->count++; |
@@ -419,7 +419,7 @@ pv_queue: | |||
419 | * simple enough. | 419 | * simple enough. |
420 | */ | 420 | */ |
421 | if (unlikely(idx >= MAX_NODES)) { | 421 | if (unlikely(idx >= MAX_NODES)) { |
422 | qstat_inc(qstat_lock_no_node, true); | 422 | lockevent_inc(lock_no_node); |
423 | while (!queued_spin_trylock(lock)) | 423 | while (!queued_spin_trylock(lock)) |
424 | cpu_relax(); | 424 | cpu_relax(); |
425 | goto release; | 425 | goto release; |
@@ -430,7 +430,7 @@ pv_queue: | |||
430 | /* | 430 | /* |
431 | * Keep counts of non-zero index values: | 431 | * Keep counts of non-zero index values: |
432 | */ | 432 | */ |
433 | qstat_inc(qstat_lock_use_node2 + idx - 1, idx); | 433 | lockevent_cond_inc(lock_use_node2 + idx - 1, idx); |
434 | 434 | ||
435 | /* | 435 | /* |
436 | * Ensure that we increment the head node->count before initialising | 436 | * Ensure that we increment the head node->count before initialising |
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 8f36c27c1794..89bab079e7a4 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h | |||
@@ -89,7 +89,7 @@ static inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock) | |||
89 | 89 | ||
90 | if (!(val & _Q_LOCKED_PENDING_MASK) && | 90 | if (!(val & _Q_LOCKED_PENDING_MASK) && |
91 | (cmpxchg_acquire(&lock->locked, 0, _Q_LOCKED_VAL) == 0)) { | 91 | (cmpxchg_acquire(&lock->locked, 0, _Q_LOCKED_VAL) == 0)) { |
92 | qstat_inc(qstat_pv_lock_stealing, true); | 92 | lockevent_inc(pv_lock_stealing); |
93 | return true; | 93 | return true; |
94 | } | 94 | } |
95 | if (!(val & _Q_TAIL_MASK) || (val & _Q_PENDING_MASK)) | 95 | if (!(val & _Q_TAIL_MASK) || (val & _Q_PENDING_MASK)) |
@@ -219,7 +219,7 @@ static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node) | |||
219 | hopcnt++; | 219 | hopcnt++; |
220 | if (!cmpxchg(&he->lock, NULL, lock)) { | 220 | if (!cmpxchg(&he->lock, NULL, lock)) { |
221 | WRITE_ONCE(he->node, node); | 221 | WRITE_ONCE(he->node, node); |
222 | qstat_hop(hopcnt); | 222 | lockevent_pv_hop(hopcnt); |
223 | return &he->lock; | 223 | return &he->lock; |
224 | } | 224 | } |
225 | } | 225 | } |
@@ -320,8 +320,8 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev) | |||
320 | smp_store_mb(pn->state, vcpu_halted); | 320 | smp_store_mb(pn->state, vcpu_halted); |
321 | 321 | ||
322 | if (!READ_ONCE(node->locked)) { | 322 | if (!READ_ONCE(node->locked)) { |
323 | qstat_inc(qstat_pv_wait_node, true); | 323 | lockevent_inc(pv_wait_node); |
324 | qstat_inc(qstat_pv_wait_early, wait_early); | 324 | lockevent_cond_inc(pv_wait_early, wait_early); |
325 | pv_wait(&pn->state, vcpu_halted); | 325 | pv_wait(&pn->state, vcpu_halted); |
326 | } | 326 | } |
327 | 327 | ||
@@ -339,7 +339,8 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev) | |||
339 | * So it is better to spin for a while in the hope that the | 339 | * So it is better to spin for a while in the hope that the |
340 | * MCS lock will be released soon. | 340 | * MCS lock will be released soon. |
341 | */ | 341 | */ |
342 | qstat_inc(qstat_pv_spurious_wakeup, !READ_ONCE(node->locked)); | 342 | lockevent_cond_inc(pv_spurious_wakeup, |
343 | !READ_ONCE(node->locked)); | ||
343 | } | 344 | } |
344 | 345 | ||
345 | /* | 346 | /* |
@@ -416,7 +417,7 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) | |||
416 | /* | 417 | /* |
417 | * Tracking # of slowpath locking operations | 418 | * Tracking # of slowpath locking operations |
418 | */ | 419 | */ |
419 | qstat_inc(qstat_lock_slowpath, true); | 420 | lockevent_inc(lock_slowpath); |
420 | 421 | ||
421 | for (;; waitcnt++) { | 422 | for (;; waitcnt++) { |
422 | /* | 423 | /* |
@@ -464,8 +465,8 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) | |||
464 | } | 465 | } |
465 | } | 466 | } |
466 | WRITE_ONCE(pn->state, vcpu_hashed); | 467 | WRITE_ONCE(pn->state, vcpu_hashed); |
467 | qstat_inc(qstat_pv_wait_head, true); | 468 | lockevent_inc(pv_wait_head); |
468 | qstat_inc(qstat_pv_wait_again, waitcnt); | 469 | lockevent_cond_inc(pv_wait_again, waitcnt); |
469 | pv_wait(&lock->locked, _Q_SLOW_VAL); | 470 | pv_wait(&lock->locked, _Q_SLOW_VAL); |
470 | 471 | ||
471 | /* | 472 | /* |
@@ -528,7 +529,7 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked) | |||
528 | * vCPU is harmless other than the additional latency in completing | 529 | * vCPU is harmless other than the additional latency in completing |
529 | * the unlock. | 530 | * the unlock. |
530 | */ | 531 | */ |
531 | qstat_inc(qstat_pv_kick_unlock, true); | 532 | lockevent_inc(pv_kick_unlock); |
532 | pv_kick(node->cpu); | 533 | pv_kick(node->cpu); |
533 | } | 534 | } |
534 | 535 | ||
diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h index d73f85388d5c..54152670ff24 100644 --- a/kernel/locking/qspinlock_stat.h +++ b/kernel/locking/qspinlock_stat.h | |||
@@ -9,262 +9,105 @@ | |||
9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
10 | * GNU General Public License for more details. | 10 | * GNU General Public License for more details. |
11 | * | 11 | * |
12 | * Authors: Waiman Long <waiman.long@hpe.com> | 12 | * Authors: Waiman Long <longman@redhat.com> |
13 | */ | 13 | */ |
14 | 14 | ||
15 | /* | 15 | #include "lock_events.h" |
16 | * When queued spinlock statistical counters are enabled, the following | ||
17 | * debugfs files will be created for reporting the counter values: | ||
18 | * | ||
19 | * <debugfs>/qlockstat/ | ||
20 | * pv_hash_hops - average # of hops per hashing operation | ||
21 | * pv_kick_unlock - # of vCPU kicks issued at unlock time | ||
22 | * pv_kick_wake - # of vCPU kicks used for computing pv_latency_wake | ||
23 | * pv_latency_kick - average latency (ns) of vCPU kick operation | ||
24 | * pv_latency_wake - average latency (ns) from vCPU kick to wakeup | ||
25 | * pv_lock_stealing - # of lock stealing operations | ||
26 | * pv_spurious_wakeup - # of spurious wakeups in non-head vCPUs | ||
27 | * pv_wait_again - # of wait's after a queue head vCPU kick | ||
28 | * pv_wait_early - # of early vCPU wait's | ||
29 | * pv_wait_head - # of vCPU wait's at the queue head | ||
30 | * pv_wait_node - # of vCPU wait's at a non-head queue node | ||
31 | * lock_pending - # of locking operations via pending code | ||
32 | * lock_slowpath - # of locking operations via MCS lock queue | ||
33 | * lock_use_node2 - # of locking operations that use 2nd per-CPU node | ||
34 | * lock_use_node3 - # of locking operations that use 3rd per-CPU node | ||
35 | * lock_use_node4 - # of locking operations that use 4th per-CPU node | ||
36 | * lock_no_node - # of locking operations without using per-CPU node | ||
37 | * | ||
38 | * Subtracting lock_use_node[234] from lock_slowpath will give you | ||
39 | * lock_use_node1. | ||
40 | * | ||
41 | * Writing to the "reset_counters" file will reset all the above counter | ||
42 | * values. | ||
43 | * | ||
44 | * These statistical counters are implemented as per-cpu variables which are | ||
45 | * summed and computed whenever the corresponding debugfs files are read. This | ||
46 | * minimizes added overhead making the counters usable even in a production | ||
47 | * environment. | ||
48 | * | ||
49 | * There may be slight difference between pv_kick_wake and pv_kick_unlock. | ||
50 | */ | ||
51 | enum qlock_stats { | ||
52 | qstat_pv_hash_hops, | ||
53 | qstat_pv_kick_unlock, | ||
54 | qstat_pv_kick_wake, | ||
55 | qstat_pv_latency_kick, | ||
56 | qstat_pv_latency_wake, | ||
57 | qstat_pv_lock_stealing, | ||
58 | qstat_pv_spurious_wakeup, | ||
59 | qstat_pv_wait_again, | ||
60 | qstat_pv_wait_early, | ||
61 | qstat_pv_wait_head, | ||
62 | qstat_pv_wait_node, | ||
63 | qstat_lock_pending, | ||
64 | qstat_lock_slowpath, | ||
65 | qstat_lock_use_node2, | ||
66 | qstat_lock_use_node3, | ||
67 | qstat_lock_use_node4, | ||
68 | qstat_lock_no_node, | ||
69 | qstat_num, /* Total number of statistical counters */ | ||
70 | qstat_reset_cnts = qstat_num, | ||
71 | }; | ||
72 | 16 | ||
73 | #ifdef CONFIG_QUEUED_LOCK_STAT | 17 | #ifdef CONFIG_LOCK_EVENT_COUNTS |
18 | #ifdef CONFIG_PARAVIRT_SPINLOCKS | ||
74 | /* | 19 | /* |
75 | * Collect pvqspinlock statistics | 20 | * Collect pvqspinlock locking event counts |
76 | */ | 21 | */ |
77 | #include <linux/debugfs.h> | ||
78 | #include <linux/sched.h> | 22 | #include <linux/sched.h> |
79 | #include <linux/sched/clock.h> | 23 | #include <linux/sched/clock.h> |
80 | #include <linux/fs.h> | 24 | #include <linux/fs.h> |
81 | 25 | ||
82 | static const char * const qstat_names[qstat_num + 1] = { | 26 | #define EVENT_COUNT(ev) lockevents[LOCKEVENT_ ## ev] |
83 | [qstat_pv_hash_hops] = "pv_hash_hops", | ||
84 | [qstat_pv_kick_unlock] = "pv_kick_unlock", | ||
85 | [qstat_pv_kick_wake] = "pv_kick_wake", | ||
86 | [qstat_pv_spurious_wakeup] = "pv_spurious_wakeup", | ||
87 | [qstat_pv_latency_kick] = "pv_latency_kick", | ||
88 | [qstat_pv_latency_wake] = "pv_latency_wake", | ||
89 | [qstat_pv_lock_stealing] = "pv_lock_stealing", | ||
90 | [qstat_pv_wait_again] = "pv_wait_again", | ||
91 | [qstat_pv_wait_early] = "pv_wait_early", | ||
92 | [qstat_pv_wait_head] = "pv_wait_head", | ||
93 | [qstat_pv_wait_node] = "pv_wait_node", | ||
94 | [qstat_lock_pending] = "lock_pending", | ||
95 | [qstat_lock_slowpath] = "lock_slowpath", | ||
96 | [qstat_lock_use_node2] = "lock_use_node2", | ||
97 | [qstat_lock_use_node3] = "lock_use_node3", | ||
98 | [qstat_lock_use_node4] = "lock_use_node4", | ||
99 | [qstat_lock_no_node] = "lock_no_node", | ||
100 | [qstat_reset_cnts] = "reset_counters", | ||
101 | }; | ||
102 | 27 | ||
103 | /* | 28 | /* |
104 | * Per-cpu counters | 29 | * PV specific per-cpu counter |
105 | */ | 30 | */ |
106 | static DEFINE_PER_CPU(unsigned long, qstats[qstat_num]); | ||
107 | static DEFINE_PER_CPU(u64, pv_kick_time); | 31 | static DEFINE_PER_CPU(u64, pv_kick_time); |
108 | 32 | ||
109 | /* | 33 | /* |
110 | * Function to read and return the qlock statistical counter values | 34 | * Function to read and return the PV qspinlock counts. |
111 | * | 35 | * |
112 | * The following counters are handled specially: | 36 | * The following counters are handled specially: |
113 | * 1. qstat_pv_latency_kick | 37 | * 1. pv_latency_kick |
114 | * Average kick latency (ns) = pv_latency_kick/pv_kick_unlock | 38 | * Average kick latency (ns) = pv_latency_kick/pv_kick_unlock |
115 | * 2. qstat_pv_latency_wake | 39 | * 2. pv_latency_wake |
116 | * Average wake latency (ns) = pv_latency_wake/pv_kick_wake | 40 | * Average wake latency (ns) = pv_latency_wake/pv_kick_wake |
117 | * 3. qstat_pv_hash_hops | 41 | * 3. pv_hash_hops |
118 | * Average hops/hash = pv_hash_hops/pv_kick_unlock | 42 | * Average hops/hash = pv_hash_hops/pv_kick_unlock |
119 | */ | 43 | */ |
120 | static ssize_t qstat_read(struct file *file, char __user *user_buf, | 44 | ssize_t lockevent_read(struct file *file, char __user *user_buf, |
121 | size_t count, loff_t *ppos) | 45 | size_t count, loff_t *ppos) |
122 | { | 46 | { |
123 | char buf[64]; | 47 | char buf[64]; |
124 | int cpu, counter, len; | 48 | int cpu, id, len; |
125 | u64 stat = 0, kicks = 0; | 49 | u64 sum = 0, kicks = 0; |
126 | 50 | ||
127 | /* | 51 | /* |
128 | * Get the counter ID stored in file->f_inode->i_private | 52 | * Get the counter ID stored in file->f_inode->i_private |
129 | */ | 53 | */ |
130 | counter = (long)file_inode(file)->i_private; | 54 | id = (long)file_inode(file)->i_private; |
131 | 55 | ||
132 | if (counter >= qstat_num) | 56 | if (id >= lockevent_num) |
133 | return -EBADF; | 57 | return -EBADF; |
134 | 58 | ||
135 | for_each_possible_cpu(cpu) { | 59 | for_each_possible_cpu(cpu) { |
136 | stat += per_cpu(qstats[counter], cpu); | 60 | sum += per_cpu(lockevents[id], cpu); |
137 | /* | 61 | /* |
138 | * Need to sum additional counter for some of them | 62 | * Need to sum additional counters for some of them |
139 | */ | 63 | */ |
140 | switch (counter) { | 64 | switch (id) { |
141 | 65 | ||
142 | case qstat_pv_latency_kick: | 66 | case LOCKEVENT_pv_latency_kick: |
143 | case qstat_pv_hash_hops: | 67 | case LOCKEVENT_pv_hash_hops: |
144 | kicks += per_cpu(qstats[qstat_pv_kick_unlock], cpu); | 68 | kicks += per_cpu(EVENT_COUNT(pv_kick_unlock), cpu); |
145 | break; | 69 | break; |
146 | 70 | ||
147 | case qstat_pv_latency_wake: | 71 | case LOCKEVENT_pv_latency_wake: |
148 | kicks += per_cpu(qstats[qstat_pv_kick_wake], cpu); | 72 | kicks += per_cpu(EVENT_COUNT(pv_kick_wake), cpu); |
149 | break; | 73 | break; |
150 | } | 74 | } |
151 | } | 75 | } |
152 | 76 | ||
153 | if (counter == qstat_pv_hash_hops) { | 77 | if (id == LOCKEVENT_pv_hash_hops) { |
154 | u64 frac = 0; | 78 | u64 frac = 0; |
155 | 79 | ||
156 | if (kicks) { | 80 | if (kicks) { |
157 | frac = 100ULL * do_div(stat, kicks); | 81 | frac = 100ULL * do_div(sum, kicks); |
158 | frac = DIV_ROUND_CLOSEST_ULL(frac, kicks); | 82 | frac = DIV_ROUND_CLOSEST_ULL(frac, kicks); |
159 | } | 83 | } |
160 | 84 | ||
161 | /* | 85 | /* |
162 | * Return an X.XX decimal number | 86 | * Return an X.XX decimal number |
163 | */ | 87 | */ |
164 | len = snprintf(buf, sizeof(buf) - 1, "%llu.%02llu\n", stat, frac); | 88 | len = snprintf(buf, sizeof(buf) - 1, "%llu.%02llu\n", |
89 | sum, frac); | ||
165 | } else { | 90 | } else { |
166 | /* | 91 | /* |
167 | * Round to the nearest ns | 92 | * Round to the nearest ns |
168 | */ | 93 | */ |
169 | if ((counter == qstat_pv_latency_kick) || | 94 | if ((id == LOCKEVENT_pv_latency_kick) || |
170 | (counter == qstat_pv_latency_wake)) { | 95 | (id == LOCKEVENT_pv_latency_wake)) { |
171 | if (kicks) | 96 | if (kicks) |
172 | stat = DIV_ROUND_CLOSEST_ULL(stat, kicks); | 97 | sum = DIV_ROUND_CLOSEST_ULL(sum, kicks); |
173 | } | 98 | } |
174 | len = snprintf(buf, sizeof(buf) - 1, "%llu\n", stat); | 99 | len = snprintf(buf, sizeof(buf) - 1, "%llu\n", sum); |
175 | } | 100 | } |
176 | 101 | ||
177 | return simple_read_from_buffer(user_buf, count, ppos, buf, len); | 102 | return simple_read_from_buffer(user_buf, count, ppos, buf, len); |
178 | } | 103 | } |
179 | 104 | ||
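The "X.XX" output above is produced by splitting sum/kicks into a quotient and a scaled remainder. A minimal, self-contained sketch of that arithmetic follows; the helper name format_avg() is hypothetical and not part of this patch, it simply mirrors what lockevent_read() does inline:

/* Hypothetical helper mirroring the average formatting in lockevent_read():
 * do_div() leaves the quotient in 'sum' and returns the remainder, which is
 * then scaled to two decimal places. */
static int format_avg(char *buf, size_t len, u64 sum, u64 kicks)
{
	u64 frac = 0;

	if (kicks) {
		frac = 100ULL * do_div(sum, kicks);	/* sum becomes sum / kicks */
		frac = DIV_ROUND_CLOSEST_ULL(frac, kicks);
	}
	return snprintf(buf, len, "%llu.%02llu\n", sum, frac);
}

So a pv_hash_hops sum of 250 over 100 unlock kicks would be reported as "2.50".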
180 | /* | 105 | /* |
181 | * Function to handle write request | ||
182 | * | ||
183 | * When counter = reset_cnts, reset all the counter values. | ||
184 | * Since the counter updates aren't atomic, the resetting is done twice | ||
185 | * to make sure that the counters are very likely to be all cleared. | ||
186 | */ | ||
187 | static ssize_t qstat_write(struct file *file, const char __user *user_buf, | ||
188 | size_t count, loff_t *ppos) | ||
189 | { | ||
190 | int cpu; | ||
191 | |||
192 | /* | ||
193 | * Get the counter ID stored in file->f_inode->i_private | ||
194 | */ | ||
195 | if ((long)file_inode(file)->i_private != qstat_reset_cnts) | ||
196 | return count; | ||
197 | |||
198 | for_each_possible_cpu(cpu) { | ||
199 | int i; | ||
200 | unsigned long *ptr = per_cpu_ptr(qstats, cpu); | ||
201 | |||
202 | for (i = 0 ; i < qstat_num; i++) | ||
203 | WRITE_ONCE(ptr[i], 0); | ||
204 | } | ||
205 | return count; | ||
206 | } | ||
207 | |||
208 | /* | ||
209 | * Debugfs data structures | ||
210 | */ | ||
211 | static const struct file_operations fops_qstat = { | ||
212 | .read = qstat_read, | ||
213 | .write = qstat_write, | ||
214 | .llseek = default_llseek, | ||
215 | }; | ||
216 | |||
217 | /* | ||
218 | * Initialize debugfs for the qspinlock statistical counters | ||
219 | */ | ||
220 | static int __init init_qspinlock_stat(void) | ||
221 | { | ||
222 | struct dentry *d_qstat = debugfs_create_dir("qlockstat", NULL); | ||
223 | int i; | ||
224 | |||
225 | if (!d_qstat) | ||
226 | goto out; | ||
227 | |||
228 | /* | ||
229 | * Create the debugfs files | ||
230 | * | ||
231 | * As reading from and writing to the stat files can be slow, only | ||
232 | * root is allowed to do the read/write to limit the impact on system | ||
233 | * performance. | ||
234 | */ | ||
235 | for (i = 0; i < qstat_num; i++) | ||
236 | if (!debugfs_create_file(qstat_names[i], 0400, d_qstat, | ||
237 | (void *)(long)i, &fops_qstat)) | ||
238 | goto fail_undo; | ||
239 | |||
240 | if (!debugfs_create_file(qstat_names[qstat_reset_cnts], 0200, d_qstat, | ||
241 | (void *)(long)qstat_reset_cnts, &fops_qstat)) | ||
242 | goto fail_undo; | ||
243 | |||
244 | return 0; | ||
245 | fail_undo: | ||
246 | debugfs_remove_recursive(d_qstat); | ||
247 | out: | ||
248 | pr_warn("Could not create 'qlockstat' debugfs entries\n"); | ||
249 | return -ENOMEM; | ||
250 | } | ||
251 | fs_initcall(init_qspinlock_stat); | ||
252 | |||
253 | /* | ||
254 | * Increment the PV qspinlock statistical counters | ||
255 | */ | ||
256 | static inline void qstat_inc(enum qlock_stats stat, bool cond) | ||
257 | { | ||
258 | if (cond) | ||
259 | this_cpu_inc(qstats[stat]); | ||
260 | } | ||
261 | |||
262 | /* | ||
263 | * PV hash hop count | 106 | * PV hash hop count |
264 | */ | 107 | */ |
265 | static inline void qstat_hop(int hopcnt) | 108 | static inline void lockevent_pv_hop(int hopcnt) |
266 | { | 109 | { |
267 | this_cpu_add(qstats[qstat_pv_hash_hops], hopcnt); | 110 | this_cpu_add(EVENT_COUNT(pv_hash_hops), hopcnt); |
268 | } | 111 | } |
269 | 112 | ||
270 | /* | 113 | /* |
@@ -276,7 +119,7 @@ static inline void __pv_kick(int cpu) | |||
276 | 119 | ||
277 | per_cpu(pv_kick_time, cpu) = start; | 120 | per_cpu(pv_kick_time, cpu) = start; |
278 | pv_kick(cpu); | 121 | pv_kick(cpu); |
279 | this_cpu_add(qstats[qstat_pv_latency_kick], sched_clock() - start); | 122 | this_cpu_add(EVENT_COUNT(pv_latency_kick), sched_clock() - start); |
280 | } | 123 | } |
281 | 124 | ||
282 | /* | 125 | /* |
@@ -289,18 +132,19 @@ static inline void __pv_wait(u8 *ptr, u8 val) | |||
289 | *pkick_time = 0; | 132 | *pkick_time = 0; |
290 | pv_wait(ptr, val); | 133 | pv_wait(ptr, val); |
291 | if (*pkick_time) { | 134 | if (*pkick_time) { |
292 | this_cpu_add(qstats[qstat_pv_latency_wake], | 135 | this_cpu_add(EVENT_COUNT(pv_latency_wake), |
293 | sched_clock() - *pkick_time); | 136 | sched_clock() - *pkick_time); |
294 | qstat_inc(qstat_pv_kick_wake, true); | 137 | lockevent_inc(pv_kick_wake); |
295 | } | 138 | } |
296 | } | 139 | } |
297 | 140 | ||
298 | #define pv_kick(c) __pv_kick(c) | 141 | #define pv_kick(c) __pv_kick(c) |
299 | #define pv_wait(p, v) __pv_wait(p, v) | 142 | #define pv_wait(p, v) __pv_wait(p, v) |
300 | 143 | ||
301 | #else /* CONFIG_QUEUED_LOCK_STAT */ | 144 | #endif /* CONFIG_PARAVIRT_SPINLOCKS */ |
145 | |||
146 | #else /* CONFIG_LOCK_EVENT_COUNTS */ | ||
302 | 147 | ||
303 | static inline void qstat_inc(enum qlock_stats stat, bool cond) { } | 148 | static inline void lockevent_pv_hop(int hopcnt) { } |
304 | static inline void qstat_hop(int hopcnt) { } | ||
305 | 149 | ||
306 | #endif /* CONFIG_QUEUED_LOCK_STAT */ | 150 | #endif /* CONFIG_LOCK_EVENT_COUNTS */ |
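The lockevent_inc()/lockevent_cond_inc()/EVENT_COUNT() helpers and the lockevents[] per-cpu array used above come from the shared lock_events.h layer this series introduces. Roughly, that interface has the shape sketched below; treat it as an approximation for orientation, not a verbatim copy of the header, and the event IDs shown are just examples:

/* Approximate shape of the lock_events.h interface assumed above. */
enum lock_events {
	LOCKEVENT_pv_kick_unlock,		/* a few example event IDs */
	LOCKEVENT_pv_kick_wake,
	LOCKEVENT_pv_latency_kick,
	LOCKEVENT_pv_latency_wake,
	lockevent_num,				/* total number of events */
};

DECLARE_PER_CPU(unsigned long, lockevents[lockevent_num]);

#define EVENT_COUNT(ev)		lockevents[LOCKEVENT_ ## ev]

/* Increment an event counter unconditionally, or only when 'cond' holds. */
#define lockevent_inc(ev)	this_cpu_inc(EVENT_COUNT(ev))
#define lockevent_cond_inc(ev, cond)			\
do {							\
	if (cond)					\
		this_cpu_inc(EVENT_COUNT(ev));		\
} while (0)

With CONFIG_LOCK_EVENT_COUNTS disabled these are expected to compile away to nothing, which is why the #else branch above only needs an empty lockevent_pv_hop().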
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c deleted file mode 100644 index a7ffb2a96ede..000000000000 --- a/kernel/locking/rwsem-spinlock.c +++ /dev/null | |||
@@ -1,339 +0,0 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | /* rwsem-spinlock.c: R/W semaphores: contention handling functions for | ||
3 | * generic spinlock implementation | ||
4 | * | ||
5 | * Copyright (c) 2001 David Howells (dhowells@redhat.com). | ||
6 | * - Derived partially from idea by Andrea Arcangeli <andrea@suse.de> | ||
7 | * - Derived also from comments by Linus | ||
8 | */ | ||
9 | #include <linux/rwsem.h> | ||
10 | #include <linux/sched/signal.h> | ||
11 | #include <linux/sched/debug.h> | ||
12 | #include <linux/export.h> | ||
13 | |||
14 | enum rwsem_waiter_type { | ||
15 | RWSEM_WAITING_FOR_WRITE, | ||
16 | RWSEM_WAITING_FOR_READ | ||
17 | }; | ||
18 | |||
19 | struct rwsem_waiter { | ||
20 | struct list_head list; | ||
21 | struct task_struct *task; | ||
22 | enum rwsem_waiter_type type; | ||
23 | }; | ||
24 | |||
25 | int rwsem_is_locked(struct rw_semaphore *sem) | ||
26 | { | ||
27 | int ret = 1; | ||
28 | unsigned long flags; | ||
29 | |||
30 | if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) { | ||
31 | ret = (sem->count != 0); | ||
32 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); | ||
33 | } | ||
34 | return ret; | ||
35 | } | ||
36 | EXPORT_SYMBOL(rwsem_is_locked); | ||
37 | |||
38 | /* | ||
39 | * initialise the semaphore | ||
40 | */ | ||
41 | void __init_rwsem(struct rw_semaphore *sem, const char *name, | ||
42 | struct lock_class_key *key) | ||
43 | { | ||
44 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
45 | /* | ||
46 | * Make sure we are not reinitializing a held semaphore: | ||
47 | */ | ||
48 | debug_check_no_locks_freed((void *)sem, sizeof(*sem)); | ||
49 | lockdep_init_map(&sem->dep_map, name, key, 0); | ||
50 | #endif | ||
51 | sem->count = 0; | ||
52 | raw_spin_lock_init(&sem->wait_lock); | ||
53 | INIT_LIST_HEAD(&sem->wait_list); | ||
54 | } | ||
55 | EXPORT_SYMBOL(__init_rwsem); | ||
56 | |||
57 | /* | ||
58 | * handle the lock release when there are processes blocked on it that can now run | ||
59 | * - if we come here, then: | ||
60 | * - the 'active count' _reached_ zero | ||
61 | * - the 'waiting count' is non-zero | ||
62 | * - the spinlock must be held by the caller | ||
63 | * - woken waiter blocks are discarded from the list after having their task zeroed | ||
64 | * - writers are only woken if wakewrite is non-zero | ||
65 | */ | ||
66 | static inline struct rw_semaphore * | ||
67 | __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) | ||
68 | { | ||
69 | struct rwsem_waiter *waiter; | ||
70 | struct task_struct *tsk; | ||
71 | int woken; | ||
72 | |||
73 | waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); | ||
74 | |||
75 | if (waiter->type == RWSEM_WAITING_FOR_WRITE) { | ||
76 | if (wakewrite) | ||
77 | /* Wake up a writer. Note that we do not grant it the | ||
78 | * lock - it will have to acquire it when it runs. */ | ||
79 | wake_up_process(waiter->task); | ||
80 | goto out; | ||
81 | } | ||
82 | |||
83 | /* grant an infinite number of read locks to the front of the queue */ | ||
84 | woken = 0; | ||
85 | do { | ||
86 | struct list_head *next = waiter->list.next; | ||
87 | |||
88 | list_del(&waiter->list); | ||
89 | tsk = waiter->task; | ||
90 | /* | ||
91 | * Make sure we do not wakeup the next reader before | ||
92 | * setting the nil condition to grant the next reader; | ||
93 | * otherwise we could miss the wakeup on the other | ||
94 | * side and end up sleeping again. See the pairing | ||
95 | * in rwsem_down_read_failed(). | ||
96 | */ | ||
97 | smp_mb(); | ||
98 | waiter->task = NULL; | ||
99 | wake_up_process(tsk); | ||
100 | put_task_struct(tsk); | ||
101 | woken++; | ||
102 | if (next == &sem->wait_list) | ||
103 | break; | ||
104 | waiter = list_entry(next, struct rwsem_waiter, list); | ||
105 | } while (waiter->type != RWSEM_WAITING_FOR_WRITE); | ||
106 | |||
107 | sem->count += woken; | ||
108 | |||
109 | out: | ||
110 | return sem; | ||
111 | } | ||
112 | |||
113 | /* | ||
114 | * wake a single writer | ||
115 | */ | ||
116 | static inline struct rw_semaphore * | ||
117 | __rwsem_wake_one_writer(struct rw_semaphore *sem) | ||
118 | { | ||
119 | struct rwsem_waiter *waiter; | ||
120 | |||
121 | waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); | ||
122 | wake_up_process(waiter->task); | ||
123 | |||
124 | return sem; | ||
125 | } | ||
126 | |||
127 | /* | ||
128 | * get a read lock on the semaphore | ||
129 | */ | ||
130 | int __sched __down_read_common(struct rw_semaphore *sem, int state) | ||
131 | { | ||
132 | struct rwsem_waiter waiter; | ||
133 | unsigned long flags; | ||
134 | |||
135 | raw_spin_lock_irqsave(&sem->wait_lock, flags); | ||
136 | |||
137 | if (sem->count >= 0 && list_empty(&sem->wait_list)) { | ||
138 | /* granted */ | ||
139 | sem->count++; | ||
140 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); | ||
141 | goto out; | ||
142 | } | ||
143 | |||
144 | /* set up my own style of waitqueue */ | ||
145 | waiter.task = current; | ||
146 | waiter.type = RWSEM_WAITING_FOR_READ; | ||
147 | get_task_struct(current); | ||
148 | |||
149 | list_add_tail(&waiter.list, &sem->wait_list); | ||
150 | |||
151 | /* wait to be given the lock */ | ||
152 | for (;;) { | ||
153 | if (!waiter.task) | ||
154 | break; | ||
155 | if (signal_pending_state(state, current)) | ||
156 | goto out_nolock; | ||
157 | set_current_state(state); | ||
158 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); | ||
159 | schedule(); | ||
160 | raw_spin_lock_irqsave(&sem->wait_lock, flags); | ||
161 | } | ||
162 | |||
163 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); | ||
164 | out: | ||
165 | return 0; | ||
166 | |||
167 | out_nolock: | ||
168 | /* | ||
169 | * We didn't take the lock, so there is a writer, which is | ||
170 | * either the owner or the first waiter of the sem. If it's a waiter, | ||
171 | * it will be woken by the current owner. No need to wake anybody. | ||
172 | */ | ||
173 | list_del(&waiter.list); | ||
174 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); | ||
175 | return -EINTR; | ||
176 | } | ||
177 | |||
178 | void __sched __down_read(struct rw_semaphore *sem) | ||
179 | { | ||
180 | __down_read_common(sem, TASK_UNINTERRUPTIBLE); | ||
181 | } | ||
182 | |||
183 | int __sched __down_read_killable(struct rw_semaphore *sem) | ||
184 | { | ||
185 | return __down_read_common(sem, TASK_KILLABLE); | ||
186 | } | ||
187 | |||
188 | /* | ||
189 | * trylock for reading -- returns 1 if successful, 0 if contention | ||
190 | */ | ||
191 | int __down_read_trylock(struct rw_semaphore *sem) | ||
192 | { | ||
193 | unsigned long flags; | ||
194 | int ret = 0; | ||
195 | |||
196 | |||
197 | raw_spin_lock_irqsave(&sem->wait_lock, flags); | ||
198 | |||
199 | if (sem->count >= 0 && list_empty(&sem->wait_list)) { | ||
200 | /* granted */ | ||
201 | sem->count++; | ||
202 | ret = 1; | ||
203 | } | ||
204 | |||
205 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); | ||
206 | |||
207 | return ret; | ||
208 | } | ||
209 | |||
210 | /* | ||
211 | * get a write lock on the semaphore | ||
212 | */ | ||
213 | int __sched __down_write_common(struct rw_semaphore *sem, int state) | ||
214 | { | ||
215 | struct rwsem_waiter waiter; | ||
216 | unsigned long flags; | ||
217 | int ret = 0; | ||
218 | |||
219 | raw_spin_lock_irqsave(&sem->wait_lock, flags); | ||
220 | |||
221 | /* set up my own style of waitqueue */ | ||
222 | waiter.task = current; | ||
223 | waiter.type = RWSEM_WAITING_FOR_WRITE; | ||
224 | list_add_tail(&waiter.list, &sem->wait_list); | ||
225 | |||
226 | /* wait for someone to release the lock */ | ||
227 | for (;;) { | ||
228 | /* | ||
229 | * This is the key to supporting write lock stealing: it allows a | ||
230 | * task already running on a CPU to take the lock right away rather | ||
231 | * than going to sleep and waiting for the system to wake it (or | ||
232 | * whoever is at the head of the wait list) up. | ||
233 | */ | ||
234 | if (sem->count == 0) | ||
235 | break; | ||
236 | if (signal_pending_state(state, current)) | ||
237 | goto out_nolock; | ||
238 | |||
239 | set_current_state(state); | ||
240 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); | ||
241 | schedule(); | ||
242 | raw_spin_lock_irqsave(&sem->wait_lock, flags); | ||
243 | } | ||
244 | /* got the lock */ | ||
245 | sem->count = -1; | ||
246 | list_del(&waiter.list); | ||
247 | |||
248 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); | ||
249 | |||
250 | return ret; | ||
251 | |||
252 | out_nolock: | ||
253 | list_del(&waiter.list); | ||
254 | if (!list_empty(&sem->wait_list) && sem->count >= 0) | ||
255 | __rwsem_do_wake(sem, 0); | ||
256 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); | ||
257 | |||
258 | return -EINTR; | ||
259 | } | ||
260 | |||
261 | void __sched __down_write(struct rw_semaphore *sem) | ||
262 | { | ||
263 | __down_write_common(sem, TASK_UNINTERRUPTIBLE); | ||
264 | } | ||
265 | |||
266 | int __sched __down_write_killable(struct rw_semaphore *sem) | ||
267 | { | ||
268 | return __down_write_common(sem, TASK_KILLABLE); | ||
269 | } | ||
270 | |||
271 | /* | ||
272 | * trylock for writing -- returns 1 if successful, 0 if contention | ||
273 | */ | ||
274 | int __down_write_trylock(struct rw_semaphore *sem) | ||
275 | { | ||
276 | unsigned long flags; | ||
277 | int ret = 0; | ||
278 | |||
279 | raw_spin_lock_irqsave(&sem->wait_lock, flags); | ||
280 | |||
281 | if (sem->count == 0) { | ||
282 | /* got the lock */ | ||
283 | sem->count = -1; | ||
284 | ret = 1; | ||
285 | } | ||
286 | |||
287 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); | ||
288 | |||
289 | return ret; | ||
290 | } | ||
291 | |||
292 | /* | ||
293 | * release a read lock on the semaphore | ||
294 | */ | ||
295 | void __up_read(struct rw_semaphore *sem) | ||
296 | { | ||
297 | unsigned long flags; | ||
298 | |||
299 | raw_spin_lock_irqsave(&sem->wait_lock, flags); | ||
300 | |||
301 | if (--sem->count == 0 && !list_empty(&sem->wait_list)) | ||
302 | sem = __rwsem_wake_one_writer(sem); | ||
303 | |||
304 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); | ||
305 | } | ||
306 | |||
307 | /* | ||
308 | * release a write lock on the semaphore | ||
309 | */ | ||
310 | void __up_write(struct rw_semaphore *sem) | ||
311 | { | ||
312 | unsigned long flags; | ||
313 | |||
314 | raw_spin_lock_irqsave(&sem->wait_lock, flags); | ||
315 | |||
316 | sem->count = 0; | ||
317 | if (!list_empty(&sem->wait_list)) | ||
318 | sem = __rwsem_do_wake(sem, 1); | ||
319 | |||
320 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); | ||
321 | } | ||
322 | |||
323 | /* | ||
324 | * downgrade a write lock into a read lock | ||
325 | * - just wake up any readers at the front of the queue | ||
326 | */ | ||
327 | void __downgrade_write(struct rw_semaphore *sem) | ||
328 | { | ||
329 | unsigned long flags; | ||
330 | |||
331 | raw_spin_lock_irqsave(&sem->wait_lock, flags); | ||
332 | |||
333 | sem->count = 1; | ||
334 | if (!list_empty(&sem->wait_list)) | ||
335 | sem = __rwsem_do_wake(sem, 0); | ||
336 | |||
337 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); | ||
338 | } | ||
339 | |||
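For readers who have never looked at the file being removed: the spinlock-based rwsem kept its whole state in a plain counter guarded by sem->wait_lock, with no atomic operations at all. A compressed, illustrative restatement of that convention follows; the helper names are hypothetical and assume the old structure layout with a plain integer count:

/* Count convention of the removed spinlock-based rwsem:
 *   count == 0   lock is free
 *   count  > 0   that many readers hold the lock
 *   count == -1  one writer holds the lock
 * Every transition happens under sem->wait_lock, so no atomics are needed.
 */
static inline bool old_rwsem_reader_can_grab(int count, bool wait_list_empty)
{
	return count >= 0 && wait_list_empty;	/* matches __down_read_trylock() */
}

static inline bool old_rwsem_writer_can_grab(int count)
{
	return count == 0;			/* matches __down_write_trylock() */
}

The atomic xadd implementation in rwsem-xadd.c below now serves every configuration, which is why this fallback can go.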
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index fbe96341beee..6b3ee9948bf1 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c | |||
@@ -147,6 +147,7 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem, | |||
147 | * will notice the queued writer. | 147 | * will notice the queued writer. |
148 | */ | 148 | */ |
149 | wake_q_add(wake_q, waiter->task); | 149 | wake_q_add(wake_q, waiter->task); |
150 | lockevent_inc(rwsem_wake_writer); | ||
150 | } | 151 | } |
151 | 152 | ||
152 | return; | 153 | return; |
@@ -176,9 +177,8 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem, | |||
176 | goto try_reader_grant; | 177 | goto try_reader_grant; |
177 | } | 178 | } |
178 | /* | 179 | /* |
179 | * It is not really necessary to set it to reader-owned here, | 180 | * Set it to reader-owned to give spinners an early |
180 | * but it gives the spinners an early indication that the | 181 | * indication that readers now have the lock. |
181 | * readers now have the lock. | ||
182 | */ | 182 | */ |
183 | __rwsem_set_reader_owned(sem, waiter->task); | 183 | __rwsem_set_reader_owned(sem, waiter->task); |
184 | } | 184 | } |
@@ -215,6 +215,7 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem, | |||
215 | } | 215 | } |
216 | 216 | ||
217 | adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; | 217 | adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; |
218 | lockevent_cond_inc(rwsem_wake_reader, woken); | ||
218 | if (list_empty(&sem->wait_list)) { | 219 | if (list_empty(&sem->wait_list)) { |
219 | /* hit end of list above */ | 220 | /* hit end of list above */ |
220 | adjustment -= RWSEM_WAITING_BIAS; | 221 | adjustment -= RWSEM_WAITING_BIAS; |
@@ -225,92 +226,6 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem, | |||
225 | } | 226 | } |
226 | 227 | ||
227 | /* | 228 | /* |
228 | * Wait for the read lock to be granted | ||
229 | */ | ||
230 | static inline struct rw_semaphore __sched * | ||
231 | __rwsem_down_read_failed_common(struct rw_semaphore *sem, int state) | ||
232 | { | ||
233 | long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; | ||
234 | struct rwsem_waiter waiter; | ||
235 | DEFINE_WAKE_Q(wake_q); | ||
236 | |||
237 | waiter.task = current; | ||
238 | waiter.type = RWSEM_WAITING_FOR_READ; | ||
239 | |||
240 | raw_spin_lock_irq(&sem->wait_lock); | ||
241 | if (list_empty(&sem->wait_list)) { | ||
242 | /* | ||
243 | * In case the wait queue is empty and the lock isn't owned | ||
244 | * by a writer, this reader can exit the slowpath and return | ||
245 | * immediately as its RWSEM_ACTIVE_READ_BIAS has already | ||
246 | * been set in the count. | ||
247 | */ | ||
248 | if (atomic_long_read(&sem->count) >= 0) { | ||
249 | raw_spin_unlock_irq(&sem->wait_lock); | ||
250 | return sem; | ||
251 | } | ||
252 | adjustment += RWSEM_WAITING_BIAS; | ||
253 | } | ||
254 | list_add_tail(&waiter.list, &sem->wait_list); | ||
255 | |||
256 | /* we're now waiting on the lock, but no longer actively locking */ | ||
257 | count = atomic_long_add_return(adjustment, &sem->count); | ||
258 | |||
259 | /* | ||
260 | * If there are no active locks, wake the front queued process(es). | ||
261 | * | ||
262 | * If there are no writers and we are first in the queue, | ||
263 | * wake our own waiter to join the existing active readers ! | ||
264 | */ | ||
265 | if (count == RWSEM_WAITING_BIAS || | ||
266 | (count > RWSEM_WAITING_BIAS && | ||
267 | adjustment != -RWSEM_ACTIVE_READ_BIAS)) | ||
268 | __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); | ||
269 | |||
270 | raw_spin_unlock_irq(&sem->wait_lock); | ||
271 | wake_up_q(&wake_q); | ||
272 | |||
273 | /* wait to be given the lock */ | ||
274 | while (true) { | ||
275 | set_current_state(state); | ||
276 | if (!waiter.task) | ||
277 | break; | ||
278 | if (signal_pending_state(state, current)) { | ||
279 | raw_spin_lock_irq(&sem->wait_lock); | ||
280 | if (waiter.task) | ||
281 | goto out_nolock; | ||
282 | raw_spin_unlock_irq(&sem->wait_lock); | ||
283 | break; | ||
284 | } | ||
285 | schedule(); | ||
286 | } | ||
287 | |||
288 | __set_current_state(TASK_RUNNING); | ||
289 | return sem; | ||
290 | out_nolock: | ||
291 | list_del(&waiter.list); | ||
292 | if (list_empty(&sem->wait_list)) | ||
293 | atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count); | ||
294 | raw_spin_unlock_irq(&sem->wait_lock); | ||
295 | __set_current_state(TASK_RUNNING); | ||
296 | return ERR_PTR(-EINTR); | ||
297 | } | ||
298 | |||
299 | __visible struct rw_semaphore * __sched | ||
300 | rwsem_down_read_failed(struct rw_semaphore *sem) | ||
301 | { | ||
302 | return __rwsem_down_read_failed_common(sem, TASK_UNINTERRUPTIBLE); | ||
303 | } | ||
304 | EXPORT_SYMBOL(rwsem_down_read_failed); | ||
305 | |||
306 | __visible struct rw_semaphore * __sched | ||
307 | rwsem_down_read_failed_killable(struct rw_semaphore *sem) | ||
308 | { | ||
309 | return __rwsem_down_read_failed_common(sem, TASK_KILLABLE); | ||
310 | } | ||
311 | EXPORT_SYMBOL(rwsem_down_read_failed_killable); | ||
312 | |||
313 | /* | ||
314 | * This function must be called with the sem->wait_lock held to prevent | 229 | * This function must be called with the sem->wait_lock held to prevent |
315 | * race conditions between checking the rwsem wait list and setting the | 230 | * race conditions between checking the rwsem wait list and setting the |
316 | * sem->count accordingly. | 231 | * sem->count accordingly. |
@@ -346,21 +261,17 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) | |||
346 | */ | 261 | */ |
347 | static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) | 262 | static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) |
348 | { | 263 | { |
349 | long old, count = atomic_long_read(&sem->count); | 264 | long count = atomic_long_read(&sem->count); |
350 | |||
351 | while (true) { | ||
352 | if (!(count == 0 || count == RWSEM_WAITING_BIAS)) | ||
353 | return false; | ||
354 | 265 | ||
355 | old = atomic_long_cmpxchg_acquire(&sem->count, count, | 266 | while (!count || count == RWSEM_WAITING_BIAS) { |
356 | count + RWSEM_ACTIVE_WRITE_BIAS); | 267 | if (atomic_long_try_cmpxchg_acquire(&sem->count, &count, |
357 | if (old == count) { | 268 | count + RWSEM_ACTIVE_WRITE_BIAS)) { |
358 | rwsem_set_owner(sem); | 269 | rwsem_set_owner(sem); |
270 | lockevent_inc(rwsem_opt_wlock); | ||
359 | return true; | 271 | return true; |
360 | } | 272 | } |
361 | |||
362 | count = old; | ||
363 | } | 273 | } |
274 | return false; | ||
364 | } | 275 | } |
365 | 276 | ||
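The rewrite above replaces the open-coded cmpxchg loop with atomic_long_try_cmpxchg_acquire(), which on failure refreshes the expected value in place, so no separate re-read is needed before retrying. As a standalone illustration of that idiom (a generic sketch with a hypothetical helper name, not the kernel function itself):

/* Illustrative only: the try_cmpxchg retry idiom used by
 * rwsem_try_write_lock_unqueued(). On failure, 'old' is updated with the
 * counter's current value, so the loop condition re-checks it directly. */
static bool try_claim_for_write(atomic_long_t *count, long write_bias)
{
	long old = atomic_long_read(count);

	while (old == 0 || old == RWSEM_WAITING_BIAS) {
		if (atomic_long_try_cmpxchg_acquire(count, &old,
						    old + write_bias))
			return true;	/* write lock acquired */
	}
	return false;			/* other lockers got in the way */
}

On x86 this form typically generates slightly better code, since the compiler can reuse the comparison result of the cmpxchg instruction itself instead of testing the returned value again.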
366 | static inline bool owner_on_cpu(struct task_struct *owner) | 277 | static inline bool owner_on_cpu(struct task_struct *owner) |
@@ -481,6 +392,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) | |||
481 | osq_unlock(&sem->osq); | 392 | osq_unlock(&sem->osq); |
482 | done: | 393 | done: |
483 | preempt_enable(); | 394 | preempt_enable(); |
395 | lockevent_cond_inc(rwsem_opt_fail, !taken); | ||
484 | return taken; | 396 | return taken; |
485 | } | 397 | } |
486 | 398 | ||
@@ -505,6 +417,97 @@ static inline bool rwsem_has_spinner(struct rw_semaphore *sem) | |||
505 | #endif | 417 | #endif |
506 | 418 | ||
507 | /* | 419 | /* |
420 | * Wait for the read lock to be granted | ||
421 | */ | ||
422 | static inline struct rw_semaphore __sched * | ||
423 | __rwsem_down_read_failed_common(struct rw_semaphore *sem, int state) | ||
424 | { | ||
425 | long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; | ||
426 | struct rwsem_waiter waiter; | ||
427 | DEFINE_WAKE_Q(wake_q); | ||
428 | |||
429 | waiter.task = current; | ||
430 | waiter.type = RWSEM_WAITING_FOR_READ; | ||
431 | |||
432 | raw_spin_lock_irq(&sem->wait_lock); | ||
433 | if (list_empty(&sem->wait_list)) { | ||
434 | /* | ||
435 | * In case the wait queue is empty and the lock isn't owned | ||
436 | * by a writer, this reader can exit the slowpath and return | ||
437 | * immediately as its RWSEM_ACTIVE_READ_BIAS has already | ||
438 | * been set in the count. | ||
439 | */ | ||
440 | if (atomic_long_read(&sem->count) >= 0) { | ||
441 | raw_spin_unlock_irq(&sem->wait_lock); | ||
442 | rwsem_set_reader_owned(sem); | ||
443 | lockevent_inc(rwsem_rlock_fast); | ||
444 | return sem; | ||
445 | } | ||
446 | adjustment += RWSEM_WAITING_BIAS; | ||
447 | } | ||
448 | list_add_tail(&waiter.list, &sem->wait_list); | ||
449 | |||
450 | /* we're now waiting on the lock, but no longer actively locking */ | ||
451 | count = atomic_long_add_return(adjustment, &sem->count); | ||
452 | |||
453 | /* | ||
454 | * If there are no active locks, wake the front queued process(es). | ||
455 | * | ||
456 | * If there are no writers and we are first in the queue, | ||
457 | * wake our own waiter to join the existing active readers ! | ||
458 | */ | ||
459 | if (count == RWSEM_WAITING_BIAS || | ||
460 | (count > RWSEM_WAITING_BIAS && | ||
461 | adjustment != -RWSEM_ACTIVE_READ_BIAS)) | ||
462 | __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); | ||
463 | |||
464 | raw_spin_unlock_irq(&sem->wait_lock); | ||
465 | wake_up_q(&wake_q); | ||
466 | |||
467 | /* wait to be given the lock */ | ||
468 | while (true) { | ||
469 | set_current_state(state); | ||
470 | if (!waiter.task) | ||
471 | break; | ||
472 | if (signal_pending_state(state, current)) { | ||
473 | raw_spin_lock_irq(&sem->wait_lock); | ||
474 | if (waiter.task) | ||
475 | goto out_nolock; | ||
476 | raw_spin_unlock_irq(&sem->wait_lock); | ||
477 | break; | ||
478 | } | ||
479 | schedule(); | ||
480 | lockevent_inc(rwsem_sleep_reader); | ||
481 | } | ||
482 | |||
483 | __set_current_state(TASK_RUNNING); | ||
484 | lockevent_inc(rwsem_rlock); | ||
485 | return sem; | ||
486 | out_nolock: | ||
487 | list_del(&waiter.list); | ||
488 | if (list_empty(&sem->wait_list)) | ||
489 | atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count); | ||
490 | raw_spin_unlock_irq(&sem->wait_lock); | ||
491 | __set_current_state(TASK_RUNNING); | ||
492 | lockevent_inc(rwsem_rlock_fail); | ||
493 | return ERR_PTR(-EINTR); | ||
494 | } | ||
495 | |||
496 | __visible struct rw_semaphore * __sched | ||
497 | rwsem_down_read_failed(struct rw_semaphore *sem) | ||
498 | { | ||
499 | return __rwsem_down_read_failed_common(sem, TASK_UNINTERRUPTIBLE); | ||
500 | } | ||
501 | EXPORT_SYMBOL(rwsem_down_read_failed); | ||
502 | |||
503 | __visible struct rw_semaphore * __sched | ||
504 | rwsem_down_read_failed_killable(struct rw_semaphore *sem) | ||
505 | { | ||
506 | return __rwsem_down_read_failed_common(sem, TASK_KILLABLE); | ||
507 | } | ||
508 | EXPORT_SYMBOL(rwsem_down_read_failed_killable); | ||
509 | |||
510 | /* | ||
508 | * Wait until we successfully acquire the write lock | 511 | * Wait until we successfully acquire the write lock |
509 | */ | 512 | */ |
510 | static inline struct rw_semaphore * | 513 | static inline struct rw_semaphore * |
@@ -580,6 +583,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) | |||
580 | goto out_nolock; | 583 | goto out_nolock; |
581 | 584 | ||
582 | schedule(); | 585 | schedule(); |
586 | lockevent_inc(rwsem_sleep_writer); | ||
583 | set_current_state(state); | 587 | set_current_state(state); |
584 | } while ((count = atomic_long_read(&sem->count)) & RWSEM_ACTIVE_MASK); | 588 | } while ((count = atomic_long_read(&sem->count)) & RWSEM_ACTIVE_MASK); |
585 | 589 | ||
@@ -588,6 +592,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) | |||
588 | __set_current_state(TASK_RUNNING); | 592 | __set_current_state(TASK_RUNNING); |
589 | list_del(&waiter.list); | 593 | list_del(&waiter.list); |
590 | raw_spin_unlock_irq(&sem->wait_lock); | 594 | raw_spin_unlock_irq(&sem->wait_lock); |
595 | lockevent_inc(rwsem_wlock); | ||
591 | 596 | ||
592 | return ret; | 597 | return ret; |
593 | 598 | ||
@@ -601,6 +606,7 @@ out_nolock: | |||
601 | __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); | 606 | __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); |
602 | raw_spin_unlock_irq(&sem->wait_lock); | 607 | raw_spin_unlock_irq(&sem->wait_lock); |
603 | wake_up_q(&wake_q); | 608 | wake_up_q(&wake_q); |
609 | lockevent_inc(rwsem_wlock_fail); | ||
604 | 610 | ||
605 | return ERR_PTR(-EINTR); | 611 | return ERR_PTR(-EINTR); |
606 | } | 612 | } |
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index e586f0d03ad3..ccbf18f560ff 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c | |||
@@ -24,7 +24,6 @@ void __sched down_read(struct rw_semaphore *sem) | |||
24 | rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); | 24 | rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); |
25 | 25 | ||
26 | LOCK_CONTENDED(sem, __down_read_trylock, __down_read); | 26 | LOCK_CONTENDED(sem, __down_read_trylock, __down_read); |
27 | rwsem_set_reader_owned(sem); | ||
28 | } | 27 | } |
29 | 28 | ||
30 | EXPORT_SYMBOL(down_read); | 29 | EXPORT_SYMBOL(down_read); |
@@ -39,7 +38,6 @@ int __sched down_read_killable(struct rw_semaphore *sem) | |||
39 | return -EINTR; | 38 | return -EINTR; |
40 | } | 39 | } |
41 | 40 | ||
42 | rwsem_set_reader_owned(sem); | ||
43 | return 0; | 41 | return 0; |
44 | } | 42 | } |
45 | 43 | ||
@@ -52,10 +50,8 @@ int down_read_trylock(struct rw_semaphore *sem) | |||
52 | { | 50 | { |
53 | int ret = __down_read_trylock(sem); | 51 | int ret = __down_read_trylock(sem); |
54 | 52 | ||
55 | if (ret == 1) { | 53 | if (ret == 1) |
56 | rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_); | 54 | rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_); |
57 | rwsem_set_reader_owned(sem); | ||
58 | } | ||
59 | return ret; | 55 | return ret; |
60 | } | 56 | } |
61 | 57 | ||
@@ -70,7 +66,6 @@ void __sched down_write(struct rw_semaphore *sem) | |||
70 | rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); | 66 | rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); |
71 | 67 | ||
72 | LOCK_CONTENDED(sem, __down_write_trylock, __down_write); | 68 | LOCK_CONTENDED(sem, __down_write_trylock, __down_write); |
73 | rwsem_set_owner(sem); | ||
74 | } | 69 | } |
75 | 70 | ||
76 | EXPORT_SYMBOL(down_write); | 71 | EXPORT_SYMBOL(down_write); |
@@ -88,7 +83,6 @@ int __sched down_write_killable(struct rw_semaphore *sem) | |||
88 | return -EINTR; | 83 | return -EINTR; |
89 | } | 84 | } |
90 | 85 | ||
91 | rwsem_set_owner(sem); | ||
92 | return 0; | 86 | return 0; |
93 | } | 87 | } |
94 | 88 | ||
@@ -101,10 +95,8 @@ int down_write_trylock(struct rw_semaphore *sem) | |||
101 | { | 95 | { |
102 | int ret = __down_write_trylock(sem); | 96 | int ret = __down_write_trylock(sem); |
103 | 97 | ||
104 | if (ret == 1) { | 98 | if (ret == 1) |
105 | rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_); | 99 | rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_); |
106 | rwsem_set_owner(sem); | ||
107 | } | ||
108 | 100 | ||
109 | return ret; | 101 | return ret; |
110 | } | 102 | } |
@@ -117,9 +109,7 @@ EXPORT_SYMBOL(down_write_trylock); | |||
117 | void up_read(struct rw_semaphore *sem) | 109 | void up_read(struct rw_semaphore *sem) |
118 | { | 110 | { |
119 | rwsem_release(&sem->dep_map, 1, _RET_IP_); | 111 | rwsem_release(&sem->dep_map, 1, _RET_IP_); |
120 | DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED)); | ||
121 | 112 | ||
122 | rwsem_clear_reader_owned(sem); | ||
123 | __up_read(sem); | 113 | __up_read(sem); |
124 | } | 114 | } |
125 | 115 | ||
@@ -131,9 +121,7 @@ EXPORT_SYMBOL(up_read); | |||
131 | void up_write(struct rw_semaphore *sem) | 121 | void up_write(struct rw_semaphore *sem) |
132 | { | 122 | { |
133 | rwsem_release(&sem->dep_map, 1, _RET_IP_); | 123 | rwsem_release(&sem->dep_map, 1, _RET_IP_); |
134 | DEBUG_RWSEMS_WARN_ON(sem->owner != current); | ||
135 | 124 | ||
136 | rwsem_clear_owner(sem); | ||
137 | __up_write(sem); | 125 | __up_write(sem); |
138 | } | 126 | } |
139 | 127 | ||
@@ -145,9 +133,7 @@ EXPORT_SYMBOL(up_write); | |||
145 | void downgrade_write(struct rw_semaphore *sem) | 133 | void downgrade_write(struct rw_semaphore *sem) |
146 | { | 134 | { |
147 | lock_downgrade(&sem->dep_map, _RET_IP_); | 135 | lock_downgrade(&sem->dep_map, _RET_IP_); |
148 | DEBUG_RWSEMS_WARN_ON(sem->owner != current); | ||
149 | 136 | ||
150 | rwsem_set_reader_owned(sem); | ||
151 | __downgrade_write(sem); | 137 | __downgrade_write(sem); |
152 | } | 138 | } |
153 | 139 | ||
@@ -161,7 +147,6 @@ void down_read_nested(struct rw_semaphore *sem, int subclass) | |||
161 | rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); | 147 | rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); |
162 | 148 | ||
163 | LOCK_CONTENDED(sem, __down_read_trylock, __down_read); | 149 | LOCK_CONTENDED(sem, __down_read_trylock, __down_read); |
164 | rwsem_set_reader_owned(sem); | ||
165 | } | 150 | } |
166 | 151 | ||
167 | EXPORT_SYMBOL(down_read_nested); | 152 | EXPORT_SYMBOL(down_read_nested); |
@@ -172,7 +157,6 @@ void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest) | |||
172 | rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_); | 157 | rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_); |
173 | 158 | ||
174 | LOCK_CONTENDED(sem, __down_write_trylock, __down_write); | 159 | LOCK_CONTENDED(sem, __down_write_trylock, __down_write); |
175 | rwsem_set_owner(sem); | ||
176 | } | 160 | } |
177 | 161 | ||
178 | EXPORT_SYMBOL(_down_write_nest_lock); | 162 | EXPORT_SYMBOL(_down_write_nest_lock); |
@@ -193,7 +177,6 @@ void down_write_nested(struct rw_semaphore *sem, int subclass) | |||
193 | rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); | 177 | rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); |
194 | 178 | ||
195 | LOCK_CONTENDED(sem, __down_write_trylock, __down_write); | 179 | LOCK_CONTENDED(sem, __down_write_trylock, __down_write); |
196 | rwsem_set_owner(sem); | ||
197 | } | 180 | } |
198 | 181 | ||
199 | EXPORT_SYMBOL(down_write_nested); | 182 | EXPORT_SYMBOL(down_write_nested); |
@@ -208,7 +191,6 @@ int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass) | |||
208 | return -EINTR; | 191 | return -EINTR; |
209 | } | 192 | } |
210 | 193 | ||
211 | rwsem_set_owner(sem); | ||
212 | return 0; | 194 | return 0; |
213 | } | 195 | } |
214 | 196 | ||
@@ -216,7 +198,8 @@ EXPORT_SYMBOL(down_write_killable_nested); | |||
216 | 198 | ||
217 | void up_read_non_owner(struct rw_semaphore *sem) | 199 | void up_read_non_owner(struct rw_semaphore *sem) |
218 | { | 200 | { |
219 | DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED)); | 201 | DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED), |
202 | sem); | ||
220 | __up_read(sem); | 203 | __up_read(sem); |
221 | } | 204 | } |
222 | 205 | ||
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index bad2bca0268b..64877f5294e3 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h | |||
@@ -23,15 +23,44 @@ | |||
23 | * is involved. Ideally we would like to track all the readers that own | 23 | * is involved. Ideally we would like to track all the readers that own |
24 | * a rwsem, but the overhead is simply too big. | 24 | * a rwsem, but the overhead is simply too big. |
25 | */ | 25 | */ |
26 | #include "lock_events.h" | ||
27 | |||
26 | #define RWSEM_READER_OWNED (1UL << 0) | 28 | #define RWSEM_READER_OWNED (1UL << 0) |
27 | #define RWSEM_ANONYMOUSLY_OWNED (1UL << 1) | 29 | #define RWSEM_ANONYMOUSLY_OWNED (1UL << 1) |
28 | 30 | ||
29 | #ifdef CONFIG_DEBUG_RWSEMS | 31 | #ifdef CONFIG_DEBUG_RWSEMS |
30 | # define DEBUG_RWSEMS_WARN_ON(c) DEBUG_LOCKS_WARN_ON(c) | 32 | # define DEBUG_RWSEMS_WARN_ON(c, sem) do { \ |
33 | if (!debug_locks_silent && \ | ||
34 | WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\ | ||
35 | #c, atomic_long_read(&(sem)->count), \ | ||
36 | (long)((sem)->owner), (long)current, \ | ||
37 | list_empty(&(sem)->wait_list) ? "" : "not ")) \ | ||
38 | debug_locks_off(); \ | ||
39 | } while (0) | ||
40 | #else | ||
41 | # define DEBUG_RWSEMS_WARN_ON(c, sem) | ||
42 | #endif | ||
43 | |||
44 | /* | ||
45 | * R/W semaphores originally for PPC using the stuff in lib/rwsem.c. | ||
46 | * Adapted largely from include/asm-i386/rwsem.h | ||
47 | * by Paul Mackerras <paulus@samba.org>. | ||
48 | */ | ||
49 | |||
50 | /* | ||
51 | * the semaphore definition | ||
52 | */ | ||
53 | #ifdef CONFIG_64BIT | ||
54 | # define RWSEM_ACTIVE_MASK 0xffffffffL | ||
31 | #else | 55 | #else |
32 | # define DEBUG_RWSEMS_WARN_ON(c) | 56 | # define RWSEM_ACTIVE_MASK 0x0000ffffL |
33 | #endif | 57 | #endif |
34 | 58 | ||
59 | #define RWSEM_ACTIVE_BIAS 0x00000001L | ||
60 | #define RWSEM_WAITING_BIAS (-RWSEM_ACTIVE_MASK-1) | ||
61 | #define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS | ||
62 | #define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) | ||
63 | |||
35 | #ifdef CONFIG_RWSEM_SPIN_ON_OWNER | 64 | #ifdef CONFIG_RWSEM_SPIN_ON_OWNER |
36 | /* | 65 | /* |
37 | * All writes to owner are protected by WRITE_ONCE() to make sure that | 66 | * All writes to owner are protected by WRITE_ONCE() to make sure that |
@@ -132,3 +161,144 @@ static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) | |||
132 | { | 161 | { |
133 | } | 162 | } |
134 | #endif | 163 | #endif |
164 | |||
165 | extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem); | ||
166 | extern struct rw_semaphore *rwsem_down_read_failed_killable(struct rw_semaphore *sem); | ||
167 | extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem); | ||
168 | extern struct rw_semaphore *rwsem_down_write_failed_killable(struct rw_semaphore *sem); | ||
169 | extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem); | ||
170 | extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); | ||
171 | |||
172 | /* | ||
173 | * lock for reading | ||
174 | */ | ||
175 | static inline void __down_read(struct rw_semaphore *sem) | ||
176 | { | ||
177 | if (unlikely(atomic_long_inc_return_acquire(&sem->count) <= 0)) { | ||
178 | rwsem_down_read_failed(sem); | ||
179 | DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & | ||
180 | RWSEM_READER_OWNED), sem); | ||
181 | } else { | ||
182 | rwsem_set_reader_owned(sem); | ||
183 | } | ||
184 | } | ||
185 | |||
186 | static inline int __down_read_killable(struct rw_semaphore *sem) | ||
187 | { | ||
188 | if (unlikely(atomic_long_inc_return_acquire(&sem->count) <= 0)) { | ||
189 | if (IS_ERR(rwsem_down_read_failed_killable(sem))) | ||
190 | return -EINTR; | ||
191 | DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & | ||
192 | RWSEM_READER_OWNED), sem); | ||
193 | } else { | ||
194 | rwsem_set_reader_owned(sem); | ||
195 | } | ||
196 | return 0; | ||
197 | } | ||
198 | |||
199 | static inline int __down_read_trylock(struct rw_semaphore *sem) | ||
200 | { | ||
201 | /* | ||
202 | * Optimize for the case when the rwsem is not locked at all. | ||
203 | */ | ||
204 | long tmp = RWSEM_UNLOCKED_VALUE; | ||
205 | |||
206 | lockevent_inc(rwsem_rtrylock); | ||
207 | do { | ||
208 | if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, | ||
209 | tmp + RWSEM_ACTIVE_READ_BIAS)) { | ||
210 | rwsem_set_reader_owned(sem); | ||
211 | return 1; | ||
212 | } | ||
213 | } while (tmp >= 0); | ||
214 | return 0; | ||
215 | } | ||
216 | |||
217 | /* | ||
218 | * lock for writing | ||
219 | */ | ||
220 | static inline void __down_write(struct rw_semaphore *sem) | ||
221 | { | ||
222 | long tmp; | ||
223 | |||
224 | tmp = atomic_long_add_return_acquire(RWSEM_ACTIVE_WRITE_BIAS, | ||
225 | &sem->count); | ||
226 | if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS)) | ||
227 | rwsem_down_write_failed(sem); | ||
228 | rwsem_set_owner(sem); | ||
229 | } | ||
230 | |||
231 | static inline int __down_write_killable(struct rw_semaphore *sem) | ||
232 | { | ||
233 | long tmp; | ||
234 | |||
235 | tmp = atomic_long_add_return_acquire(RWSEM_ACTIVE_WRITE_BIAS, | ||
236 | &sem->count); | ||
237 | if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS)) | ||
238 | if (IS_ERR(rwsem_down_write_failed_killable(sem))) | ||
239 | return -EINTR; | ||
240 | rwsem_set_owner(sem); | ||
241 | return 0; | ||
242 | } | ||
243 | |||
244 | static inline int __down_write_trylock(struct rw_semaphore *sem) | ||
245 | { | ||
246 | long tmp; | ||
247 | |||
248 | lockevent_inc(rwsem_wtrylock); | ||
249 | tmp = atomic_long_cmpxchg_acquire(&sem->count, RWSEM_UNLOCKED_VALUE, | ||
250 | RWSEM_ACTIVE_WRITE_BIAS); | ||
251 | if (tmp == RWSEM_UNLOCKED_VALUE) { | ||
252 | rwsem_set_owner(sem); | ||
253 | return true; | ||
254 | } | ||
255 | return false; | ||
256 | } | ||
257 | |||
258 | /* | ||
259 | * unlock after reading | ||
260 | */ | ||
261 | static inline void __up_read(struct rw_semaphore *sem) | ||
262 | { | ||
263 | long tmp; | ||
264 | |||
265 | DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED), | ||
266 | sem); | ||
267 | rwsem_clear_reader_owned(sem); | ||
268 | tmp = atomic_long_dec_return_release(&sem->count); | ||
269 | if (unlikely(tmp < -1 && (tmp & RWSEM_ACTIVE_MASK) == 0)) | ||
270 | rwsem_wake(sem); | ||
271 | } | ||
272 | |||
273 | /* | ||
274 | * unlock after writing | ||
275 | */ | ||
276 | static inline void __up_write(struct rw_semaphore *sem) | ||
277 | { | ||
278 | DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem); | ||
279 | rwsem_clear_owner(sem); | ||
280 | if (unlikely(atomic_long_sub_return_release(RWSEM_ACTIVE_WRITE_BIAS, | ||
281 | &sem->count) < 0)) | ||
282 | rwsem_wake(sem); | ||
283 | } | ||
284 | |||
285 | /* | ||
286 | * downgrade write lock to read lock | ||
287 | */ | ||
288 | static inline void __downgrade_write(struct rw_semaphore *sem) | ||
289 | { | ||
290 | long tmp; | ||
291 | |||
292 | /* | ||
293 | * When downgrading from exclusive to shared ownership, | ||
294 | * anything inside the write-locked region cannot leak | ||
295 | * into the read side. In contrast, anything in the | ||
296 | * read-locked region is ok to be re-ordered into the | ||
297 | * write side. As such, rely on RELEASE semantics. | ||
298 | */ | ||
299 | DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem); | ||
300 | tmp = atomic_long_add_return_release(-RWSEM_WAITING_BIAS, &sem->count); | ||
301 | rwsem_set_reader_owned(sem); | ||
302 | if (tmp < 0) | ||
303 | rwsem_downgrade_wake(sem); | ||
304 | } | ||
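A quick worked reading of the bias constants now carried in this header: on 64-bit, RWSEM_ACTIVE_MASK is 0xffffffff, so RWSEM_WAITING_BIAS is -0x100000000 and RWSEM_ACTIVE_WRITE_BIAS is -0xffffffff. The comment block below spells out a few representative sem->count values under that 64-bit assumption; it is an illustration, not part of the header:

/* Representative sem->count values on 64-bit (illustrative):
 *
 *   0                                        free
 *   RWSEM_ACTIVE_READ_BIAS            (+1)   one reader, no waiters
 *   3 * RWSEM_ACTIVE_READ_BIAS        (+3)   three readers, no waiters
 *   RWSEM_ACTIVE_WRITE_BIAS                  one writer, no waiters
 *   RWSEM_WAITING_BIAS                       waiters queued, lock not held
 *   RWSEM_ACTIVE_READ_BIAS
 *     + RWSEM_WAITING_BIAS                   one reader plus queued waiters
 *
 * The low RWSEM_ACTIVE_MASK bits count active lockers, while a negative
 * count signals that a writer holds the lock and/or the wait list is
 * non-empty, which is what the fast paths above test for.
 */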
diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index 936f3d14dd6b..0ff08380f531 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c | |||
@@ -22,6 +22,13 @@ | |||
22 | #include <linux/debug_locks.h> | 22 | #include <linux/debug_locks.h> |
23 | #include <linux/export.h> | 23 | #include <linux/export.h> |
24 | 24 | ||
25 | #ifdef CONFIG_MMIOWB | ||
26 | #ifndef arch_mmiowb_state | ||
27 | DEFINE_PER_CPU(struct mmiowb_state, __mmiowb_state); | ||
28 | EXPORT_PER_CPU_SYMBOL(__mmiowb_state); | ||
29 | #endif | ||
30 | #endif | ||
31 | |||
25 | /* | 32 | /* |
26 | * If lockdep is enabled then we use the non-preemption spin-ops | 33 | * If lockdep is enabled then we use the non-preemption spin-ops |
27 | * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are | 34 | * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are |
diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c index 9aa0fccd5d43..399669f7eba8 100644 --- a/kernel/locking/spinlock_debug.c +++ b/kernel/locking/spinlock_debug.c | |||
@@ -111,6 +111,7 @@ void do_raw_spin_lock(raw_spinlock_t *lock) | |||
111 | { | 111 | { |
112 | debug_spin_lock_before(lock); | 112 | debug_spin_lock_before(lock); |
113 | arch_spin_lock(&lock->raw_lock); | 113 | arch_spin_lock(&lock->raw_lock); |
114 | mmiowb_spin_lock(); | ||
114 | debug_spin_lock_after(lock); | 115 | debug_spin_lock_after(lock); |
115 | } | 116 | } |
116 | 117 | ||
@@ -118,8 +119,10 @@ int do_raw_spin_trylock(raw_spinlock_t *lock) | |||
118 | { | 119 | { |
119 | int ret = arch_spin_trylock(&lock->raw_lock); | 120 | int ret = arch_spin_trylock(&lock->raw_lock); |
120 | 121 | ||
121 | if (ret) | 122 | if (ret) { |
123 | mmiowb_spin_lock(); | ||
122 | debug_spin_lock_after(lock); | 124 | debug_spin_lock_after(lock); |
125 | } | ||
123 | #ifndef CONFIG_SMP | 126 | #ifndef CONFIG_SMP |
124 | /* | 127 | /* |
125 | * Must not happen on UP: | 128 | * Must not happen on UP: |
@@ -131,6 +134,7 @@ int do_raw_spin_trylock(raw_spinlock_t *lock) | |||
131 | 134 | ||
132 | void do_raw_spin_unlock(raw_spinlock_t *lock) | 135 | void do_raw_spin_unlock(raw_spinlock_t *lock) |
133 | { | 136 | { |
137 | mmiowb_spin_unlock(); | ||
134 | debug_spin_unlock(lock); | 138 | debug_spin_unlock(lock); |
135 | arch_spin_unlock(&lock->raw_lock); | 139 | arch_spin_unlock(&lock->raw_lock); |
136 | } | 140 | } |
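The mmiowb_spin_lock()/mmiowb_spin_unlock() calls added above hook into the new CONFIG_MMIOWB tracking: each CPU keeps a nesting count of held spinlocks plus a flag set when an MMIO write is issued, and the flag forces an mmiowb() barrier before the lock is released. The sketch below approximates the asm-generic implementation; field names and details may differ per architecture, so read it as orientation rather than the authoritative header:

/* Approximate behaviour of the generic mmiowb hooks (sketch only). */
struct mmiowb_state {
	u16	nesting_count;		/* spinlocks currently held on this CPU */
	u16	mmiowb_pending;		/* an MMIO write was issued under a lock */
};
DECLARE_PER_CPU(struct mmiowb_state, __mmiowb_state);

static inline void mmiowb_spin_lock(void)
{
	this_cpu_ptr(&__mmiowb_state)->nesting_count++;
}

static inline void mmiowb_spin_unlock(void)
{
	struct mmiowb_state *ms = this_cpu_ptr(&__mmiowb_state);

	if (unlikely(ms->mmiowb_pending)) {
		ms->mmiowb_pending = 0;
		mmiowb();	/* order posted MMIO before the lock release */
	}
	if (likely(ms->nesting_count))
		ms->nesting_count--;
}

This is what lets drivers drop their explicit mmiowb() calls: the ordering is supplied by spin_unlock() itself on architectures that select ARCH_HAS_MMIOWB.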
diff --git a/kernel/module.c b/kernel/module.c index 0b9aa8ab89f0..a9020bdd4cf6 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -98,6 +98,10 @@ DEFINE_MUTEX(module_mutex); | |||
98 | EXPORT_SYMBOL_GPL(module_mutex); | 98 | EXPORT_SYMBOL_GPL(module_mutex); |
99 | static LIST_HEAD(modules); | 99 | static LIST_HEAD(modules); |
100 | 100 | ||
101 | /* Work queue for freeing init sections in success case */ | ||
102 | static struct work_struct init_free_wq; | ||
103 | static struct llist_head init_free_list; | ||
104 | |||
101 | #ifdef CONFIG_MODULES_TREE_LOOKUP | 105 | #ifdef CONFIG_MODULES_TREE_LOOKUP |
102 | 106 | ||
103 | /* | 107 | /* |
@@ -1949,9 +1953,16 @@ void module_enable_ro(const struct module *mod, bool after_init) | |||
1949 | if (!rodata_enabled) | 1953 | if (!rodata_enabled) |
1950 | return; | 1954 | return; |
1951 | 1955 | ||
1956 | set_vm_flush_reset_perms(mod->core_layout.base); | ||
1957 | set_vm_flush_reset_perms(mod->init_layout.base); | ||
1952 | frob_text(&mod->core_layout, set_memory_ro); | 1958 | frob_text(&mod->core_layout, set_memory_ro); |
1959 | frob_text(&mod->core_layout, set_memory_x); | ||
1960 | |||
1953 | frob_rodata(&mod->core_layout, set_memory_ro); | 1961 | frob_rodata(&mod->core_layout, set_memory_ro); |
1962 | |||
1954 | frob_text(&mod->init_layout, set_memory_ro); | 1963 | frob_text(&mod->init_layout, set_memory_ro); |
1964 | frob_text(&mod->init_layout, set_memory_x); | ||
1965 | |||
1955 | frob_rodata(&mod->init_layout, set_memory_ro); | 1966 | frob_rodata(&mod->init_layout, set_memory_ro); |
1956 | 1967 | ||
1957 | if (after_init) | 1968 | if (after_init) |
@@ -1967,15 +1978,6 @@ static void module_enable_nx(const struct module *mod) | |||
1967 | frob_writable_data(&mod->init_layout, set_memory_nx); | 1978 | frob_writable_data(&mod->init_layout, set_memory_nx); |
1968 | } | 1979 | } |
1969 | 1980 | ||
1970 | static void module_disable_nx(const struct module *mod) | ||
1971 | { | ||
1972 | frob_rodata(&mod->core_layout, set_memory_x); | ||
1973 | frob_ro_after_init(&mod->core_layout, set_memory_x); | ||
1974 | frob_writable_data(&mod->core_layout, set_memory_x); | ||
1975 | frob_rodata(&mod->init_layout, set_memory_x); | ||
1976 | frob_writable_data(&mod->init_layout, set_memory_x); | ||
1977 | } | ||
1978 | |||
1979 | /* Iterate through all modules and set each module's text as RW */ | 1981 | /* Iterate through all modules and set each module's text as RW */ |
1980 | void set_all_modules_text_rw(void) | 1982 | void set_all_modules_text_rw(void) |
1981 | { | 1983 | { |
@@ -2019,23 +2021,8 @@ void set_all_modules_text_ro(void) | |||
2019 | } | 2021 | } |
2020 | mutex_unlock(&module_mutex); | 2022 | mutex_unlock(&module_mutex); |
2021 | } | 2023 | } |
2022 | |||
2023 | static void disable_ro_nx(const struct module_layout *layout) | ||
2024 | { | ||
2025 | if (rodata_enabled) { | ||
2026 | frob_text(layout, set_memory_rw); | ||
2027 | frob_rodata(layout, set_memory_rw); | ||
2028 | frob_ro_after_init(layout, set_memory_rw); | ||
2029 | } | ||
2030 | frob_rodata(layout, set_memory_x); | ||
2031 | frob_ro_after_init(layout, set_memory_x); | ||
2032 | frob_writable_data(layout, set_memory_x); | ||
2033 | } | ||
2034 | |||
2035 | #else | 2024 | #else |
2036 | static void disable_ro_nx(const struct module_layout *layout) { } | ||
2037 | static void module_enable_nx(const struct module *mod) { } | 2025 | static void module_enable_nx(const struct module *mod) { } |
2038 | static void module_disable_nx(const struct module *mod) { } | ||
2039 | #endif | 2026 | #endif |
2040 | 2027 | ||
2041 | #ifdef CONFIG_LIVEPATCH | 2028 | #ifdef CONFIG_LIVEPATCH |
@@ -2115,6 +2102,11 @@ static void free_module_elf(struct module *mod) | |||
2115 | 2102 | ||
2116 | void __weak module_memfree(void *module_region) | 2103 | void __weak module_memfree(void *module_region) |
2117 | { | 2104 | { |
2105 | /* | ||
2106 | * This memory may be RO, and freeing RO memory in an interrupt is not | ||
2107 | * supported by vmalloc. | ||
2108 | */ | ||
2109 | WARN_ON(in_interrupt()); | ||
2118 | vfree(module_region); | 2110 | vfree(module_region); |
2119 | } | 2111 | } |
2120 | 2112 | ||
@@ -2166,7 +2158,6 @@ static void free_module(struct module *mod) | |||
2166 | mutex_unlock(&module_mutex); | 2158 | mutex_unlock(&module_mutex); |
2167 | 2159 | ||
2168 | /* This may be empty, but that's OK */ | 2160 | /* This may be empty, but that's OK */ |
2169 | disable_ro_nx(&mod->init_layout); | ||
2170 | module_arch_freeing_init(mod); | 2161 | module_arch_freeing_init(mod); |
2171 | module_memfree(mod->init_layout.base); | 2162 | module_memfree(mod->init_layout.base); |
2172 | kfree(mod->args); | 2163 | kfree(mod->args); |
@@ -2176,7 +2167,6 @@ static void free_module(struct module *mod) | |||
2176 | lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size); | 2167 | lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size); |
2177 | 2168 | ||
2178 | /* Finally, free the core (containing the module structure) */ | 2169 | /* Finally, free the core (containing the module structure) */ |
2179 | disable_ro_nx(&mod->core_layout); | ||
2180 | module_memfree(mod->core_layout.base); | 2170 | module_memfree(mod->core_layout.base); |
2181 | } | 2171 | } |
2182 | 2172 | ||
@@ -3415,17 +3405,34 @@ static void do_mod_ctors(struct module *mod) | |||
3415 | 3405 | ||
3416 | /* For freeing module_init on success, in case kallsyms traversing */ | 3406 | /* For freeing module_init on success, in case kallsyms traversing */ |
3417 | struct mod_initfree { | 3407 | struct mod_initfree { |
3418 | struct rcu_head rcu; | 3408 | struct llist_node node; |
3419 | void *module_init; | 3409 | void *module_init; |
3420 | }; | 3410 | }; |
3421 | 3411 | ||
3422 | static void do_free_init(struct rcu_head *head) | 3412 | static void do_free_init(struct work_struct *w) |
3423 | { | 3413 | { |
3424 | struct mod_initfree *m = container_of(head, struct mod_initfree, rcu); | 3414 | struct llist_node *pos, *n, *list; |
3425 | module_memfree(m->module_init); | 3415 | struct mod_initfree *initfree; |
3426 | kfree(m); | 3416 | |
3417 | list = llist_del_all(&init_free_list); | ||
3418 | |||
3419 | synchronize_rcu(); | ||
3420 | |||
3421 | llist_for_each_safe(pos, n, list) { | ||
3422 | initfree = container_of(pos, struct mod_initfree, node); | ||
3423 | module_memfree(initfree->module_init); | ||
3424 | kfree(initfree); | ||
3425 | } | ||
3427 | } | 3426 | } |
3428 | 3427 | ||
3428 | static int __init modules_wq_init(void) | ||
3429 | { | ||
3430 | INIT_WORK(&init_free_wq, do_free_init); | ||
3431 | init_llist_head(&init_free_list); | ||
3432 | return 0; | ||
3433 | } | ||
3434 | module_init(modules_wq_init); | ||
3435 | |||
3429 | /* | 3436 | /* |
3430 | * This is where the real work happens. | 3437 | * This is where the real work happens. |
3431 | * | 3438 | * |
@@ -3502,7 +3509,6 @@ static noinline int do_init_module(struct module *mod) | |||
3502 | #endif | 3509 | #endif |
3503 | module_enable_ro(mod, true); | 3510 | module_enable_ro(mod, true); |
3504 | mod_tree_remove_init(mod); | 3511 | mod_tree_remove_init(mod); |
3505 | disable_ro_nx(&mod->init_layout); | ||
3506 | module_arch_freeing_init(mod); | 3512 | module_arch_freeing_init(mod); |
3507 | mod->init_layout.base = NULL; | 3513 | mod->init_layout.base = NULL; |
3508 | mod->init_layout.size = 0; | 3514 | mod->init_layout.size = 0; |
@@ -3513,14 +3519,18 @@ static noinline int do_init_module(struct module *mod) | |||
3513 | * We want to free module_init, but be aware that kallsyms may be | 3519 | * We want to free module_init, but be aware that kallsyms may be |
3514 | * walking this with preempt disabled. In all the failure paths, we | 3520 | * walking this with preempt disabled. In all the failure paths, we |
3515 | * call synchronize_rcu(), but we don't want to slow down the success | 3521 | * call synchronize_rcu(), but we don't want to slow down the success |
3516 | * path, so use actual RCU here. | 3522 | * path. module_memfree() cannot be called in an interrupt, so do the |
3523 | * work and call synchronize_rcu() in a work queue. | ||
3524 | * | ||
3517 | * Note that module_alloc() on most architectures creates W+X page | 3525 | * Note that module_alloc() on most architectures creates W+X page |
3518 | * mappings which won't be cleaned up until do_free_init() runs. Any | 3526 | * mappings which won't be cleaned up until do_free_init() runs. Any |
3519 | * code such as mark_rodata_ro() which depends on those mappings to | 3527 | * code such as mark_rodata_ro() which depends on those mappings to |
3520 | * be cleaned up needs to sync with the queued work - ie | 3528 | * be cleaned up needs to sync with the queued work - ie |
3521 | * rcu_barrier() | 3529 | * rcu_barrier() |
3522 | */ | 3530 | */ |
3523 | call_rcu(&freeinit->rcu, do_free_init); | 3531 | if (llist_add(&freeinit->node, &init_free_list)) |
3532 | schedule_work(&init_free_wq); | ||
3533 | |||
3524 | mutex_unlock(&module_mutex); | 3534 | mutex_unlock(&module_mutex); |
3525 | wake_up_all(&module_wq); | 3535 | wake_up_all(&module_wq); |
3526 | 3536 | ||
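The llist-plus-workqueue shape used here (push onto a lock-free list, kick a work item, drain everything and pay for synchronize_rcu() once in process context) is a reusable pattern rather than something specific to module init sections. A self-contained sketch with hypothetical names (deferred_item, defer_free) that are not part of this patch:

#include <linux/llist.h>
#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/rcupdate.h>

/* Hypothetical example of the deferred-free pattern used by do_free_init():
 * producers push onto a lock-free llist; a single work item drains the
 * whole list after one RCU grace period. */
struct deferred_item {
	struct llist_node node;
	void *payload;
};

static LLIST_HEAD(deferred_list);

static void deferred_worker(struct work_struct *work)
{
	struct llist_node *pos, *n;
	struct llist_node *list = llist_del_all(&deferred_list);

	synchronize_rcu();		/* wait out concurrent RCU readers */

	llist_for_each_safe(pos, n, list) {
		struct deferred_item *item =
			llist_entry(pos, struct deferred_item, node);

		kfree(item->payload);
		kfree(item);
	}
}
static DECLARE_WORK(deferred_work, deferred_worker);

static void defer_free(void *payload)
{
	struct deferred_item *item = kmalloc(sizeof(*item), GFP_KERNEL);

	if (!item) {
		kfree(payload);		/* fall back to freeing immediately */
		return;
	}
	item->payload = payload;

	/* llist_add() returns true only when the list was empty, so the
	 * work item is kicked at most once per batch. */
	if (llist_add(&item->node, &deferred_list))
		schedule_work(&deferred_work);
}

Paying for a single synchronize_rcu() per drained batch is what keeps the cost off the producer's path, mirroring the comment above about not slowing down the module-load success path.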
@@ -3817,10 +3827,6 @@ static int load_module(struct load_info *info, const char __user *uargs, | |||
3817 | module_bug_cleanup(mod); | 3827 | module_bug_cleanup(mod); |
3818 | mutex_unlock(&module_mutex); | 3828 | mutex_unlock(&module_mutex); |
3819 | 3829 | ||
3820 | /* we can't deallocate the module until we clear memory protection */ | ||
3821 | module_disable_ro(mod); | ||
3822 | module_disable_nx(mod); | ||
3823 | |||
3824 | ddebug_cleanup: | 3830 | ddebug_cleanup: |
3825 | ftrace_release_mod(mod); | 3831 | ftrace_release_mod(mod); |
3826 | dynamic_debug_remove(mod, info->debug); | 3832 | dynamic_debug_remove(mod, info->debug); |
diff --git a/kernel/padata.c b/kernel/padata.c index 3e2633ae3bca..2d2fddbb7a4c 100644 --- a/kernel/padata.c +++ b/kernel/padata.c | |||
@@ -957,6 +957,7 @@ static struct attribute *padata_default_attrs[] = { | |||
957 | ¶llel_cpumask_attr.attr, | 957 | ¶llel_cpumask_attr.attr, |
958 | NULL, | 958 | NULL, |
959 | }; | 959 | }; |
960 | ATTRIBUTE_GROUPS(padata_default); | ||
960 | 961 | ||
961 | static ssize_t padata_sysfs_show(struct kobject *kobj, | 962 | static ssize_t padata_sysfs_show(struct kobject *kobj, |
962 | struct attribute *attr, char *buf) | 963 | struct attribute *attr, char *buf) |
@@ -995,7 +996,7 @@ static const struct sysfs_ops padata_sysfs_ops = { | |||
995 | 996 | ||
996 | static struct kobj_type padata_attr_type = { | 997 | static struct kobj_type padata_attr_type = { |
997 | .sysfs_ops = &padata_sysfs_ops, | 998 | .sysfs_ops = &padata_sysfs_ops, |
998 | .default_attrs = padata_default_attrs, | 999 | .default_groups = padata_default_groups, |
999 | .release = padata_sysfs_release, | 1000 | .release = padata_sysfs_release, |
1000 | }; | 1001 | }; |
1001 | 1002 | ||
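The padata hunks are part of the treewide move from kobj_type.default_attrs to kobj_type.default_groups. ATTRIBUTE_GROUPS(padata_default) builds the groups array from the existing padata_default_attrs[]; roughly (simplified expansion, not the exact macro output):

static const struct attribute_group padata_default_group = {
	.attrs = padata_default_attrs,
};
static const struct attribute_group *padata_default_groups[] = {
	&padata_default_group,
	NULL,
};

which is what .default_groups then points at.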
diff --git a/kernel/panic.c b/kernel/panic.c index 0ae0d7332f12..c1fcaad337b7 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
@@ -318,12 +318,7 @@ void panic(const char *fmt, ...) | |||
318 | } | 318 | } |
319 | #endif | 319 | #endif |
320 | #if defined(CONFIG_S390) | 320 | #if defined(CONFIG_S390) |
321 | { | 321 | disabled_wait(); |
322 | unsigned long caller; | ||
323 | |||
324 | caller = (unsigned long)__builtin_return_address(0); | ||
325 | disabled_wait(caller); | ||
326 | } | ||
327 | #endif | 322 | #endif |
328 | pr_emerg("---[ end Kernel panic - not syncing: %s ]---\n", buf); | 323 | pr_emerg("---[ end Kernel panic - not syncing: %s ]---\n", buf); |
329 | local_irq_enable(); | 324 | local_irq_enable(); |
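The panic() hunk works because s390's disabled_wait() now derives the caller address itself rather than taking it as a parameter. The idea, sketched without the arch-specific PSW handling (this is not the real s390 implementation):

void disabled_wait(void)
{
	unsigned long caller = (unsigned long)__builtin_return_address(0);

	/* record 'caller' for the dump tools, then stop the CPU */
}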
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index f8fe57d1022e..9bbaaab14b36 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
@@ -114,6 +114,15 @@ config PM_SLEEP_SMP | |||
114 | depends on PM_SLEEP | 114 | depends on PM_SLEEP |
115 | select HOTPLUG_CPU | 115 | select HOTPLUG_CPU |
116 | 116 | ||
117 | config PM_SLEEP_SMP_NONZERO_CPU | ||
118 | def_bool y | ||
119 | depends on PM_SLEEP_SMP | ||
120 | depends on ARCH_SUSPEND_NONZERO_CPU | ||
121 | ---help--- | ||
122 | If an arch can suspend (for suspend, hibernate, kexec, etc) on a | ||
123 | non-zero numbered CPU, it may define ARCH_SUSPEND_NONZERO_CPU. This | ||
124 | will allow nohz_full mask to include CPU0. | ||
125 | |||
117 | config PM_AUTOSLEEP | 126 | config PM_AUTOSLEEP |
118 | bool "Opportunistic sleep" | 127 | bool "Opportunistic sleep" |
119 | depends on PM_SLEEP | 128 | depends on PM_SLEEP |
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index abef759de7c8..c8c272df7154 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c | |||
@@ -14,7 +14,6 @@ | |||
14 | 14 | ||
15 | #include <linux/export.h> | 15 | #include <linux/export.h> |
16 | #include <linux/suspend.h> | 16 | #include <linux/suspend.h> |
17 | #include <linux/syscalls.h> | ||
18 | #include <linux/reboot.h> | 17 | #include <linux/reboot.h> |
19 | #include <linux/string.h> | 18 | #include <linux/string.h> |
20 | #include <linux/device.h> | 19 | #include <linux/device.h> |
@@ -281,7 +280,7 @@ static int create_image(int platform_mode) | |||
281 | if (error || hibernation_test(TEST_PLATFORM)) | 280 | if (error || hibernation_test(TEST_PLATFORM)) |
282 | goto Platform_finish; | 281 | goto Platform_finish; |
283 | 282 | ||
284 | error = disable_nonboot_cpus(); | 283 | error = suspend_disable_secondary_cpus(); |
285 | if (error || hibernation_test(TEST_CPUS)) | 284 | if (error || hibernation_test(TEST_CPUS)) |
286 | goto Enable_cpus; | 285 | goto Enable_cpus; |
287 | 286 | ||
@@ -323,7 +322,7 @@ static int create_image(int platform_mode) | |||
323 | local_irq_enable(); | 322 | local_irq_enable(); |
324 | 323 | ||
325 | Enable_cpus: | 324 | Enable_cpus: |
326 | enable_nonboot_cpus(); | 325 | suspend_enable_secondary_cpus(); |
327 | 326 | ||
328 | Platform_finish: | 327 | Platform_finish: |
329 | platform_finish(platform_mode); | 328 | platform_finish(platform_mode); |
@@ -417,7 +416,7 @@ int hibernation_snapshot(int platform_mode) | |||
417 | 416 | ||
418 | int __weak hibernate_resume_nonboot_cpu_disable(void) | 417 | int __weak hibernate_resume_nonboot_cpu_disable(void) |
419 | { | 418 | { |
420 | return disable_nonboot_cpus(); | 419 | return suspend_disable_secondary_cpus(); |
421 | } | 420 | } |
422 | 421 | ||
423 | /** | 422 | /** |
@@ -486,7 +485,7 @@ static int resume_target_kernel(bool platform_mode) | |||
486 | local_irq_enable(); | 485 | local_irq_enable(); |
487 | 486 | ||
488 | Enable_cpus: | 487 | Enable_cpus: |
489 | enable_nonboot_cpus(); | 488 | suspend_enable_secondary_cpus(); |
490 | 489 | ||
491 | Cleanup: | 490 | Cleanup: |
492 | platform_restore_cleanup(platform_mode); | 491 | platform_restore_cleanup(platform_mode); |
@@ -564,7 +563,7 @@ int hibernation_platform_enter(void) | |||
564 | if (error) | 563 | if (error) |
565 | goto Platform_finish; | 564 | goto Platform_finish; |
566 | 565 | ||
567 | error = disable_nonboot_cpus(); | 566 | error = suspend_disable_secondary_cpus(); |
568 | if (error) | 567 | if (error) |
569 | goto Enable_cpus; | 568 | goto Enable_cpus; |
570 | 569 | ||
@@ -586,7 +585,7 @@ int hibernation_platform_enter(void) | |||
586 | local_irq_enable(); | 585 | local_irq_enable(); |
587 | 586 | ||
588 | Enable_cpus: | 587 | Enable_cpus: |
589 | enable_nonboot_cpus(); | 588 | suspend_enable_secondary_cpus(); |
590 | 589 | ||
591 | Platform_finish: | 590 | Platform_finish: |
592 | hibernation_ops->finish(); | 591 | hibernation_ops->finish(); |
@@ -709,9 +708,7 @@ int hibernate(void) | |||
709 | goto Exit; | 708 | goto Exit; |
710 | } | 709 | } |
711 | 710 | ||
712 | pr_info("Syncing filesystems ... \n"); | 711 | ksys_sync_helper(); |
713 | ksys_sync(); | ||
714 | pr_info("done.\n"); | ||
715 | 712 | ||
716 | error = freeze_processes(); | 713 | error = freeze_processes(); |
717 | if (error) | 714 | if (error) |
diff --git a/kernel/power/main.c b/kernel/power/main.c index 98e76cad128b..4f43e724f6eb 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
@@ -16,6 +16,7 @@ | |||
16 | #include <linux/debugfs.h> | 16 | #include <linux/debugfs.h> |
17 | #include <linux/seq_file.h> | 17 | #include <linux/seq_file.h> |
18 | #include <linux/suspend.h> | 18 | #include <linux/suspend.h> |
19 | #include <linux/syscalls.h> | ||
19 | 20 | ||
20 | #include "power.h" | 21 | #include "power.h" |
21 | 22 | ||
@@ -51,6 +52,19 @@ void unlock_system_sleep(void) | |||
51 | } | 52 | } |
52 | EXPORT_SYMBOL_GPL(unlock_system_sleep); | 53 | EXPORT_SYMBOL_GPL(unlock_system_sleep); |
53 | 54 | ||
55 | void ksys_sync_helper(void) | ||
56 | { | ||
57 | ktime_t start; | ||
58 | long elapsed_msecs; | ||
59 | |||
60 | start = ktime_get(); | ||
61 | ksys_sync(); | ||
62 | elapsed_msecs = ktime_to_ms(ktime_sub(ktime_get(), start)); | ||
63 | pr_info("Filesystems sync: %ld.%03ld seconds\n", | ||
64 | elapsed_msecs / MSEC_PER_SEC, elapsed_msecs % MSEC_PER_SEC); | ||
65 | } | ||
66 | EXPORT_SYMBOL_GPL(ksys_sync_helper); | ||
67 | |||
54 | /* Routines for PM-transition notifications */ | 68 | /* Routines for PM-transition notifications */ |
55 | 69 | ||
56 | static BLOCKING_NOTIFIER_HEAD(pm_chain_head); | 70 | static BLOCKING_NOTIFIER_HEAD(pm_chain_head); |
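ksys_sync_helper() gives hibernate(), the snapshot ioctl, and suspend a single place that syncs filesystems and reports the elapsed time, replacing three open-coded "Syncing filesystems ..." blocks. On the caller side it collapses to one line; a hypothetical prep helper to illustrate (prepare_for_sleep() is made up):

static int prepare_for_sleep(void)
{
	ksys_sync_helper();	/* logs "Filesystems sync: N.NNN seconds" */
	return freeze_processes();
}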
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index f08a1e4ee1d4..bc9558ab1e5b 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
@@ -1342,8 +1342,9 @@ static inline void do_copy_page(long *dst, long *src) | |||
1342 | * safe_copy_page - Copy a page in a safe way. | 1342 | * safe_copy_page - Copy a page in a safe way. |
1343 | * | 1343 | * |
1344 | * Check if the page we are going to copy is marked as present in the kernel | 1344 | * Check if the page we are going to copy is marked as present in the kernel |
1345 | * page tables (this always is the case if CONFIG_DEBUG_PAGEALLOC is not set | 1345 | * page tables. This always is the case if CONFIG_DEBUG_PAGEALLOC or |
1346 | * and in that case kernel_page_present() always returns 'true'). | 1346 | * CONFIG_ARCH_HAS_SET_DIRECT_MAP is not set. In that case kernel_page_present() |
1347 | * always returns 'true'. | ||
1347 | */ | 1348 | */ |
1348 | static void safe_copy_page(void *dst, struct page *s_page) | 1349 | static void safe_copy_page(void *dst, struct page *s_page) |
1349 | { | 1350 | { |
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 0bd595a0b610..ef908c134b34 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c | |||
@@ -17,7 +17,6 @@ | |||
17 | #include <linux/console.h> | 17 | #include <linux/console.h> |
18 | #include <linux/cpu.h> | 18 | #include <linux/cpu.h> |
19 | #include <linux/cpuidle.h> | 19 | #include <linux/cpuidle.h> |
20 | #include <linux/syscalls.h> | ||
21 | #include <linux/gfp.h> | 20 | #include <linux/gfp.h> |
22 | #include <linux/io.h> | 21 | #include <linux/io.h> |
23 | #include <linux/kernel.h> | 22 | #include <linux/kernel.h> |
@@ -428,7 +427,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) | |||
428 | if (suspend_test(TEST_PLATFORM)) | 427 | if (suspend_test(TEST_PLATFORM)) |
429 | goto Platform_wake; | 428 | goto Platform_wake; |
430 | 429 | ||
431 | error = disable_nonboot_cpus(); | 430 | error = suspend_disable_secondary_cpus(); |
432 | if (error || suspend_test(TEST_CPUS)) | 431 | if (error || suspend_test(TEST_CPUS)) |
433 | goto Enable_cpus; | 432 | goto Enable_cpus; |
434 | 433 | ||
@@ -458,7 +457,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) | |||
458 | BUG_ON(irqs_disabled()); | 457 | BUG_ON(irqs_disabled()); |
459 | 458 | ||
460 | Enable_cpus: | 459 | Enable_cpus: |
461 | enable_nonboot_cpus(); | 460 | suspend_enable_secondary_cpus(); |
462 | 461 | ||
463 | Platform_wake: | 462 | Platform_wake: |
464 | platform_resume_noirq(state); | 463 | platform_resume_noirq(state); |
@@ -568,13 +567,11 @@ static int enter_state(suspend_state_t state) | |||
568 | if (state == PM_SUSPEND_TO_IDLE) | 567 | if (state == PM_SUSPEND_TO_IDLE) |
569 | s2idle_begin(); | 568 | s2idle_begin(); |
570 | 569 | ||
571 | #ifndef CONFIG_SUSPEND_SKIP_SYNC | 570 | if (!IS_ENABLED(CONFIG_SUSPEND_SKIP_SYNC)) { |
572 | trace_suspend_resume(TPS("sync_filesystems"), 0, true); | 571 | trace_suspend_resume(TPS("sync_filesystems"), 0, true); |
573 | pr_info("Syncing filesystems ... "); | 572 | ksys_sync_helper(); |
574 | ksys_sync(); | 573 | trace_suspend_resume(TPS("sync_filesystems"), 0, false); |
575 | pr_cont("done.\n"); | 574 | } |
576 | trace_suspend_resume(TPS("sync_filesystems"), 0, false); | ||
577 | #endif | ||
578 | 575 | ||
579 | pm_pr_dbg("Preparing system for sleep (%s)\n", mem_sleep_labels[state]); | 576 | pm_pr_dbg("Preparing system for sleep (%s)\n", mem_sleep_labels[state]); |
580 | pm_suspend_clear_flags(); | 577 | pm_suspend_clear_flags(); |
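The enter_state() hunk trades the #ifndef CONFIG_SUSPEND_SKIP_SYNC block for IS_ENABLED(), so the sync path is always parsed and type-checked while the compiler drops the dead branch in skip-sync builds. The general pattern (CONFIG_MY_FEATURE and do_feature_work() are made-up names):

if (IS_ENABLED(CONFIG_MY_FEATURE))	/* compile-time constant 0 or 1 */
	do_feature_work();
else
	pr_debug("feature compiled out\n");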
diff --git a/kernel/power/user.c b/kernel/power/user.c index 2d8b60a3c86b..cb24e840a3e6 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c | |||
@@ -10,7 +10,6 @@ | |||
10 | */ | 10 | */ |
11 | 11 | ||
12 | #include <linux/suspend.h> | 12 | #include <linux/suspend.h> |
13 | #include <linux/syscalls.h> | ||
14 | #include <linux/reboot.h> | 13 | #include <linux/reboot.h> |
15 | #include <linux/string.h> | 14 | #include <linux/string.h> |
16 | #include <linux/device.h> | 15 | #include <linux/device.h> |
@@ -228,9 +227,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, | |||
228 | if (data->frozen) | 227 | if (data->frozen) |
229 | break; | 228 | break; |
230 | 229 | ||
231 | printk("Syncing filesystems ... "); | 230 | ksys_sync_helper(); |
232 | ksys_sync(); | ||
233 | printk("done.\n"); | ||
234 | 231 | ||
235 | error = freeze_processes(); | 232 | error = freeze_processes(); |
236 | if (error) | 233 | if (error) |
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 771e93f9c43f..6f357f4fc859 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
@@ -29,6 +29,7 @@ | |||
29 | #include <linux/hw_breakpoint.h> | 29 | #include <linux/hw_breakpoint.h> |
30 | #include <linux/cn_proc.h> | 30 | #include <linux/cn_proc.h> |
31 | #include <linux/compat.h> | 31 | #include <linux/compat.h> |
32 | #include <linux/sched/signal.h> | ||
32 | 33 | ||
33 | /* | 34 | /* |
34 | * Access another process' address space via ptrace. | 35 | * Access another process' address space via ptrace. |
@@ -924,18 +925,26 @@ int ptrace_request(struct task_struct *child, long request, | |||
924 | ret = ptrace_setsiginfo(child, &siginfo); | 925 | ret = ptrace_setsiginfo(child, &siginfo); |
925 | break; | 926 | break; |
926 | 927 | ||
927 | case PTRACE_GETSIGMASK: | 928 | case PTRACE_GETSIGMASK: { |
929 | sigset_t *mask; | ||
930 | |||
928 | if (addr != sizeof(sigset_t)) { | 931 | if (addr != sizeof(sigset_t)) { |
929 | ret = -EINVAL; | 932 | ret = -EINVAL; |
930 | break; | 933 | break; |
931 | } | 934 | } |
932 | 935 | ||
933 | if (copy_to_user(datavp, &child->blocked, sizeof(sigset_t))) | 936 | if (test_tsk_restore_sigmask(child)) |
937 | mask = &child->saved_sigmask; | ||
938 | else | ||
939 | mask = &child->blocked; | ||
940 | |||
941 | if (copy_to_user(datavp, mask, sizeof(sigset_t))) | ||
934 | ret = -EFAULT; | 942 | ret = -EFAULT; |
935 | else | 943 | else |
936 | ret = 0; | 944 | ret = 0; |
937 | 945 | ||
938 | break; | 946 | break; |
947 | } | ||
939 | 948 | ||
940 | case PTRACE_SETSIGMASK: { | 949 | case PTRACE_SETSIGMASK: { |
941 | sigset_t new_set; | 950 | sigset_t new_set; |
@@ -961,6 +970,8 @@ int ptrace_request(struct task_struct *child, long request, | |||
961 | child->blocked = new_set; | 970 | child->blocked = new_set; |
962 | spin_unlock_irq(&child->sighand->siglock); | 971 | spin_unlock_irq(&child->sighand->siglock); |
963 | 972 | ||
973 | clear_tsk_restore_sigmask(child); | ||
974 | |||
964 | ret = 0; | 975 | ret = 0; |
965 | break; | 976 | break; |
966 | } | 977 | } |
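The PTRACE_GETSIGMASK change matters when the tracee is stopped inside sigsuspend()/ppoll(): there child->blocked temporarily holds the syscall's mask and the real mask sits in child->saved_sigmask, which is what the tracer now sees; PTRACE_SETSIGMASK additionally clears the restore flag so an injected mask is not overwritten on return. Tracer-side usage, sketched for a 64-bit target (note the size argument is the kernel's 8-byte sigset_t, not glibc's 128-byte one):

#include <sys/ptrace.h>
#include <err.h>
#include <stdio.h>

unsigned long mask = 0;		/* one word == kernel sigset_t on 64-bit */

if (ptrace(PTRACE_GETSIGMASK, pid, (void *)8UL, &mask) == -1)
	err(1, "PTRACE_GETSIGMASK");
printf("tracee blocked mask: %#lx\n", mask);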
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index acee72c0b24b..4b58c907b4b7 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h | |||
@@ -233,6 +233,7 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) | |||
233 | #ifdef CONFIG_RCU_STALL_COMMON | 233 | #ifdef CONFIG_RCU_STALL_COMMON |
234 | 234 | ||
235 | extern int rcu_cpu_stall_suppress; | 235 | extern int rcu_cpu_stall_suppress; |
236 | extern int rcu_cpu_stall_timeout; | ||
236 | int rcu_jiffies_till_stall_check(void); | 237 | int rcu_jiffies_till_stall_check(void); |
237 | 238 | ||
238 | #define rcu_ftrace_dump_stall_suppress() \ | 239 | #define rcu_ftrace_dump_stall_suppress() \ |
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index c29761152874..7a6890b23c5f 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c | |||
@@ -494,6 +494,10 @@ rcu_perf_cleanup(void) | |||
494 | 494 | ||
495 | if (torture_cleanup_begin()) | 495 | if (torture_cleanup_begin()) |
496 | return; | 496 | return; |
497 | if (!cur_ops) { | ||
498 | torture_cleanup_end(); | ||
499 | return; | ||
500 | } | ||
497 | 501 | ||
498 | if (reader_tasks) { | 502 | if (reader_tasks) { |
499 | for (i = 0; i < nrealreaders; i++) | 503 | for (i = 0; i < nrealreaders; i++) |
@@ -614,6 +618,7 @@ rcu_perf_init(void) | |||
614 | pr_cont("\n"); | 618 | pr_cont("\n"); |
615 | WARN_ON(!IS_MODULE(CONFIG_RCU_PERF_TEST)); | 619 | WARN_ON(!IS_MODULE(CONFIG_RCU_PERF_TEST)); |
616 | firsterr = -EINVAL; | 620 | firsterr = -EINVAL; |
621 | cur_ops = NULL; | ||
617 | goto unwind; | 622 | goto unwind; |
618 | } | 623 | } |
619 | if (cur_ops->init) | 624 | if (cur_ops->init) |
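The rcuperf hunks close a crash window when the module is loaded with an unknown perf_type: init now clears cur_ops before bailing out, and cleanup checks it before touching any ops, so a failed insmod followed by rmmod no longer chases a stale pointer. In outline (surrounding lookup loop elided; the rcutorture hunks below apply the same fix):

/* rcu_perf_init(), after the ops lookup fails */
cur_ops = NULL;
firsterr = -EINVAL;
goto unwind;

/* rcu_perf_cleanup(), before dereferencing cur_ops */
if (!cur_ops) {
	torture_cleanup_end();
	return;
}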
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index f14d1b18a74f..efaa5b3f4d3f 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c | |||
@@ -299,7 +299,6 @@ struct rcu_torture_ops { | |||
299 | int irq_capable; | 299 | int irq_capable; |
300 | int can_boost; | 300 | int can_boost; |
301 | int extendables; | 301 | int extendables; |
302 | int ext_irq_conflict; | ||
303 | const char *name; | 302 | const char *name; |
304 | }; | 303 | }; |
305 | 304 | ||
@@ -592,12 +591,7 @@ static void srcu_torture_init(void) | |||
592 | 591 | ||
593 | static void srcu_torture_cleanup(void) | 592 | static void srcu_torture_cleanup(void) |
594 | { | 593 | { |
595 | static DEFINE_TORTURE_RANDOM(rand); | 594 | cleanup_srcu_struct(&srcu_ctld); |
596 | |||
597 | if (torture_random(&rand) & 0x800) | ||
598 | cleanup_srcu_struct(&srcu_ctld); | ||
599 | else | ||
600 | cleanup_srcu_struct_quiesced(&srcu_ctld); | ||
601 | srcu_ctlp = &srcu_ctl; /* In case of a later rcutorture run. */ | 595 | srcu_ctlp = &srcu_ctl; /* In case of a later rcutorture run. */ |
602 | } | 596 | } |
603 | 597 | ||
@@ -1160,7 +1154,7 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp) | |||
1160 | unsigned long randmask2 = randmask1 >> 3; | 1154 | unsigned long randmask2 = randmask1 >> 3; |
1161 | 1155 | ||
1162 | WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT); | 1156 | WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT); |
1163 | /* Most of the time lots of bits, half the time only one bit. */ | 1157 | /* Mostly only one bit (need preemption!), sometimes lots of bits. */ |
1164 | if (!(randmask1 & 0x7)) | 1158 | if (!(randmask1 & 0x7)) |
1165 | mask = mask & randmask2; | 1159 | mask = mask & randmask2; |
1166 | else | 1160 | else |
@@ -1170,10 +1164,6 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp) | |||
1170 | ((!(mask & RCUTORTURE_RDR_BH) && (oldmask & RCUTORTURE_RDR_BH)) || | 1164 | ((!(mask & RCUTORTURE_RDR_BH) && (oldmask & RCUTORTURE_RDR_BH)) || |
1171 | (!(mask & RCUTORTURE_RDR_RBH) && (oldmask & RCUTORTURE_RDR_RBH)))) | 1165 | (!(mask & RCUTORTURE_RDR_RBH) && (oldmask & RCUTORTURE_RDR_RBH)))) |
1172 | mask |= RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH; | 1166 | mask |= RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH; |
1173 | if ((mask & RCUTORTURE_RDR_IRQ) && | ||
1174 | !(mask & cur_ops->ext_irq_conflict) && | ||
1175 | (oldmask & cur_ops->ext_irq_conflict)) | ||
1176 | mask |= cur_ops->ext_irq_conflict; /* Or if readers object. */ | ||
1177 | return mask ?: RCUTORTURE_RDR_RCU; | 1167 | return mask ?: RCUTORTURE_RDR_RCU; |
1178 | } | 1168 | } |
1179 | 1169 | ||
@@ -1848,7 +1838,7 @@ static int rcutorture_oom_notify(struct notifier_block *self, | |||
1848 | WARN(1, "%s invoked upon OOM during forward-progress testing.\n", | 1838 | WARN(1, "%s invoked upon OOM during forward-progress testing.\n", |
1849 | __func__); | 1839 | __func__); |
1850 | rcu_torture_fwd_cb_hist(); | 1840 | rcu_torture_fwd_cb_hist(); |
1851 | rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rcu_fwd_startat) / 2)); | 1841 | rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rcu_fwd_startat)) / 2); |
1852 | WRITE_ONCE(rcu_fwd_emergency_stop, true); | 1842 | WRITE_ONCE(rcu_fwd_emergency_stop, true); |
1853 | smp_mb(); /* Emergency stop before free and wait to avoid hangs. */ | 1843 | smp_mb(); /* Emergency stop before free and wait to avoid hangs. */ |
1854 | pr_info("%s: Freed %lu RCU callbacks.\n", | 1844 | pr_info("%s: Freed %lu RCU callbacks.\n", |
@@ -2094,6 +2084,10 @@ rcu_torture_cleanup(void) | |||
2094 | cur_ops->cb_barrier(); | 2084 | cur_ops->cb_barrier(); |
2095 | return; | 2085 | return; |
2096 | } | 2086 | } |
2087 | if (!cur_ops) { | ||
2088 | torture_cleanup_end(); | ||
2089 | return; | ||
2090 | } | ||
2097 | 2091 | ||
2098 | rcu_torture_barrier_cleanup(); | 2092 | rcu_torture_barrier_cleanup(); |
2099 | torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task); | 2093 | torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task); |
@@ -2267,6 +2261,7 @@ rcu_torture_init(void) | |||
2267 | pr_cont("\n"); | 2261 | pr_cont("\n"); |
2268 | WARN_ON(!IS_MODULE(CONFIG_RCU_TORTURE_TEST)); | 2262 | WARN_ON(!IS_MODULE(CONFIG_RCU_TORTURE_TEST)); |
2269 | firsterr = -EINVAL; | 2263 | firsterr = -EINVAL; |
2264 | cur_ops = NULL; | ||
2270 | goto unwind; | 2265 | goto unwind; |
2271 | } | 2266 | } |
2272 | if (cur_ops->fqs == NULL && fqs_duration != 0) { | 2267 | if (cur_ops->fqs == NULL && fqs_duration != 0) { |
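One fix in the rcutorture OOM notifier above is pure operator precedence: the old argument divided the raw rcu_fwd_startat timestamp by two before subtracting it from jiffies, producing a nonsense duration; the intent is one jiffy more than half the elapsed time. In miniature:

unsigned long now = jiffies;
unsigned long start = READ_ONCE(rcu_fwd_startat);

/* before: 1 + (now - start / 2)    subtracts half the timestamp   */
/* after:  1 + (now - start) / 2    half the elapsed time, plus 1  */
rcu_fwd_progress_check(1 + (now - start) / 2);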
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 5d4a39a6505a..44d6606b8325 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c | |||
@@ -76,19 +76,16 @@ EXPORT_SYMBOL_GPL(init_srcu_struct); | |||
76 | * Must invoke this after you are finished using a given srcu_struct that | 76 | * Must invoke this after you are finished using a given srcu_struct that |
77 | * was initialized via init_srcu_struct(), else you leak memory. | 77 | * was initialized via init_srcu_struct(), else you leak memory. |
78 | */ | 78 | */ |
79 | void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced) | 79 | void cleanup_srcu_struct(struct srcu_struct *ssp) |
80 | { | 80 | { |
81 | WARN_ON(ssp->srcu_lock_nesting[0] || ssp->srcu_lock_nesting[1]); | 81 | WARN_ON(ssp->srcu_lock_nesting[0] || ssp->srcu_lock_nesting[1]); |
82 | if (quiesced) | 82 | flush_work(&ssp->srcu_work); |
83 | WARN_ON(work_pending(&ssp->srcu_work)); | ||
84 | else | ||
85 | flush_work(&ssp->srcu_work); | ||
86 | WARN_ON(ssp->srcu_gp_running); | 83 | WARN_ON(ssp->srcu_gp_running); |
87 | WARN_ON(ssp->srcu_gp_waiting); | 84 | WARN_ON(ssp->srcu_gp_waiting); |
88 | WARN_ON(ssp->srcu_cb_head); | 85 | WARN_ON(ssp->srcu_cb_head); |
89 | WARN_ON(&ssp->srcu_cb_head != ssp->srcu_cb_tail); | 86 | WARN_ON(&ssp->srcu_cb_head != ssp->srcu_cb_tail); |
90 | } | 87 | } |
91 | EXPORT_SYMBOL_GPL(_cleanup_srcu_struct); | 88 | EXPORT_SYMBOL_GPL(cleanup_srcu_struct); |
92 | 89 | ||
93 | /* | 90 | /* |
94 | * Removes the count for the old reader from the appropriate element of | 91 | * Removes the count for the old reader from the appropriate element of |
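With the quiesced variant gone, Tiny and Tree SRCU are back to a single teardown entry point, and cleanup_srcu_struct() itself flushes the pending grace-period work. Callers still have to drain their own call_srcu() callbacks first, typically with srcu_barrier(); the srcutree hunk below turns a forgotten barrier into a WARN-and-leak. A hedged lifecycle sketch (my_srcu and free_obj_cb are example names):

static struct srcu_struct my_srcu;

init_srcu_struct(&my_srcu);
/* readers use srcu_read_lock()/srcu_read_unlock();
 * updaters queue callbacks: call_srcu(&my_srcu, &obj->rcu, free_obj_cb); */
srcu_barrier(&my_srcu);		/* wait for outstanding callbacks */
cleanup_srcu_struct(&my_srcu);	/* now also flushes the internal work items */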
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index a60b8ba9e1ac..9b761e546de8 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c | |||
@@ -360,8 +360,14 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp) | |||
360 | return SRCU_INTERVAL; | 360 | return SRCU_INTERVAL; |
361 | } | 361 | } |
362 | 362 | ||
363 | /* Helper for cleanup_srcu_struct() and cleanup_srcu_struct_quiesced(). */ | 363 | /** |
364 | void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced) | 364 | * cleanup_srcu_struct - deconstruct a sleep-RCU structure |
365 | * @ssp: structure to clean up. | ||
366 | * | ||
367 | * Must invoke this after you are finished using a given srcu_struct that | ||
368 | * was initialized via init_srcu_struct(), else you leak memory. | ||
369 | */ | ||
370 | void cleanup_srcu_struct(struct srcu_struct *ssp) | ||
365 | { | 371 | { |
366 | int cpu; | 372 | int cpu; |
367 | 373 | ||
@@ -369,24 +375,14 @@ void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced) | |||
369 | return; /* Just leak it! */ | 375 | return; /* Just leak it! */ |
370 | if (WARN_ON(srcu_readers_active(ssp))) | 376 | if (WARN_ON(srcu_readers_active(ssp))) |
371 | return; /* Just leak it! */ | 377 | return; /* Just leak it! */ |
372 | if (quiesced) { | 378 | flush_delayed_work(&ssp->work); |
373 | if (WARN_ON(delayed_work_pending(&ssp->work))) | ||
374 | return; /* Just leak it! */ | ||
375 | } else { | ||
376 | flush_delayed_work(&ssp->work); | ||
377 | } | ||
378 | for_each_possible_cpu(cpu) { | 379 | for_each_possible_cpu(cpu) { |
379 | struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); | 380 | struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); |
380 | 381 | ||
381 | if (quiesced) { | 382 | del_timer_sync(&sdp->delay_work); |
382 | if (WARN_ON(timer_pending(&sdp->delay_work))) | 383 | flush_work(&sdp->work); |
383 | return; /* Just leak it! */ | 384 | if (WARN_ON(rcu_segcblist_n_cbs(&sdp->srcu_cblist))) |
384 | if (WARN_ON(work_pending(&sdp->work))) | 385 | return; /* Forgot srcu_barrier(), so just leak it! */ |
385 | return; /* Just leak it! */ | ||
386 | } else { | ||
387 | del_timer_sync(&sdp->delay_work); | ||
388 | flush_work(&sdp->work); | ||
389 | } | ||
390 | } | 386 | } |
391 | if (WARN_ON(rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) != SRCU_STATE_IDLE) || | 387 | if (WARN_ON(rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) != SRCU_STATE_IDLE) || |
392 | WARN_ON(srcu_readers_active(ssp))) { | 388 | WARN_ON(srcu_readers_active(ssp))) { |
@@ -397,7 +393,7 @@ void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced) | |||
397 | free_percpu(ssp->sda); | 393 | free_percpu(ssp->sda); |
398 | ssp->sda = NULL; | 394 | ssp->sda = NULL; |
399 | } | 395 | } |
400 | EXPORT_SYMBOL_GPL(_cleanup_srcu_struct); | 396 | EXPORT_SYMBOL_GPL(cleanup_srcu_struct); |
401 | 397 | ||
402 | /* | 398 | /* |
403 | * Counts the new reader in the appropriate per-CPU element of the | 399 | * Counts the new reader in the appropriate per-CPU element of the |
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 911bd9076d43..477b4eb44af5 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c | |||
@@ -52,7 +52,7 @@ void rcu_qs(void) | |||
52 | local_irq_save(flags); | 52 | local_irq_save(flags); |
53 | if (rcu_ctrlblk.donetail != rcu_ctrlblk.curtail) { | 53 | if (rcu_ctrlblk.donetail != rcu_ctrlblk.curtail) { |
54 | rcu_ctrlblk.donetail = rcu_ctrlblk.curtail; | 54 | rcu_ctrlblk.donetail = rcu_ctrlblk.curtail; |
55 | raise_softirq(RCU_SOFTIRQ); | 55 | raise_softirq_irqoff(RCU_SOFTIRQ); |
56 | } | 56 | } |
57 | local_irq_restore(flags); | 57 | local_irq_restore(flags); |
58 | } | 58 | } |
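The Tiny RCU hunk is a small context cleanup: rcu_qs() already runs inside local_irq_save()/local_irq_restore(), so the plain raise_softirq(), which saves and restores interrupt state again, is redundant there; raise_softirq_irqoff() is the variant for callers that have interrupts disabled already. The convention in general:

unsigned long flags;

local_irq_save(flags);
/* ... update per-CPU state ... */
raise_softirq_irqoff(RCU_SOFTIRQ);	/* irqs are known to be off here */
local_irq_restore(flags);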
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index acd6ccf56faf..b4d88a594785 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c | |||
@@ -102,11 +102,6 @@ int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; | |||
102 | /* Number of rcu_nodes at specified level. */ | 102 | /* Number of rcu_nodes at specified level. */ |
103 | int num_rcu_lvl[] = NUM_RCU_LVL_INIT; | 103 | int num_rcu_lvl[] = NUM_RCU_LVL_INIT; |
104 | int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ | 104 | int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ |
105 | /* panic() on RCU Stall sysctl. */ | ||
106 | int sysctl_panic_on_rcu_stall __read_mostly; | ||
107 | /* Commandeer a sysrq key to dump RCU's tree. */ | ||
108 | static bool sysrq_rcu; | ||
109 | module_param(sysrq_rcu, bool, 0444); | ||
110 | 105 | ||
111 | /* | 106 | /* |
112 | * The rcu_scheduler_active variable is initialized to the value | 107 | * The rcu_scheduler_active variable is initialized to the value |
@@ -149,7 +144,7 @@ static void sync_sched_exp_online_cleanup(int cpu); | |||
149 | 144 | ||
150 | /* rcuc/rcub kthread realtime priority */ | 145 | /* rcuc/rcub kthread realtime priority */ |
151 | static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0; | 146 | static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0; |
152 | module_param(kthread_prio, int, 0644); | 147 | module_param(kthread_prio, int, 0444); |
153 | 148 | ||
154 | /* Delay in jiffies for grace-period initialization delays, debug only. */ | 149 | /* Delay in jiffies for grace-period initialization delays, debug only. */ |
155 | 150 | ||
@@ -406,7 +401,7 @@ static bool rcu_kick_kthreads; | |||
406 | */ | 401 | */ |
407 | static ulong jiffies_till_sched_qs = ULONG_MAX; | 402 | static ulong jiffies_till_sched_qs = ULONG_MAX; |
408 | module_param(jiffies_till_sched_qs, ulong, 0444); | 403 | module_param(jiffies_till_sched_qs, ulong, 0444); |
409 | static ulong jiffies_to_sched_qs; /* Adjusted version of above if not default */ | 404 | static ulong jiffies_to_sched_qs; /* See adjust_jiffies_till_sched_qs(). */ |
410 | module_param(jiffies_to_sched_qs, ulong, 0444); /* Display only! */ | 405 | module_param(jiffies_to_sched_qs, ulong, 0444); /* Display only! */ |
411 | 406 | ||
412 | /* | 407 | /* |
@@ -424,6 +419,7 @@ static void adjust_jiffies_till_sched_qs(void) | |||
424 | WRITE_ONCE(jiffies_to_sched_qs, jiffies_till_sched_qs); | 419 | WRITE_ONCE(jiffies_to_sched_qs, jiffies_till_sched_qs); |
425 | return; | 420 | return; |
426 | } | 421 | } |
422 | /* Otherwise, set to third fqs scan, but bound below on large system. */ | ||
427 | j = READ_ONCE(jiffies_till_first_fqs) + | 423 | j = READ_ONCE(jiffies_till_first_fqs) + |
428 | 2 * READ_ONCE(jiffies_till_next_fqs); | 424 | 2 * READ_ONCE(jiffies_till_next_fqs); |
429 | if (j < HZ / 10 + nr_cpu_ids / RCU_JIFFIES_FQS_DIV) | 425 | if (j < HZ / 10 + nr_cpu_ids / RCU_JIFFIES_FQS_DIV) |
@@ -513,74 +509,6 @@ static const char *gp_state_getname(short gs) | |||
513 | } | 509 | } |
514 | 510 | ||
515 | /* | 511 | /* |
516 | * Show the state of the grace-period kthreads. | ||
517 | */ | ||
518 | void show_rcu_gp_kthreads(void) | ||
519 | { | ||
520 | int cpu; | ||
521 | unsigned long j; | ||
522 | unsigned long ja; | ||
523 | unsigned long jr; | ||
524 | unsigned long jw; | ||
525 | struct rcu_data *rdp; | ||
526 | struct rcu_node *rnp; | ||
527 | |||
528 | j = jiffies; | ||
529 | ja = j - READ_ONCE(rcu_state.gp_activity); | ||
530 | jr = j - READ_ONCE(rcu_state.gp_req_activity); | ||
531 | jw = j - READ_ONCE(rcu_state.gp_wake_time); | ||
532 | pr_info("%s: wait state: %s(%d) ->state: %#lx delta ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_flags %#x\n", | ||
533 | rcu_state.name, gp_state_getname(rcu_state.gp_state), | ||
534 | rcu_state.gp_state, | ||
535 | rcu_state.gp_kthread ? rcu_state.gp_kthread->state : 0x1ffffL, | ||
536 | ja, jr, jw, (long)READ_ONCE(rcu_state.gp_wake_seq), | ||
537 | (long)READ_ONCE(rcu_state.gp_seq), | ||
538 | (long)READ_ONCE(rcu_get_root()->gp_seq_needed), | ||
539 | READ_ONCE(rcu_state.gp_flags)); | ||
540 | rcu_for_each_node_breadth_first(rnp) { | ||
541 | if (ULONG_CMP_GE(rcu_state.gp_seq, rnp->gp_seq_needed)) | ||
542 | continue; | ||
543 | pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld\n", | ||
544 | rnp->grplo, rnp->grphi, (long)rnp->gp_seq, | ||
545 | (long)rnp->gp_seq_needed); | ||
546 | if (!rcu_is_leaf_node(rnp)) | ||
547 | continue; | ||
548 | for_each_leaf_node_possible_cpu(rnp, cpu) { | ||
549 | rdp = per_cpu_ptr(&rcu_data, cpu); | ||
550 | if (rdp->gpwrap || | ||
551 | ULONG_CMP_GE(rcu_state.gp_seq, | ||
552 | rdp->gp_seq_needed)) | ||
553 | continue; | ||
554 | pr_info("\tcpu %d ->gp_seq_needed %ld\n", | ||
555 | cpu, (long)rdp->gp_seq_needed); | ||
556 | } | ||
557 | } | ||
558 | /* sched_show_task(rcu_state.gp_kthread); */ | ||
559 | } | ||
560 | EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads); | ||
561 | |||
562 | /* Dump grace-period-request information due to commandeered sysrq. */ | ||
563 | static void sysrq_show_rcu(int key) | ||
564 | { | ||
565 | show_rcu_gp_kthreads(); | ||
566 | } | ||
567 | |||
568 | static struct sysrq_key_op sysrq_rcudump_op = { | ||
569 | .handler = sysrq_show_rcu, | ||
570 | .help_msg = "show-rcu(y)", | ||
571 | .action_msg = "Show RCU tree", | ||
572 | .enable_mask = SYSRQ_ENABLE_DUMP, | ||
573 | }; | ||
574 | |||
575 | static int __init rcu_sysrq_init(void) | ||
576 | { | ||
577 | if (sysrq_rcu) | ||
578 | return register_sysrq_key('y', &sysrq_rcudump_op); | ||
579 | return 0; | ||
580 | } | ||
581 | early_initcall(rcu_sysrq_init); | ||
582 | |||
583 | /* | ||
584 | * Send along grace-period-related data for rcutorture diagnostics. | 512 | * Send along grace-period-related data for rcutorture diagnostics. |
585 | */ | 513 | */ |
586 | void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, | 514 | void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, |
@@ -1034,27 +962,6 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp) | |||
1034 | } | 962 | } |
1035 | 963 | ||
1036 | /* | 964 | /* |
1037 | * Handler for the irq_work request posted when a grace period has | ||
1038 | * gone on for too long, but not yet long enough for an RCU CPU | ||
1039 | * stall warning. Set state appropriately, but just complain if | ||
1040 | * there is unexpected state on entry. | ||
1041 | */ | ||
1042 | static void rcu_iw_handler(struct irq_work *iwp) | ||
1043 | { | ||
1044 | struct rcu_data *rdp; | ||
1045 | struct rcu_node *rnp; | ||
1046 | |||
1047 | rdp = container_of(iwp, struct rcu_data, rcu_iw); | ||
1048 | rnp = rdp->mynode; | ||
1049 | raw_spin_lock_rcu_node(rnp); | ||
1050 | if (!WARN_ON_ONCE(!rdp->rcu_iw_pending)) { | ||
1051 | rdp->rcu_iw_gp_seq = rnp->gp_seq; | ||
1052 | rdp->rcu_iw_pending = false; | ||
1053 | } | ||
1054 | raw_spin_unlock_rcu_node(rnp); | ||
1055 | } | ||
1056 | |||
1057 | /* | ||
1058 | * Return true if the specified CPU has passed through a quiescent | 965 | * Return true if the specified CPU has passed through a quiescent |
1059 | * state by virtue of being in or having passed through an dynticks | 966 | * state by virtue of being in or having passed through an dynticks |
1060 | * idle state since the last call to dyntick_save_progress_counter() | 967 | * idle state since the last call to dyntick_save_progress_counter() |
@@ -1167,295 +1074,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) | |||
1167 | return 0; | 1074 | return 0; |
1168 | } | 1075 | } |
1169 | 1076 | ||
1170 | static void record_gp_stall_check_time(void) | ||
1171 | { | ||
1172 | unsigned long j = jiffies; | ||
1173 | unsigned long j1; | ||
1174 | |||
1175 | rcu_state.gp_start = j; | ||
1176 | j1 = rcu_jiffies_till_stall_check(); | ||
1177 | /* Record ->gp_start before ->jiffies_stall. */ | ||
1178 | smp_store_release(&rcu_state.jiffies_stall, j + j1); /* ^^^ */ | ||
1179 | rcu_state.jiffies_resched = j + j1 / 2; | ||
1180 | rcu_state.n_force_qs_gpstart = READ_ONCE(rcu_state.n_force_qs); | ||
1181 | } | ||
1182 | |||
1183 | /* | ||
1184 | * Complain about starvation of grace-period kthread. | ||
1185 | */ | ||
1186 | static void rcu_check_gp_kthread_starvation(void) | ||
1187 | { | ||
1188 | struct task_struct *gpk = rcu_state.gp_kthread; | ||
1189 | unsigned long j; | ||
1190 | |||
1191 | j = jiffies - READ_ONCE(rcu_state.gp_activity); | ||
1192 | if (j > 2 * HZ) { | ||
1193 | pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n", | ||
1194 | rcu_state.name, j, | ||
1195 | (long)rcu_seq_current(&rcu_state.gp_seq), | ||
1196 | READ_ONCE(rcu_state.gp_flags), | ||
1197 | gp_state_getname(rcu_state.gp_state), rcu_state.gp_state, | ||
1198 | gpk ? gpk->state : ~0, gpk ? task_cpu(gpk) : -1); | ||
1199 | if (gpk) { | ||
1200 | pr_err("RCU grace-period kthread stack dump:\n"); | ||
1201 | sched_show_task(gpk); | ||
1202 | wake_up_process(gpk); | ||
1203 | } | ||
1204 | } | ||
1205 | } | ||
1206 | |||
1207 | /* | ||
1208 | * Dump stacks of all tasks running on stalled CPUs. First try using | ||
1209 | * NMIs, but fall back to manual remote stack tracing on architectures | ||
1210 | * that don't support NMI-based stack dumps. The NMI-triggered stack | ||
1211 | * traces are more accurate because they are printed by the target CPU. | ||
1212 | */ | ||
1213 | static void rcu_dump_cpu_stacks(void) | ||
1214 | { | ||
1215 | int cpu; | ||
1216 | unsigned long flags; | ||
1217 | struct rcu_node *rnp; | ||
1218 | |||
1219 | rcu_for_each_leaf_node(rnp) { | ||
1220 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | ||
1221 | for_each_leaf_node_possible_cpu(rnp, cpu) | ||
1222 | if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) | ||
1223 | if (!trigger_single_cpu_backtrace(cpu)) | ||
1224 | dump_cpu_task(cpu); | ||
1225 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | ||
1226 | } | ||
1227 | } | ||
1228 | |||
1229 | /* | ||
1230 | * If too much time has passed in the current grace period, and if | ||
1231 | * so configured, go kick the relevant kthreads. | ||
1232 | */ | ||
1233 | static void rcu_stall_kick_kthreads(void) | ||
1234 | { | ||
1235 | unsigned long j; | ||
1236 | |||
1237 | if (!rcu_kick_kthreads) | ||
1238 | return; | ||
1239 | j = READ_ONCE(rcu_state.jiffies_kick_kthreads); | ||
1240 | if (time_after(jiffies, j) && rcu_state.gp_kthread && | ||
1241 | (rcu_gp_in_progress() || READ_ONCE(rcu_state.gp_flags))) { | ||
1242 | WARN_ONCE(1, "Kicking %s grace-period kthread\n", | ||
1243 | rcu_state.name); | ||
1244 | rcu_ftrace_dump(DUMP_ALL); | ||
1245 | wake_up_process(rcu_state.gp_kthread); | ||
1246 | WRITE_ONCE(rcu_state.jiffies_kick_kthreads, j + HZ); | ||
1247 | } | ||
1248 | } | ||
1249 | |||
1250 | static void panic_on_rcu_stall(void) | ||
1251 | { | ||
1252 | if (sysctl_panic_on_rcu_stall) | ||
1253 | panic("RCU Stall\n"); | ||
1254 | } | ||
1255 | |||
1256 | static void print_other_cpu_stall(unsigned long gp_seq) | ||
1257 | { | ||
1258 | int cpu; | ||
1259 | unsigned long flags; | ||
1260 | unsigned long gpa; | ||
1261 | unsigned long j; | ||
1262 | int ndetected = 0; | ||
1263 | struct rcu_node *rnp = rcu_get_root(); | ||
1264 | long totqlen = 0; | ||
1265 | |||
1266 | /* Kick and suppress, if so configured. */ | ||
1267 | rcu_stall_kick_kthreads(); | ||
1268 | if (rcu_cpu_stall_suppress) | ||
1269 | return; | ||
1270 | |||
1271 | /* | ||
1272 | * OK, time to rat on our buddy... | ||
1273 | * See Documentation/RCU/stallwarn.txt for info on how to debug | ||
1274 | * RCU CPU stall warnings. | ||
1275 | */ | ||
1276 | pr_err("INFO: %s detected stalls on CPUs/tasks:", rcu_state.name); | ||
1277 | print_cpu_stall_info_begin(); | ||
1278 | rcu_for_each_leaf_node(rnp) { | ||
1279 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | ||
1280 | ndetected += rcu_print_task_stall(rnp); | ||
1281 | if (rnp->qsmask != 0) { | ||
1282 | for_each_leaf_node_possible_cpu(rnp, cpu) | ||
1283 | if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) { | ||
1284 | print_cpu_stall_info(cpu); | ||
1285 | ndetected++; | ||
1286 | } | ||
1287 | } | ||
1288 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | ||
1289 | } | ||
1290 | |||
1291 | print_cpu_stall_info_end(); | ||
1292 | for_each_possible_cpu(cpu) | ||
1293 | totqlen += rcu_get_n_cbs_cpu(cpu); | ||
1294 | pr_cont("(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n", | ||
1295 | smp_processor_id(), (long)(jiffies - rcu_state.gp_start), | ||
1296 | (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); | ||
1297 | if (ndetected) { | ||
1298 | rcu_dump_cpu_stacks(); | ||
1299 | |||
1300 | /* Complain about tasks blocking the grace period. */ | ||
1301 | rcu_print_detail_task_stall(); | ||
1302 | } else { | ||
1303 | if (rcu_seq_current(&rcu_state.gp_seq) != gp_seq) { | ||
1304 | pr_err("INFO: Stall ended before state dump start\n"); | ||
1305 | } else { | ||
1306 | j = jiffies; | ||
1307 | gpa = READ_ONCE(rcu_state.gp_activity); | ||
1308 | pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n", | ||
1309 | rcu_state.name, j - gpa, j, gpa, | ||
1310 | READ_ONCE(jiffies_till_next_fqs), | ||
1311 | rcu_get_root()->qsmask); | ||
1312 | /* In this case, the current CPU might be at fault. */ | ||
1313 | sched_show_task(current); | ||
1314 | } | ||
1315 | } | ||
1316 | /* Rewrite if needed in case of slow consoles. */ | ||
1317 | if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall))) | ||
1318 | WRITE_ONCE(rcu_state.jiffies_stall, | ||
1319 | jiffies + 3 * rcu_jiffies_till_stall_check() + 3); | ||
1320 | |||
1321 | rcu_check_gp_kthread_starvation(); | ||
1322 | |||
1323 | panic_on_rcu_stall(); | ||
1324 | |||
1325 | rcu_force_quiescent_state(); /* Kick them all. */ | ||
1326 | } | ||
1327 | |||
1328 | static void print_cpu_stall(void) | ||
1329 | { | ||
1330 | int cpu; | ||
1331 | unsigned long flags; | ||
1332 | struct rcu_data *rdp = this_cpu_ptr(&rcu_data); | ||
1333 | struct rcu_node *rnp = rcu_get_root(); | ||
1334 | long totqlen = 0; | ||
1335 | |||
1336 | /* Kick and suppress, if so configured. */ | ||
1337 | rcu_stall_kick_kthreads(); | ||
1338 | if (rcu_cpu_stall_suppress) | ||
1339 | return; | ||
1340 | |||
1341 | /* | ||
1342 | * OK, time to rat on ourselves... | ||
1343 | * See Documentation/RCU/stallwarn.txt for info on how to debug | ||
1344 | * RCU CPU stall warnings. | ||
1345 | */ | ||
1346 | pr_err("INFO: %s self-detected stall on CPU", rcu_state.name); | ||
1347 | print_cpu_stall_info_begin(); | ||
1348 | raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags); | ||
1349 | print_cpu_stall_info(smp_processor_id()); | ||
1350 | raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags); | ||
1351 | print_cpu_stall_info_end(); | ||
1352 | for_each_possible_cpu(cpu) | ||
1353 | totqlen += rcu_get_n_cbs_cpu(cpu); | ||
1354 | pr_cont(" (t=%lu jiffies g=%ld q=%lu)\n", | ||
1355 | jiffies - rcu_state.gp_start, | ||
1356 | (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); | ||
1357 | |||
1358 | rcu_check_gp_kthread_starvation(); | ||
1359 | |||
1360 | rcu_dump_cpu_stacks(); | ||
1361 | |||
1362 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | ||
1363 | /* Rewrite if needed in case of slow consoles. */ | ||
1364 | if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall))) | ||
1365 | WRITE_ONCE(rcu_state.jiffies_stall, | ||
1366 | jiffies + 3 * rcu_jiffies_till_stall_check() + 3); | ||
1367 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | ||
1368 | |||
1369 | panic_on_rcu_stall(); | ||
1370 | |||
1371 | /* | ||
1372 | * Attempt to revive the RCU machinery by forcing a context switch. | ||
1373 | * | ||
1374 | * A context switch would normally allow the RCU state machine to make | ||
1375 | * progress and it could be we're stuck in kernel space without context | ||
1376 | * switches for an entirely unreasonable amount of time. | ||
1377 | */ | ||
1378 | set_tsk_need_resched(current); | ||
1379 | set_preempt_need_resched(); | ||
1380 | } | ||
1381 | |||
1382 | static void check_cpu_stall(struct rcu_data *rdp) | ||
1383 | { | ||
1384 | unsigned long gs1; | ||
1385 | unsigned long gs2; | ||
1386 | unsigned long gps; | ||
1387 | unsigned long j; | ||
1388 | unsigned long jn; | ||
1389 | unsigned long js; | ||
1390 | struct rcu_node *rnp; | ||
1391 | |||
1392 | if ((rcu_cpu_stall_suppress && !rcu_kick_kthreads) || | ||
1393 | !rcu_gp_in_progress()) | ||
1394 | return; | ||
1395 | rcu_stall_kick_kthreads(); | ||
1396 | j = jiffies; | ||
1397 | |||
1398 | /* | ||
1399 | * Lots of memory barriers to reject false positives. | ||
1400 | * | ||
1401 | * The idea is to pick up rcu_state.gp_seq, then | ||
1402 | * rcu_state.jiffies_stall, then rcu_state.gp_start, and finally | ||
1403 | * another copy of rcu_state.gp_seq. These values are updated in | ||
1404 | * the opposite order with memory barriers (or equivalent) during | ||
1405 | * grace-period initialization and cleanup. Now, a false positive | ||
1406 | * can occur if we get an new value of rcu_state.gp_start and a old | ||
1407 | * value of rcu_state.jiffies_stall. But given the memory barriers, | ||
1408 | * the only way that this can happen is if one grace period ends | ||
1409 | * and another starts between these two fetches. This is detected | ||
1410 | * by comparing the second fetch of rcu_state.gp_seq with the | ||
1411 | * previous fetch from rcu_state.gp_seq. | ||
1412 | * | ||
1413 | * Given this check, comparisons of jiffies, rcu_state.jiffies_stall, | ||
1414 | * and rcu_state.gp_start suffice to forestall false positives. | ||
1415 | */ | ||
1416 | gs1 = READ_ONCE(rcu_state.gp_seq); | ||
1417 | smp_rmb(); /* Pick up ->gp_seq first... */ | ||
1418 | js = READ_ONCE(rcu_state.jiffies_stall); | ||
1419 | smp_rmb(); /* ...then ->jiffies_stall before the rest... */ | ||
1420 | gps = READ_ONCE(rcu_state.gp_start); | ||
1421 | smp_rmb(); /* ...and finally ->gp_start before ->gp_seq again. */ | ||
1422 | gs2 = READ_ONCE(rcu_state.gp_seq); | ||
1423 | if (gs1 != gs2 || | ||
1424 | ULONG_CMP_LT(j, js) || | ||
1425 | ULONG_CMP_GE(gps, js)) | ||
1426 | return; /* No stall or GP completed since entering function. */ | ||
1427 | rnp = rdp->mynode; | ||
1428 | jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; | ||
1429 | if (rcu_gp_in_progress() && | ||
1430 | (READ_ONCE(rnp->qsmask) & rdp->grpmask) && | ||
1431 | cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { | ||
1432 | |||
1433 | /* We haven't checked in, so go dump stack. */ | ||
1434 | print_cpu_stall(); | ||
1435 | |||
1436 | } else if (rcu_gp_in_progress() && | ||
1437 | ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) && | ||
1438 | cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { | ||
1439 | |||
1440 | /* They had a few time units to dump stack, so complain. */ | ||
1441 | print_other_cpu_stall(gs2); | ||
1442 | } | ||
1443 | } | ||
1444 | |||
1445 | /** | ||
1446 | * rcu_cpu_stall_reset - prevent further stall warnings in current grace period | ||
1447 | * | ||
1448 | * Set the stall-warning timeout way off into the future, thus preventing | ||
1449 | * any RCU CPU stall-warning messages from appearing in the current set of | ||
1450 | * RCU grace periods. | ||
1451 | * | ||
1452 | * The caller must disable hard irqs. | ||
1453 | */ | ||
1454 | void rcu_cpu_stall_reset(void) | ||
1455 | { | ||
1456 | WRITE_ONCE(rcu_state.jiffies_stall, jiffies + ULONG_MAX / 2); | ||
1457 | } | ||
1458 | |||
1459 | /* Trace-event wrapper function for trace_rcu_future_grace_period. */ | 1077 | /* Trace-event wrapper function for trace_rcu_future_grace_period. */ |
1460 | static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, | 1078 | static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, |
1461 | unsigned long gp_seq_req, const char *s) | 1079 | unsigned long gp_seq_req, const char *s) |
@@ -1585,7 +1203,7 @@ static bool rcu_future_gp_cleanup(struct rcu_node *rnp) | |||
1585 | static void rcu_gp_kthread_wake(void) | 1203 | static void rcu_gp_kthread_wake(void) |
1586 | { | 1204 | { |
1587 | if ((current == rcu_state.gp_kthread && | 1205 | if ((current == rcu_state.gp_kthread && |
1588 | !in_interrupt() && !in_serving_softirq()) || | 1206 | !in_irq() && !in_serving_softirq()) || |
1589 | !READ_ONCE(rcu_state.gp_flags) || | 1207 | !READ_ONCE(rcu_state.gp_flags) || |
1590 | !rcu_state.gp_kthread) | 1208 | !rcu_state.gp_kthread) |
1591 | return; | 1209 | return; |
@@ -2295,11 +1913,10 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp) | |||
2295 | return; | 1913 | return; |
2296 | } | 1914 | } |
2297 | mask = rdp->grpmask; | 1915 | mask = rdp->grpmask; |
1916 | rdp->core_needs_qs = false; | ||
2298 | if ((rnp->qsmask & mask) == 0) { | 1917 | if ((rnp->qsmask & mask) == 0) { |
2299 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | 1918 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); |
2300 | } else { | 1919 | } else { |
2301 | rdp->core_needs_qs = false; | ||
2302 | |||
2303 | /* | 1920 | /* |
2304 | * This GP can't end until cpu checks in, so all of our | 1921 | * This GP can't end until cpu checks in, so all of our |
2305 | * callbacks can be processed during the next GP. | 1922 | * callbacks can be processed during the next GP. |
@@ -2548,11 +2165,11 @@ void rcu_sched_clock_irq(int user) | |||
2548 | } | 2165 | } |
2549 | 2166 | ||
2550 | /* | 2167 | /* |
2551 | * Scan the leaf rcu_node structures, processing dyntick state for any that | 2168 | * Scan the leaf rcu_node structures. For each structure on which all |
2552 | * have not yet encountered a quiescent state, using the function specified. | 2169 | * CPUs have reported a quiescent state and on which there are tasks |
2553 | * Also initiate boosting for any threads blocked on the root rcu_node. | 2170 | * blocking the current grace period, initiate RCU priority boosting. |
2554 | * | 2171 | * Otherwise, invoke the specified function to check dyntick state for |
2555 | * The caller must have suppressed start of new grace periods. | 2172 | * each CPU that has not yet reported a quiescent state. |
2556 | */ | 2173 | */ |
2557 | static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) | 2174 | static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) |
2558 | { | 2175 | { |
@@ -2635,101 +2252,6 @@ void rcu_force_quiescent_state(void) | |||
2635 | } | 2252 | } |
2636 | EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); | 2253 | EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); |
2637 | 2254 | ||
2638 | /* | ||
2639 | * This function checks for grace-period requests that fail to motivate | ||
2640 | * RCU to come out of its idle mode. | ||
2641 | */ | ||
2642 | void | ||
2643 | rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp, | ||
2644 | const unsigned long gpssdelay) | ||
2645 | { | ||
2646 | unsigned long flags; | ||
2647 | unsigned long j; | ||
2648 | struct rcu_node *rnp_root = rcu_get_root(); | ||
2649 | static atomic_t warned = ATOMIC_INIT(0); | ||
2650 | |||
2651 | if (!IS_ENABLED(CONFIG_PROVE_RCU) || rcu_gp_in_progress() || | ||
2652 | ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed)) | ||
2653 | return; | ||
2654 | j = jiffies; /* Expensive access, and in common case don't get here. */ | ||
2655 | if (time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) || | ||
2656 | time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) || | ||
2657 | atomic_read(&warned)) | ||
2658 | return; | ||
2659 | |||
2660 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | ||
2661 | j = jiffies; | ||
2662 | if (rcu_gp_in_progress() || | ||
2663 | ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) || | ||
2664 | time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) || | ||
2665 | time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) || | ||
2666 | atomic_read(&warned)) { | ||
2667 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | ||
2668 | return; | ||
2669 | } | ||
2670 | /* Hold onto the leaf lock to make others see warned==1. */ | ||
2671 | |||
2672 | if (rnp_root != rnp) | ||
2673 | raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */ | ||
2674 | j = jiffies; | ||
2675 | if (rcu_gp_in_progress() || | ||
2676 | ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) || | ||
2677 | time_before(j, rcu_state.gp_req_activity + gpssdelay) || | ||
2678 | time_before(j, rcu_state.gp_activity + gpssdelay) || | ||
2679 | atomic_xchg(&warned, 1)) { | ||
2680 | raw_spin_unlock_rcu_node(rnp_root); /* irqs remain disabled. */ | ||
2681 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | ||
2682 | return; | ||
2683 | } | ||
2684 | WARN_ON(1); | ||
2685 | if (rnp_root != rnp) | ||
2686 | raw_spin_unlock_rcu_node(rnp_root); | ||
2687 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | ||
2688 | show_rcu_gp_kthreads(); | ||
2689 | } | ||
2690 | |||
2691 | /* | ||
2692 | * Do a forward-progress check for rcutorture. This is normally invoked | ||
2693 | * due to an OOM event. The argument "j" gives the time period during | ||
2694 | * which rcutorture would like progress to have been made. | ||
2695 | */ | ||
2696 | void rcu_fwd_progress_check(unsigned long j) | ||
2697 | { | ||
2698 | unsigned long cbs; | ||
2699 | int cpu; | ||
2700 | unsigned long max_cbs = 0; | ||
2701 | int max_cpu = -1; | ||
2702 | struct rcu_data *rdp; | ||
2703 | |||
2704 | if (rcu_gp_in_progress()) { | ||
2705 | pr_info("%s: GP age %lu jiffies\n", | ||
2706 | __func__, jiffies - rcu_state.gp_start); | ||
2707 | show_rcu_gp_kthreads(); | ||
2708 | } else { | ||
2709 | pr_info("%s: Last GP end %lu jiffies ago\n", | ||
2710 | __func__, jiffies - rcu_state.gp_end); | ||
2711 | preempt_disable(); | ||
2712 | rdp = this_cpu_ptr(&rcu_data); | ||
2713 | rcu_check_gp_start_stall(rdp->mynode, rdp, j); | ||
2714 | preempt_enable(); | ||
2715 | } | ||
2716 | for_each_possible_cpu(cpu) { | ||
2717 | cbs = rcu_get_n_cbs_cpu(cpu); | ||
2718 | if (!cbs) | ||
2719 | continue; | ||
2720 | if (max_cpu < 0) | ||
2721 | pr_info("%s: callbacks", __func__); | ||
2722 | pr_cont(" %d: %lu", cpu, cbs); | ||
2723 | if (cbs <= max_cbs) | ||
2724 | continue; | ||
2725 | max_cbs = cbs; | ||
2726 | max_cpu = cpu; | ||
2727 | } | ||
2728 | if (max_cpu >= 0) | ||
2729 | pr_cont("\n"); | ||
2730 | } | ||
2731 | EXPORT_SYMBOL_GPL(rcu_fwd_progress_check); | ||
2732 | |||
2733 | /* Perform RCU core processing work for the current CPU. */ | 2255 | /* Perform RCU core processing work for the current CPU. */ |
2734 | static __latent_entropy void rcu_core(struct softirq_action *unused) | 2256 | static __latent_entropy void rcu_core(struct softirq_action *unused) |
2735 | { | 2257 | { |
@@ -2870,7 +2392,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy) | |||
2870 | * Use rcu:rcu_callback trace event to find the previous | 2392 | * Use rcu:rcu_callback trace event to find the previous |
2871 | * time callback was passed to __call_rcu(). | 2393 | * time callback was passed to __call_rcu(). |
2872 | */ | 2394 | */ |
2873 | WARN_ONCE(1, "__call_rcu(): Double-freed CB %p->%pF()!!!\n", | 2395 | WARN_ONCE(1, "__call_rcu(): Double-freed CB %p->%pS()!!!\n", |
2874 | head, head->func); | 2396 | head, head->func); |
2875 | WRITE_ONCE(head->func, rcu_leak_callback); | 2397 | WRITE_ONCE(head->func, rcu_leak_callback); |
2876 | return; | 2398 | return; |
@@ -3559,13 +3081,11 @@ static int rcu_pm_notify(struct notifier_block *self, | |||
3559 | switch (action) { | 3081 | switch (action) { |
3560 | case PM_HIBERNATION_PREPARE: | 3082 | case PM_HIBERNATION_PREPARE: |
3561 | case PM_SUSPEND_PREPARE: | 3083 | case PM_SUSPEND_PREPARE: |
3562 | if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */ | 3084 | rcu_expedite_gp(); |
3563 | rcu_expedite_gp(); | ||
3564 | break; | 3085 | break; |
3565 | case PM_POST_HIBERNATION: | 3086 | case PM_POST_HIBERNATION: |
3566 | case PM_POST_SUSPEND: | 3087 | case PM_POST_SUSPEND: |
3567 | if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */ | 3088 | rcu_unexpedite_gp(); |
3568 | rcu_unexpedite_gp(); | ||
3569 | break; | 3089 | break; |
3570 | default: | 3090 | default: |
3571 | break; | 3091 | break; |
@@ -3742,8 +3262,7 @@ static void __init rcu_init_geometry(void) | |||
3742 | jiffies_till_first_fqs = d; | 3262 | jiffies_till_first_fqs = d; |
3743 | if (jiffies_till_next_fqs == ULONG_MAX) | 3263 | if (jiffies_till_next_fqs == ULONG_MAX) |
3744 | jiffies_till_next_fqs = d; | 3264 | jiffies_till_next_fqs = d; |
3745 | if (jiffies_till_sched_qs == ULONG_MAX) | 3265 | adjust_jiffies_till_sched_qs(); |
3746 | adjust_jiffies_till_sched_qs(); | ||
3747 | 3266 | ||
3748 | /* If the compile-time values are accurate, just leave. */ | 3267 | /* If the compile-time values are accurate, just leave. */ |
3749 | if (rcu_fanout_leaf == RCU_FANOUT_LEAF && | 3268 | if (rcu_fanout_leaf == RCU_FANOUT_LEAF && |
@@ -3858,5 +3377,6 @@ void __init rcu_init(void) | |||
3858 | srcu_init(); | 3377 | srcu_init(); |
3859 | } | 3378 | } |
3860 | 3379 | ||
3380 | #include "tree_stall.h" | ||
3861 | #include "tree_exp.h" | 3381 | #include "tree_exp.h" |
3862 | #include "tree_plugin.h" | 3382 | #include "tree_plugin.h" |
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index bb4f995f2d3f..e253d11af3c4 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h | |||
@@ -393,15 +393,13 @@ static const char *tp_rcu_varname __used __tracepoint_string = rcu_name; | |||
393 | 393 | ||
394 | int rcu_dynticks_snap(struct rcu_data *rdp); | 394 | int rcu_dynticks_snap(struct rcu_data *rdp); |
395 | 395 | ||
396 | /* Forward declarations for rcutree_plugin.h */ | 396 | /* Forward declarations for tree_plugin.h */ |
397 | static void rcu_bootup_announce(void); | 397 | static void rcu_bootup_announce(void); |
398 | static void rcu_qs(void); | 398 | static void rcu_qs(void); |
399 | static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); | 399 | static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); |
400 | #ifdef CONFIG_HOTPLUG_CPU | 400 | #ifdef CONFIG_HOTPLUG_CPU |
401 | static bool rcu_preempt_has_tasks(struct rcu_node *rnp); | 401 | static bool rcu_preempt_has_tasks(struct rcu_node *rnp); |
402 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 402 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ |
403 | static void rcu_print_detail_task_stall(void); | ||
404 | static int rcu_print_task_stall(struct rcu_node *rnp); | ||
405 | static int rcu_print_task_exp_stall(struct rcu_node *rnp); | 403 | static int rcu_print_task_exp_stall(struct rcu_node *rnp); |
406 | static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); | 404 | static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); |
407 | static void rcu_flavor_sched_clock_irq(int user); | 405 | static void rcu_flavor_sched_clock_irq(int user); |
@@ -418,9 +416,6 @@ static void rcu_prepare_for_idle(void); | |||
418 | static bool rcu_preempt_has_tasks(struct rcu_node *rnp); | 416 | static bool rcu_preempt_has_tasks(struct rcu_node *rnp); |
419 | static bool rcu_preempt_need_deferred_qs(struct task_struct *t); | 417 | static bool rcu_preempt_need_deferred_qs(struct task_struct *t); |
420 | static void rcu_preempt_deferred_qs(struct task_struct *t); | 418 | static void rcu_preempt_deferred_qs(struct task_struct *t); |
421 | static void print_cpu_stall_info_begin(void); | ||
422 | static void print_cpu_stall_info(int cpu); | ||
423 | static void print_cpu_stall_info_end(void); | ||
424 | static void zero_cpu_stall_ticks(struct rcu_data *rdp); | 419 | static void zero_cpu_stall_ticks(struct rcu_data *rdp); |
425 | static bool rcu_nocb_cpu_needs_barrier(int cpu); | 420 | static bool rcu_nocb_cpu_needs_barrier(int cpu); |
426 | static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp); | 421 | static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp); |
@@ -445,3 +440,10 @@ static void rcu_bind_gp_kthread(void); | |||
445 | static bool rcu_nohz_full_cpu(void); | 440 | static bool rcu_nohz_full_cpu(void); |
446 | static void rcu_dynticks_task_enter(void); | 441 | static void rcu_dynticks_task_enter(void); |
447 | static void rcu_dynticks_task_exit(void); | 442 | static void rcu_dynticks_task_exit(void); |
443 | |||
444 | /* Forward declarations for tree_stall.h */ | ||
445 | static void record_gp_stall_check_time(void); | ||
446 | static void rcu_iw_handler(struct irq_work *iwp); | ||
447 | static void check_cpu_stall(struct rcu_data *rdp); | ||
448 | static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp, | ||
449 | const unsigned long gpssdelay); | ||
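The declarations added above exist because tree_stall.h, like tree_exp.h and tree_plugin.h, is #include'd at the bottom of tree.c and compiled as part of that single translation unit, so code earlier in tree.c needs prototypes for the static functions it now calls. A self-contained sketch of that include-at-the-bottom pattern, with purely illustrative file and function names:

/* single_tu.c - illustrative sketch, not kernel code: why tree.h grew
 * forward declarations for the helpers defined in tree_stall.h. */
#include <stdio.h>

/* Forward declaration, analogous to the new block in kernel/rcu/tree.h. */
static void report_stall(int cpu);

static void check_stall(int cpu)
{
        report_stall(cpu);      /* legal: the prototype is already visible */
}

/* In the kernel this definition lives in tree_stall.h and is pulled in via
 * #include "tree_stall.h" at the bottom of tree.c; it is written inline here
 * only to keep the sketch compilable on its own. */
static void report_stall(int cpu)
{
        printf("stall suspected on CPU %d\n", cpu);
}

int main(void)
{
        check_stall(0);
        return 0;
}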
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 4c2a0189e748..9c990df880d1 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h | |||
@@ -10,6 +10,7 @@ | |||
10 | #include <linux/lockdep.h> | 10 | #include <linux/lockdep.h> |
11 | 11 | ||
12 | static void rcu_exp_handler(void *unused); | 12 | static void rcu_exp_handler(void *unused); |
13 | static int rcu_print_task_exp_stall(struct rcu_node *rnp); | ||
13 | 14 | ||
14 | /* | 15 | /* |
15 | * Record the start of an expedited grace period. | 16 | * Record the start of an expedited grace period. |
@@ -633,7 +634,7 @@ static void rcu_exp_handler(void *unused) | |||
633 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | 634 | raw_spin_lock_irqsave_rcu_node(rnp, flags); |
634 | if (rnp->expmask & rdp->grpmask) { | 635 | if (rnp->expmask & rdp->grpmask) { |
635 | rdp->deferred_qs = true; | 636 | rdp->deferred_qs = true; |
636 | WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, true); | 637 | t->rcu_read_unlock_special.b.exp_hint = true; |
637 | } | 638 | } |
638 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | 639 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); |
639 | return; | 640 | return; |
@@ -648,7 +649,7 @@ static void rcu_exp_handler(void *unused) | |||
648 | * | 649 | * |
649 | * If the CPU is fully enabled (or if some buggy RCU-preempt | 650 | * If the CPU is fully enabled (or if some buggy RCU-preempt |
650 | * read-side critical section is being used from idle), just | 651 | * read-side critical section is being used from idle), just |
651 | * invoke rcu_preempt_defer_qs() to immediately report the | 652 | * invoke rcu_preempt_deferred_qs() to immediately report the |
652 | * quiescent state. We cannot use rcu_read_unlock_special() | 653 | * quiescent state. We cannot use rcu_read_unlock_special() |
653 | * because we are in an interrupt handler, which will cause that | 654 | * because we are in an interrupt handler, which will cause that |
654 | * function to take an early exit without doing anything. | 655 | * function to take an early exit without doing anything. |
@@ -670,6 +671,27 @@ static void sync_sched_exp_online_cleanup(int cpu) | |||
670 | { | 671 | { |
671 | } | 672 | } |
672 | 673 | ||
674 | /* | ||
675 | * Scan the current list of tasks blocked within RCU read-side critical | ||
676 | * sections, printing out the tid of each that is blocking the current | ||
677 | * expedited grace period. | ||
678 | */ | ||
679 | static int rcu_print_task_exp_stall(struct rcu_node *rnp) | ||
680 | { | ||
681 | struct task_struct *t; | ||
682 | int ndetected = 0; | ||
683 | |||
684 | if (!rnp->exp_tasks) | ||
685 | return 0; | ||
686 | t = list_entry(rnp->exp_tasks->prev, | ||
687 | struct task_struct, rcu_node_entry); | ||
688 | list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { | ||
689 | pr_cont(" P%d", t->pid); | ||
690 | ndetected++; | ||
691 | } | ||
692 | return ndetected; | ||
693 | } | ||
694 | |||
673 | #else /* #ifdef CONFIG_PREEMPT_RCU */ | 695 | #else /* #ifdef CONFIG_PREEMPT_RCU */ |
674 | 696 | ||
675 | /* Invoked on each online non-idle CPU for expedited quiescent state. */ | 697 | /* Invoked on each online non-idle CPU for expedited quiescent state. */ |
@@ -709,6 +731,16 @@ static void sync_sched_exp_online_cleanup(int cpu) | |||
709 | WARN_ON_ONCE(ret); | 731 | WARN_ON_ONCE(ret); |
710 | } | 732 | } |
711 | 733 | ||
734 | /* | ||
735 | * Because preemptible RCU does not exist, we never have to check for | ||
736 | * tasks blocked within RCU read-side critical sections that are | ||
737 | * blocking the current expedited grace period. | ||
738 | */ | ||
739 | static int rcu_print_task_exp_stall(struct rcu_node *rnp) | ||
740 | { | ||
741 | return 0; | ||
742 | } | ||
743 | |||
712 | #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ | 744 | #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ |
713 | 745 | ||
714 | /** | 746 | /** |
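The rcu_print_task_exp_stall() added above steps back one entry from rnp->exp_tasks and then uses list_for_each_entry_continue(), so the scan starts at the first task blocking the expedited grace period and runs to the end of ->blkd_tasks. A simplified userspace analogue of that scan, using a plain singly linked list rather than the kernel's list machinery:

/* exp_stall_scan.c - simplified analogue, not kernel code: walk from a saved
 * "first expedited blocker" pointer to the end of the blocked-tasks list,
 * printing each PID and counting how many were reported. */
#include <stdio.h>

struct blkd_task {
        int pid;
        struct blkd_task *next;         /* stand-in for rcu_node_entry linkage */
};

static int print_task_exp_stall(struct blkd_task *exp_tasks)
{
        int ndetected = 0;

        if (!exp_tasks)                 /* no expedited-GP blockers on this node */
                return 0;
        for (struct blkd_task *t = exp_tasks; t; t = t->next) {
                printf(" P%d", t->pid);
                ndetected++;
        }
        printf("\n");
        return ndetected;
}

int main(void)
{
        struct blkd_task c = { 303, NULL };
        struct blkd_task b = { 202, &c };
        struct blkd_task a = { 101, &b };

        (void)a;        /* task 101 blocks only a normal GP, so it is skipped */
        /* Pretend task 202 is the first one blocking the expedited GP. */
        return print_task_exp_stall(&b) == 2 ? 0 : 1;
}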
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 97dba50f6fb2..1102765f91fd 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h | |||
@@ -285,7 +285,7 @@ static void rcu_qs(void) | |||
285 | TPS("cpuqs")); | 285 | TPS("cpuqs")); |
286 | __this_cpu_write(rcu_data.cpu_no_qs.b.norm, false); | 286 | __this_cpu_write(rcu_data.cpu_no_qs.b.norm, false); |
287 | barrier(); /* Coordinate with rcu_flavor_sched_clock_irq(). */ | 287 | barrier(); /* Coordinate with rcu_flavor_sched_clock_irq(). */ |
288 | current->rcu_read_unlock_special.b.need_qs = false; | 288 | WRITE_ONCE(current->rcu_read_unlock_special.b.need_qs, false); |
289 | } | 289 | } |
290 | } | 290 | } |
291 | 291 | ||
@@ -643,100 +643,6 @@ static void rcu_read_unlock_special(struct task_struct *t) | |||
643 | } | 643 | } |
644 | 644 | ||
645 | /* | 645 | /* |
646 | * Dump detailed information for all tasks blocking the current RCU | ||
647 | * grace period on the specified rcu_node structure. | ||
648 | */ | ||
649 | static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) | ||
650 | { | ||
651 | unsigned long flags; | ||
652 | struct task_struct *t; | ||
653 | |||
654 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | ||
655 | if (!rcu_preempt_blocked_readers_cgp(rnp)) { | ||
656 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | ||
657 | return; | ||
658 | } | ||
659 | t = list_entry(rnp->gp_tasks->prev, | ||
660 | struct task_struct, rcu_node_entry); | ||
661 | list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { | ||
662 | /* | ||
663 | * We could be printing a lot while holding a spinlock. | ||
664 | * Avoid triggering hard lockup. | ||
665 | */ | ||
666 | touch_nmi_watchdog(); | ||
667 | sched_show_task(t); | ||
668 | } | ||
669 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | ||
670 | } | ||
671 | |||
672 | /* | ||
673 | * Dump detailed information for all tasks blocking the current RCU | ||
674 | * grace period. | ||
675 | */ | ||
676 | static void rcu_print_detail_task_stall(void) | ||
677 | { | ||
678 | struct rcu_node *rnp = rcu_get_root(); | ||
679 | |||
680 | rcu_print_detail_task_stall_rnp(rnp); | ||
681 | rcu_for_each_leaf_node(rnp) | ||
682 | rcu_print_detail_task_stall_rnp(rnp); | ||
683 | } | ||
684 | |||
685 | static void rcu_print_task_stall_begin(struct rcu_node *rnp) | ||
686 | { | ||
687 | pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):", | ||
688 | rnp->level, rnp->grplo, rnp->grphi); | ||
689 | } | ||
690 | |||
691 | static void rcu_print_task_stall_end(void) | ||
692 | { | ||
693 | pr_cont("\n"); | ||
694 | } | ||
695 | |||
696 | /* | ||
697 | * Scan the current list of tasks blocked within RCU read-side critical | ||
698 | * sections, printing out the tid of each. | ||
699 | */ | ||
700 | static int rcu_print_task_stall(struct rcu_node *rnp) | ||
701 | { | ||
702 | struct task_struct *t; | ||
703 | int ndetected = 0; | ||
704 | |||
705 | if (!rcu_preempt_blocked_readers_cgp(rnp)) | ||
706 | return 0; | ||
707 | rcu_print_task_stall_begin(rnp); | ||
708 | t = list_entry(rnp->gp_tasks->prev, | ||
709 | struct task_struct, rcu_node_entry); | ||
710 | list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { | ||
711 | pr_cont(" P%d", t->pid); | ||
712 | ndetected++; | ||
713 | } | ||
714 | rcu_print_task_stall_end(); | ||
715 | return ndetected; | ||
716 | } | ||
717 | |||
718 | /* | ||
719 | * Scan the current list of tasks blocked within RCU read-side critical | ||
720 | * sections, printing out the tid of each that is blocking the current | ||
721 | * expedited grace period. | ||
722 | */ | ||
723 | static int rcu_print_task_exp_stall(struct rcu_node *rnp) | ||
724 | { | ||
725 | struct task_struct *t; | ||
726 | int ndetected = 0; | ||
727 | |||
728 | if (!rnp->exp_tasks) | ||
729 | return 0; | ||
730 | t = list_entry(rnp->exp_tasks->prev, | ||
731 | struct task_struct, rcu_node_entry); | ||
732 | list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { | ||
733 | pr_cont(" P%d", t->pid); | ||
734 | ndetected++; | ||
735 | } | ||
736 | return ndetected; | ||
737 | } | ||
738 | |||
739 | /* | ||
740 | * Check that the list of blocked tasks for the newly completed grace | 646 | * Check that the list of blocked tasks for the newly completed grace |
741 | * period is in fact empty. It is a serious bug to complete a grace | 647 | * period is in fact empty. It is a serious bug to complete a grace |
742 | * period that still has RCU readers blocked! This function must be | 648 | * period that still has RCU readers blocked! This function must be |
@@ -804,19 +710,25 @@ static void rcu_flavor_sched_clock_irq(int user) | |||
804 | 710 | ||
805 | /* | 711 | /* |
806 | * Check for a task exiting while in a preemptible-RCU read-side | 712 | * Check for a task exiting while in a preemptible-RCU read-side |
807 | * critical section, clean up if so. No need to issue warnings, | 713 | * critical section, clean up if so. No need to issue warnings, as |
808 | * as debug_check_no_locks_held() already does this if lockdep | 714 | * debug_check_no_locks_held() already does this if lockdep is enabled. |
809 | * is enabled. | 715 | * Besides, if this function does anything other than just immediately |
716 | * return, there was a bug of some sort. Spewing warnings from this | ||
716 | * function is as likely as not to simply obscure important prior warnings. | ||
810 | */ | 718 | */ |
811 | void exit_rcu(void) | 719 | void exit_rcu(void) |
812 | { | 720 | { |
813 | struct task_struct *t = current; | 721 | struct task_struct *t = current; |
814 | 722 | ||
815 | if (likely(list_empty(¤t->rcu_node_entry))) | 723 | if (unlikely(!list_empty(¤t->rcu_node_entry))) { |
724 | t->rcu_read_lock_nesting = 1; | ||
725 | barrier(); | ||
726 | WRITE_ONCE(t->rcu_read_unlock_special.b.blocked, true); | ||
727 | } else if (unlikely(t->rcu_read_lock_nesting)) { | ||
728 | t->rcu_read_lock_nesting = 1; | ||
729 | } else { | ||
816 | return; | 730 | return; |
817 | t->rcu_read_lock_nesting = 1; | 731 | } |
818 | barrier(); | ||
819 | t->rcu_read_unlock_special.b.blocked = true; | ||
820 | __rcu_read_unlock(); | 732 | __rcu_read_unlock(); |
821 | rcu_preempt_deferred_qs(current); | 733 | rcu_preempt_deferred_qs(current); |
822 | } | 734 | } |
@@ -980,33 +892,6 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t) | |||
980 | static void rcu_preempt_deferred_qs(struct task_struct *t) { } | 892 | static void rcu_preempt_deferred_qs(struct task_struct *t) { } |
981 | 893 | ||
982 | /* | 894 | /* |
983 | * Because preemptible RCU does not exist, we never have to check for | ||
984 | * tasks blocked within RCU read-side critical sections. | ||
985 | */ | ||
986 | static void rcu_print_detail_task_stall(void) | ||
987 | { | ||
988 | } | ||
989 | |||
990 | /* | ||
991 | * Because preemptible RCU does not exist, we never have to check for | ||
992 | * tasks blocked within RCU read-side critical sections. | ||
993 | */ | ||
994 | static int rcu_print_task_stall(struct rcu_node *rnp) | ||
995 | { | ||
996 | return 0; | ||
997 | } | ||
998 | |||
999 | /* | ||
1000 | * Because preemptible RCU does not exist, we never have to check for | ||
1001 | * tasks blocked within RCU read-side critical sections that are | ||
1002 | * blocking the current expedited grace period. | ||
1003 | */ | ||
1004 | static int rcu_print_task_exp_stall(struct rcu_node *rnp) | ||
1005 | { | ||
1006 | return 0; | ||
1007 | } | ||
1008 | |||
1009 | /* | ||
1010 | * Because there is no preemptible RCU, there can be no readers blocked, | 895 | * Because there is no preemptible RCU, there can be no readers blocked, |
1011 | * so there is no need to check for blocked tasks. So check only for | 896 | * so there is no need to check for blocked tasks. So check only for |
1012 | * bogus qsmask values. | 897 | * bogus qsmask values. |
@@ -1185,8 +1070,6 @@ static int rcu_boost_kthread(void *arg) | |||
1185 | static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) | 1070 | static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) |
1186 | __releases(rnp->lock) | 1071 | __releases(rnp->lock) |
1187 | { | 1072 | { |
1188 | struct task_struct *t; | ||
1189 | |||
1190 | raw_lockdep_assert_held_rcu_node(rnp); | 1073 | raw_lockdep_assert_held_rcu_node(rnp); |
1191 | if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { | 1074 | if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { |
1192 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | 1075 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); |
@@ -1200,9 +1083,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) | |||
1200 | if (rnp->exp_tasks == NULL) | 1083 | if (rnp->exp_tasks == NULL) |
1201 | rnp->boost_tasks = rnp->gp_tasks; | 1084 | rnp->boost_tasks = rnp->gp_tasks; |
1202 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | 1085 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); |
1203 | t = rnp->boost_kthread_task; | 1086 | rcu_wake_cond(rnp->boost_kthread_task, |
1204 | if (t) | 1087 | rnp->boost_kthread_status); |
1205 | rcu_wake_cond(t, rnp->boost_kthread_status); | ||
1206 | } else { | 1088 | } else { |
1207 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | 1089 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); |
1208 | } | 1090 | } |
@@ -1649,98 +1531,6 @@ static void rcu_cleanup_after_idle(void) | |||
1649 | 1531 | ||
1650 | #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ | 1532 | #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ |
1651 | 1533 | ||
1652 | #ifdef CONFIG_RCU_FAST_NO_HZ | ||
1653 | |||
1654 | static void print_cpu_stall_fast_no_hz(char *cp, int cpu) | ||
1655 | { | ||
1656 | struct rcu_data *rdp = &per_cpu(rcu_data, cpu); | ||
1657 | |||
1658 | sprintf(cp, "last_accelerate: %04lx/%04lx, Nonlazy posted: %c%c%c", | ||
1659 | rdp->last_accelerate & 0xffff, jiffies & 0xffff, | ||
1660 | ".l"[rdp->all_lazy], | ||
1661 | ".L"[!rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)], | ||
1662 | ".D"[!rdp->tick_nohz_enabled_snap]); | ||
1663 | } | ||
1664 | |||
1665 | #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ | ||
1666 | |||
1667 | static void print_cpu_stall_fast_no_hz(char *cp, int cpu) | ||
1668 | { | ||
1669 | *cp = '\0'; | ||
1670 | } | ||
1671 | |||
1672 | #endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */ | ||
1673 | |||
1674 | /* Initiate the stall-info list. */ | ||
1675 | static void print_cpu_stall_info_begin(void) | ||
1676 | { | ||
1677 | pr_cont("\n"); | ||
1678 | } | ||
1679 | |||
1680 | /* | ||
1681 | * Print out diagnostic information for the specified stalled CPU. | ||
1682 | * | ||
1683 | * If the specified CPU is aware of the current RCU grace period, then | ||
1684 | * print the number of scheduling clock interrupts the CPU has taken | ||
1685 | * during the time that it has been aware. Otherwise, print the number | ||
1686 | * of RCU grace periods that this CPU is ignorant of, for example, "1" | ||
1687 | * if the CPU was aware of the previous grace period. | ||
1688 | * | ||
1689 | * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info. | ||
1690 | */ | ||
1691 | static void print_cpu_stall_info(int cpu) | ||
1692 | { | ||
1693 | unsigned long delta; | ||
1694 | char fast_no_hz[72]; | ||
1695 | struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); | ||
1696 | char *ticks_title; | ||
1697 | unsigned long ticks_value; | ||
1698 | |||
1699 | /* | ||
1700 | * We could be printing a lot while holding a spinlock. Avoid | ||
1701 | * triggering hard lockup. | ||
1702 | */ | ||
1703 | touch_nmi_watchdog(); | ||
1704 | |||
1705 | ticks_value = rcu_seq_ctr(rcu_state.gp_seq - rdp->gp_seq); | ||
1706 | if (ticks_value) { | ||
1707 | ticks_title = "GPs behind"; | ||
1708 | } else { | ||
1709 | ticks_title = "ticks this GP"; | ||
1710 | ticks_value = rdp->ticks_this_gp; | ||
1711 | } | ||
1712 | print_cpu_stall_fast_no_hz(fast_no_hz, cpu); | ||
1713 | delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq); | ||
1714 | pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s\n", | ||
1715 | cpu, | ||
1716 | "O."[!!cpu_online(cpu)], | ||
1717 | "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)], | ||
1718 | "N."[!!(rdp->grpmask & rdp->mynode->qsmaskinitnext)], | ||
1719 | !IS_ENABLED(CONFIG_IRQ_WORK) ? '?' : | ||
1720 | rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' : | ||
1721 | "!."[!delta], | ||
1722 | ticks_value, ticks_title, | ||
1723 | rcu_dynticks_snap(rdp) & 0xfff, | ||
1724 | rdp->dynticks_nesting, rdp->dynticks_nmi_nesting, | ||
1725 | rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), | ||
1726 | READ_ONCE(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart, | ||
1727 | fast_no_hz); | ||
1728 | } | ||
1729 | |||
1730 | /* Terminate the stall-info list. */ | ||
1731 | static void print_cpu_stall_info_end(void) | ||
1732 | { | ||
1733 | pr_err("\t"); | ||
1734 | } | ||
1735 | |||
1736 | /* Zero ->ticks_this_gp and snapshot the number of RCU softirq handlers. */ | ||
1737 | static void zero_cpu_stall_ticks(struct rcu_data *rdp) | ||
1738 | { | ||
1739 | rdp->ticks_this_gp = 0; | ||
1740 | rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id()); | ||
1741 | WRITE_ONCE(rdp->last_fqs_resched, jiffies); | ||
1742 | } | ||
1743 | |||
1744 | #ifdef CONFIG_RCU_NOCB_CPU | 1534 | #ifdef CONFIG_RCU_NOCB_CPU |
1745 | 1535 | ||
1746 | /* | 1536 | /* |
@@ -1766,11 +1556,22 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp) | |||
1766 | */ | 1556 | */ |
1767 | 1557 | ||
1768 | 1558 | ||
1769 | /* Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. */ | 1559 | /* |
1560 | * Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. | ||
1561 | * The string after the "rcu_nocbs=" is either "all" for all CPUs, or a | ||
1562 | * comma-separated list of CPUs and/or CPU ranges. If an invalid list is | ||
1563 | * given, a warning is emitted and all CPUs are offloaded. | ||
1564 | */ | ||
1770 | static int __init rcu_nocb_setup(char *str) | 1565 | static int __init rcu_nocb_setup(char *str) |
1771 | { | 1566 | { |
1772 | alloc_bootmem_cpumask_var(&rcu_nocb_mask); | 1567 | alloc_bootmem_cpumask_var(&rcu_nocb_mask); |
1773 | cpulist_parse(str, rcu_nocb_mask); | 1568 | if (!strcasecmp(str, "all")) |
1569 | cpumask_setall(rcu_nocb_mask); | ||
1570 | else | ||
1571 | if (cpulist_parse(str, rcu_nocb_mask)) { | ||
1572 | pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n"); | ||
1573 | cpumask_setall(rcu_nocb_mask); | ||
1574 | } | ||
1774 | return 1; | 1575 | return 1; |
1775 | } | 1576 | } |
1776 | __setup("rcu_nocbs=", rcu_nocb_setup); | 1577 | __setup("rcu_nocbs=", rcu_nocb_setup); |
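The reworked rcu_nocb_setup() above accepts the keyword "all" and falls back to offloading every CPU when cpulist_parse() rejects the string. A hedged userspace sketch of that behaviour, with a toy range parser standing in for cpulist_parse(); the CPU count and bitmask representation are assumptions made for the example:

/* nocb_parse.c - illustrative stand-in, not the kernel implementation. */
#include <stdio.h>
#include <stdlib.h>
#include <strings.h>
#include <stdint.h>

#define NR_CPUS 64

static int parse_cpulist(const char *str, uint64_t *mask)
{
        *mask = 0;
        while (*str) {
                char *end;
                long lo = strtol(str, &end, 10);
                long hi = lo;

                if (end == str || lo < 0 || lo >= NR_CPUS)
                        return -1;
                if (*end == '-') {
                        str = end + 1;
                        hi = strtol(str, &end, 10);
                        if (end == str || hi < lo || hi >= NR_CPUS)
                                return -1;
                }
                for (long cpu = lo; cpu <= hi; cpu++)
                        *mask |= 1ULL << cpu;
                if (*end == ',')
                        end++;
                else if (*end != '\0')
                        return -1;
                str = end;
        }
        return 0;
}

static uint64_t rcu_nocb_setup(const char *str)
{
        uint64_t mask;

        if (!strcasecmp(str, "all"))
                return ~0ULL;                   /* offload every CPU */
        if (parse_cpulist(str, &mask)) {
                fprintf(stderr, "rcu_nocbs= bad CPU range, all CPUs set\n");
                return ~0ULL;                   /* fallback mirrors the patch */
        }
        return mask;
}

int main(void)
{
        printf("%#llx\n", (unsigned long long)rcu_nocb_setup("0-3,8"));
        printf("%#llx\n", (unsigned long long)rcu_nocb_setup("all"));
        printf("%#llx\n", (unsigned long long)rcu_nocb_setup("0-"));
        return 0;
}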
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h new file mode 100644 index 000000000000..f65a73a97323 --- /dev/null +++ b/kernel/rcu/tree_stall.h | |||
@@ -0,0 +1,709 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0+ | ||
2 | /* | ||
3 | * RCU CPU stall warnings for normal RCU grace periods | ||
4 | * | ||
5 | * Copyright IBM Corporation, 2019 | ||
6 | * | ||
7 | * Author: Paul E. McKenney <paulmck@linux.ibm.com> | ||
8 | */ | ||
9 | |||
10 | ////////////////////////////////////////////////////////////////////////////// | ||
11 | // | ||
12 | // Controlling CPU stall warnings, including delay calculation. | ||
13 | |||
14 | /* panic() on RCU Stall sysctl. */ | ||
15 | int sysctl_panic_on_rcu_stall __read_mostly; | ||
16 | |||
17 | #ifdef CONFIG_PROVE_RCU | ||
18 | #define RCU_STALL_DELAY_DELTA (5 * HZ) | ||
19 | #else | ||
20 | #define RCU_STALL_DELAY_DELTA 0 | ||
21 | #endif | ||
22 | |||
23 | /* Limit-check stall timeouts specified at boottime and runtime. */ | ||
24 | int rcu_jiffies_till_stall_check(void) | ||
25 | { | ||
26 | int till_stall_check = READ_ONCE(rcu_cpu_stall_timeout); | ||
27 | |||
28 | /* | ||
29 | * Limit check must be consistent with the Kconfig limits | ||
30 | * for CONFIG_RCU_CPU_STALL_TIMEOUT. | ||
31 | */ | ||
32 | if (till_stall_check < 3) { | ||
33 | WRITE_ONCE(rcu_cpu_stall_timeout, 3); | ||
34 | till_stall_check = 3; | ||
35 | } else if (till_stall_check > 300) { | ||
36 | WRITE_ONCE(rcu_cpu_stall_timeout, 300); | ||
37 | till_stall_check = 300; | ||
38 | } | ||
39 | return till_stall_check * HZ + RCU_STALL_DELAY_DELTA; | ||
40 | } | ||
41 | EXPORT_SYMBOL_GPL(rcu_jiffies_till_stall_check); | ||
42 | |||
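The clamp above keeps boot- and runtime-specified timeouts within the Kconfig range before converting to jiffies. A minimal userspace sketch of the same arithmetic, with an assumed HZ of 250 and the debug delta set to zero:

/* stall_timeout.c - sketch of the [3, 300] second clamp; values illustrative. */
#include <stdio.h>

#define HZ 250
#define STALL_DELAY_DELTA 0     /* (5 * HZ) under PROVE_RCU-style debugging */

static int jiffies_till_stall_check(int timeout_s)
{
        if (timeout_s < 3)
                timeout_s = 3;          /* the kernel also writes the clamp back */
        else if (timeout_s > 300)
                timeout_s = 300;
        return timeout_s * HZ + STALL_DELAY_DELTA;
}

int main(void)
{
        printf("%d\n", jiffies_till_stall_check(1));    /* 750   (clamped up)   */
        printf("%d\n", jiffies_till_stall_check(21));   /* 5250                 */
        printf("%d\n", jiffies_till_stall_check(900));  /* 75000 (clamped down) */
        return 0;
}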
43 | /* Don't do RCU CPU stall warnings during long sysrq printouts. */ | ||
44 | void rcu_sysrq_start(void) | ||
45 | { | ||
46 | if (!rcu_cpu_stall_suppress) | ||
47 | rcu_cpu_stall_suppress = 2; | ||
48 | } | ||
49 | |||
50 | void rcu_sysrq_end(void) | ||
51 | { | ||
52 | if (rcu_cpu_stall_suppress == 2) | ||
53 | rcu_cpu_stall_suppress = 0; | ||
54 | } | ||
55 | |||
56 | /* Don't print RCU CPU stall warnings during a kernel panic. */ | ||
57 | static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) | ||
58 | { | ||
59 | rcu_cpu_stall_suppress = 1; | ||
60 | return NOTIFY_DONE; | ||
61 | } | ||
62 | |||
63 | static struct notifier_block rcu_panic_block = { | ||
64 | .notifier_call = rcu_panic, | ||
65 | }; | ||
66 | |||
67 | static int __init check_cpu_stall_init(void) | ||
68 | { | ||
69 | atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block); | ||
70 | return 0; | ||
71 | } | ||
72 | early_initcall(check_cpu_stall_init); | ||
73 | |||
74 | /* If so specified via sysctl, panic, yielding cleaner stall-warning output. */ | ||
75 | static void panic_on_rcu_stall(void) | ||
76 | { | ||
77 | if (sysctl_panic_on_rcu_stall) | ||
78 | panic("RCU Stall\n"); | ||
79 | } | ||
80 | |||
81 | /** | ||
82 | * rcu_cpu_stall_reset - prevent further stall warnings in current grace period | ||
83 | * | ||
84 | * Set the stall-warning timeout way off into the future, thus preventing | ||
85 | * any RCU CPU stall-warning messages from appearing in the current set of | ||
86 | * RCU grace periods. | ||
87 | * | ||
88 | * The caller must disable hard irqs. | ||
89 | */ | ||
90 | void rcu_cpu_stall_reset(void) | ||
91 | { | ||
92 | WRITE_ONCE(rcu_state.jiffies_stall, jiffies + ULONG_MAX / 2); | ||
93 | } | ||
94 | |||
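Setting ->jiffies_stall to jiffies + ULONG_MAX / 2 relies on the wraparound-safe comparisons (ULONG_CMP_GE() and friends) used throughout this file: a deadline half the counter space ahead is never seen as reached. A small illustrative sketch of that comparison, not the kernel macro itself:

/* jiffies_wrap.c - sketch of a time_after()-style signed-difference compare. */
#include <limits.h>
#include <stdio.h>

/* True if a is at or after b, treating the difference as signed so that
 * counter wraparound does not produce false positives. */
static int ulong_cmp_ge(unsigned long a, unsigned long b)
{
        return (long)(a - b) >= 0;
}

int main(void)
{
        unsigned long now = ULONG_MAX - 5;              /* about to wrap      */
        unsigned long deadline = now + 10;              /* wraps past zero    */

        printf("%d\n", ulong_cmp_ge(now, deadline));     /* 0: not yet due    */
        printf("%d\n", ulong_cmp_ge(now + 20, deadline));/* 1: now past it    */

        /* rcu_cpu_stall_reset(): a deadline half the counter space away is
         * never considered "reached" until jiffies advances that far. */
        printf("%d\n", ulong_cmp_ge(now, now + ULONG_MAX / 2));  /* 0 */
        return 0;
}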
95 | ////////////////////////////////////////////////////////////////////////////// | ||
96 | // | ||
97 | // Interaction with RCU grace periods | ||
98 | |||
99 | /* Start of new grace period, so record stall time (and forcing times). */ | ||
100 | static void record_gp_stall_check_time(void) | ||
101 | { | ||
102 | unsigned long j = jiffies; | ||
103 | unsigned long j1; | ||
104 | |||
105 | rcu_state.gp_start = j; | ||
106 | j1 = rcu_jiffies_till_stall_check(); | ||
107 | /* Record ->gp_start before ->jiffies_stall. */ | ||
108 | smp_store_release(&rcu_state.jiffies_stall, j + j1); /* ^^^ */ | ||
109 | rcu_state.jiffies_resched = j + j1 / 2; | ||
110 | rcu_state.n_force_qs_gpstart = READ_ONCE(rcu_state.n_force_qs); | ||
111 | } | ||
112 | |||
113 | /* Zero ->ticks_this_gp and snapshot the number of RCU softirq handlers. */ | ||
114 | static void zero_cpu_stall_ticks(struct rcu_data *rdp) | ||
115 | { | ||
116 | rdp->ticks_this_gp = 0; | ||
117 | rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id()); | ||
118 | WRITE_ONCE(rdp->last_fqs_resched, jiffies); | ||
119 | } | ||
120 | |||
121 | /* | ||
122 | * If too much time has passed in the current grace period, and if | ||
123 | * so configured, go kick the relevant kthreads. | ||
124 | */ | ||
125 | static void rcu_stall_kick_kthreads(void) | ||
126 | { | ||
127 | unsigned long j; | ||
128 | |||
129 | if (!rcu_kick_kthreads) | ||
130 | return; | ||
131 | j = READ_ONCE(rcu_state.jiffies_kick_kthreads); | ||
132 | if (time_after(jiffies, j) && rcu_state.gp_kthread && | ||
133 | (rcu_gp_in_progress() || READ_ONCE(rcu_state.gp_flags))) { | ||
134 | WARN_ONCE(1, "Kicking %s grace-period kthread\n", | ||
135 | rcu_state.name); | ||
136 | rcu_ftrace_dump(DUMP_ALL); | ||
137 | wake_up_process(rcu_state.gp_kthread); | ||
138 | WRITE_ONCE(rcu_state.jiffies_kick_kthreads, j + HZ); | ||
139 | } | ||
140 | } | ||
141 | |||
142 | /* | ||
143 | * Handler for the irq_work request posted about halfway into the RCU CPU | ||
144 | * stall timeout, and used to detect excessive irq disabling. Set state | ||
145 | * appropriately, but just complain if there is unexpected state on entry. | ||
146 | */ | ||
147 | static void rcu_iw_handler(struct irq_work *iwp) | ||
148 | { | ||
149 | struct rcu_data *rdp; | ||
150 | struct rcu_node *rnp; | ||
151 | |||
152 | rdp = container_of(iwp, struct rcu_data, rcu_iw); | ||
153 | rnp = rdp->mynode; | ||
154 | raw_spin_lock_rcu_node(rnp); | ||
155 | if (!WARN_ON_ONCE(!rdp->rcu_iw_pending)) { | ||
156 | rdp->rcu_iw_gp_seq = rnp->gp_seq; | ||
157 | rdp->rcu_iw_pending = false; | ||
158 | } | ||
159 | raw_spin_unlock_rcu_node(rnp); | ||
160 | } | ||
161 | |||
162 | ////////////////////////////////////////////////////////////////////////////// | ||
163 | // | ||
164 | // Printing RCU CPU stall warnings | ||
165 | |||
166 | #ifdef CONFIG_PREEMPT | ||
167 | |||
168 | /* | ||
169 | * Dump detailed information for all tasks blocking the current RCU | ||
170 | * grace period on the specified rcu_node structure. | ||
171 | */ | ||
172 | static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) | ||
173 | { | ||
174 | unsigned long flags; | ||
175 | struct task_struct *t; | ||
176 | |||
177 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | ||
178 | if (!rcu_preempt_blocked_readers_cgp(rnp)) { | ||
179 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | ||
180 | return; | ||
181 | } | ||
182 | t = list_entry(rnp->gp_tasks->prev, | ||
183 | struct task_struct, rcu_node_entry); | ||
184 | list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { | ||
185 | /* | ||
186 | * We could be printing a lot while holding a spinlock. | ||
187 | * Avoid triggering hard lockup. | ||
188 | */ | ||
189 | touch_nmi_watchdog(); | ||
190 | sched_show_task(t); | ||
191 | } | ||
192 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | ||
193 | } | ||
194 | |||
195 | /* | ||
196 | * Scan the current list of tasks blocked within RCU read-side critical | ||
197 | * sections, printing out the tid of each. | ||
198 | */ | ||
199 | static int rcu_print_task_stall(struct rcu_node *rnp) | ||
200 | { | ||
201 | struct task_struct *t; | ||
202 | int ndetected = 0; | ||
203 | |||
204 | if (!rcu_preempt_blocked_readers_cgp(rnp)) | ||
205 | return 0; | ||
206 | pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):", | ||
207 | rnp->level, rnp->grplo, rnp->grphi); | ||
208 | t = list_entry(rnp->gp_tasks->prev, | ||
209 | struct task_struct, rcu_node_entry); | ||
210 | list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { | ||
211 | pr_cont(" P%d", t->pid); | ||
212 | ndetected++; | ||
213 | } | ||
214 | pr_cont("\n"); | ||
215 | return ndetected; | ||
216 | } | ||
217 | |||
218 | #else /* #ifdef CONFIG_PREEMPT */ | ||
219 | |||
220 | /* | ||
221 | * Because preemptible RCU does not exist, we never have to check for | ||
222 | * tasks blocked within RCU read-side critical sections. | ||
223 | */ | ||
224 | static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) | ||
225 | { | ||
226 | } | ||
227 | |||
228 | /* | ||
229 | * Because preemptible RCU does not exist, we never have to check for | ||
230 | * tasks blocked within RCU read-side critical sections. | ||
231 | */ | ||
232 | static int rcu_print_task_stall(struct rcu_node *rnp) | ||
233 | { | ||
234 | return 0; | ||
235 | } | ||
236 | #endif /* #else #ifdef CONFIG_PREEMPT */ | ||
237 | |||
238 | /* | ||
239 | * Dump stacks of all tasks running on stalled CPUs. First try using | ||
240 | * NMIs, but fall back to manual remote stack tracing on architectures | ||
241 | * that don't support NMI-based stack dumps. The NMI-triggered stack | ||
242 | * traces are more accurate because they are printed by the target CPU. | ||
243 | */ | ||
244 | static void rcu_dump_cpu_stacks(void) | ||
245 | { | ||
246 | int cpu; | ||
247 | unsigned long flags; | ||
248 | struct rcu_node *rnp; | ||
249 | |||
250 | rcu_for_each_leaf_node(rnp) { | ||
251 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | ||
252 | for_each_leaf_node_possible_cpu(rnp, cpu) | ||
253 | if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) | ||
254 | if (!trigger_single_cpu_backtrace(cpu)) | ||
255 | dump_cpu_task(cpu); | ||
256 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | ||
257 | } | ||
258 | } | ||
259 | |||
260 | #ifdef CONFIG_RCU_FAST_NO_HZ | ||
261 | |||
262 | static void print_cpu_stall_fast_no_hz(char *cp, int cpu) | ||
263 | { | ||
264 | struct rcu_data *rdp = &per_cpu(rcu_data, cpu); | ||
265 | |||
266 | sprintf(cp, "last_accelerate: %04lx/%04lx, Nonlazy posted: %c%c%c", | ||
267 | rdp->last_accelerate & 0xffff, jiffies & 0xffff, | ||
268 | ".l"[rdp->all_lazy], | ||
269 | ".L"[!rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)], | ||
270 | ".D"[!!rdp->tick_nohz_enabled_snap]); | ||
271 | } | ||
272 | |||
273 | #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ | ||
274 | |||
275 | static void print_cpu_stall_fast_no_hz(char *cp, int cpu) | ||
276 | { | ||
277 | *cp = '\0'; | ||
278 | } | ||
279 | |||
280 | #endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */ | ||
281 | |||
282 | /* | ||
283 | * Print out diagnostic information for the specified stalled CPU. | ||
284 | * | ||
285 | * If the specified CPU is aware of the current RCU grace period, then | ||
286 | * print the number of scheduling clock interrupts the CPU has taken | ||
287 | * during the time that it has been aware. Otherwise, print the number | ||
288 | * of RCU grace periods that this CPU is ignorant of, for example, "1" | ||
289 | * if the CPU was aware of the previous grace period. | ||
290 | * | ||
291 | * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info. | ||
292 | */ | ||
293 | static void print_cpu_stall_info(int cpu) | ||
294 | { | ||
295 | unsigned long delta; | ||
296 | char fast_no_hz[72]; | ||
297 | struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); | ||
298 | char *ticks_title; | ||
299 | unsigned long ticks_value; | ||
300 | |||
301 | /* | ||
302 | * We could be printing a lot while holding a spinlock. Avoid | ||
303 | * triggering hard lockup. | ||
304 | */ | ||
305 | touch_nmi_watchdog(); | ||
306 | |||
307 | ticks_value = rcu_seq_ctr(rcu_state.gp_seq - rdp->gp_seq); | ||
308 | if (ticks_value) { | ||
309 | ticks_title = "GPs behind"; | ||
310 | } else { | ||
311 | ticks_title = "ticks this GP"; | ||
312 | ticks_value = rdp->ticks_this_gp; | ||
313 | } | ||
314 | print_cpu_stall_fast_no_hz(fast_no_hz, cpu); | ||
315 | delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq); | ||
316 | pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s\n", | ||
317 | cpu, | ||
318 | "O."[!!cpu_online(cpu)], | ||
319 | "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)], | ||
320 | "N."[!!(rdp->grpmask & rdp->mynode->qsmaskinitnext)], | ||
321 | !IS_ENABLED(CONFIG_IRQ_WORK) ? '?' : | ||
322 | rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' : | ||
323 | "!."[!delta], | ||
324 | ticks_value, ticks_title, | ||
325 | rcu_dynticks_snap(rdp) & 0xfff, | ||
326 | rdp->dynticks_nesting, rdp->dynticks_nmi_nesting, | ||
327 | rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), | ||
328 | READ_ONCE(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart, | ||
329 | fast_no_hz); | ||
330 | } | ||
331 | |||
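print_cpu_stall_info() builds its one-line-per-CPU report by indexing two-character strings with boolean conditions. A tiny sketch of that idiom, with made-up condition values:

/* stall_flags.c - the "O."[cond] trick: one status character per condition. */
#include <stdio.h>

int main(void)
{
        int cpu_online = 1;
        int irq_work_delta = 0;

        /* "O."[!!x] selects the first character when x is false and the
         * second when x is true, packing each condition into one column. */
        printf("\t%d-%c%c\n", 3,
               "O."[!!cpu_online],
               "!."[!irq_work_delta]);
        return 0;
}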
332 | /* Complain about starvation of grace-period kthread. */ | ||
333 | static void rcu_check_gp_kthread_starvation(void) | ||
334 | { | ||
335 | struct task_struct *gpk = rcu_state.gp_kthread; | ||
336 | unsigned long j; | ||
337 | |||
338 | j = jiffies - READ_ONCE(rcu_state.gp_activity); | ||
339 | if (j > 2 * HZ) { | ||
340 | pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n", | ||
341 | rcu_state.name, j, | ||
342 | (long)rcu_seq_current(&rcu_state.gp_seq), | ||
343 | READ_ONCE(rcu_state.gp_flags), | ||
344 | gp_state_getname(rcu_state.gp_state), rcu_state.gp_state, | ||
345 | gpk ? gpk->state : ~0, gpk ? task_cpu(gpk) : -1); | ||
346 | if (gpk) { | ||
347 | pr_err("RCU grace-period kthread stack dump:\n"); | ||
348 | sched_show_task(gpk); | ||
349 | wake_up_process(gpk); | ||
350 | } | ||
351 | } | ||
352 | } | ||
353 | |||
354 | static void print_other_cpu_stall(unsigned long gp_seq) | ||
355 | { | ||
356 | int cpu; | ||
357 | unsigned long flags; | ||
358 | unsigned long gpa; | ||
359 | unsigned long j; | ||
360 | int ndetected = 0; | ||
361 | struct rcu_node *rnp; | ||
362 | long totqlen = 0; | ||
363 | |||
364 | /* Kick and suppress, if so configured. */ | ||
365 | rcu_stall_kick_kthreads(); | ||
366 | if (rcu_cpu_stall_suppress) | ||
367 | return; | ||
368 | |||
369 | /* | ||
370 | * OK, time to rat on our buddy... | ||
371 | * See Documentation/RCU/stallwarn.txt for info on how to debug | ||
372 | * RCU CPU stall warnings. | ||
373 | */ | ||
374 | pr_err("INFO: %s detected stalls on CPUs/tasks:\n", rcu_state.name); | ||
375 | rcu_for_each_leaf_node(rnp) { | ||
376 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | ||
377 | ndetected += rcu_print_task_stall(rnp); | ||
378 | if (rnp->qsmask != 0) { | ||
379 | for_each_leaf_node_possible_cpu(rnp, cpu) | ||
380 | if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) { | ||
381 | print_cpu_stall_info(cpu); | ||
382 | ndetected++; | ||
383 | } | ||
384 | } | ||
385 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | ||
386 | } | ||
387 | |||
388 | for_each_possible_cpu(cpu) | ||
389 | totqlen += rcu_get_n_cbs_cpu(cpu); | ||
390 | pr_cont("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n", | ||
391 | smp_processor_id(), (long)(jiffies - rcu_state.gp_start), | ||
392 | (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); | ||
393 | if (ndetected) { | ||
394 | rcu_dump_cpu_stacks(); | ||
395 | |||
396 | /* Complain about tasks blocking the grace period. */ | ||
397 | rcu_for_each_leaf_node(rnp) | ||
398 | rcu_print_detail_task_stall_rnp(rnp); | ||
399 | } else { | ||
400 | if (rcu_seq_current(&rcu_state.gp_seq) != gp_seq) { | ||
401 | pr_err("INFO: Stall ended before state dump start\n"); | ||
402 | } else { | ||
403 | j = jiffies; | ||
404 | gpa = READ_ONCE(rcu_state.gp_activity); | ||
405 | pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n", | ||
406 | rcu_state.name, j - gpa, j, gpa, | ||
407 | READ_ONCE(jiffies_till_next_fqs), | ||
408 | rcu_get_root()->qsmask); | ||
409 | /* In this case, the current CPU might be at fault. */ | ||
410 | sched_show_task(current); | ||
411 | } | ||
412 | } | ||
413 | /* Rewrite if needed in case of slow consoles. */ | ||
414 | if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall))) | ||
415 | WRITE_ONCE(rcu_state.jiffies_stall, | ||
416 | jiffies + 3 * rcu_jiffies_till_stall_check() + 3); | ||
417 | |||
418 | rcu_check_gp_kthread_starvation(); | ||
419 | |||
420 | panic_on_rcu_stall(); | ||
421 | |||
422 | rcu_force_quiescent_state(); /* Kick them all. */ | ||
423 | } | ||
424 | |||
425 | static void print_cpu_stall(void) | ||
426 | { | ||
427 | int cpu; | ||
428 | unsigned long flags; | ||
429 | struct rcu_data *rdp = this_cpu_ptr(&rcu_data); | ||
430 | struct rcu_node *rnp = rcu_get_root(); | ||
431 | long totqlen = 0; | ||
432 | |||
433 | /* Kick and suppress, if so configured. */ | ||
434 | rcu_stall_kick_kthreads(); | ||
435 | if (rcu_cpu_stall_suppress) | ||
436 | return; | ||
437 | |||
438 | /* | ||
439 | * OK, time to rat on ourselves... | ||
440 | * See Documentation/RCU/stallwarn.txt for info on how to debug | ||
441 | * RCU CPU stall warnings. | ||
442 | */ | ||
443 | pr_err("INFO: %s self-detected stall on CPU\n", rcu_state.name); | ||
444 | raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags); | ||
445 | print_cpu_stall_info(smp_processor_id()); | ||
446 | raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags); | ||
447 | for_each_possible_cpu(cpu) | ||
448 | totqlen += rcu_get_n_cbs_cpu(cpu); | ||
449 | pr_cont("\t(t=%lu jiffies g=%ld q=%lu)\n", | ||
450 | jiffies - rcu_state.gp_start, | ||
451 | (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); | ||
452 | |||
453 | rcu_check_gp_kthread_starvation(); | ||
454 | |||
455 | rcu_dump_cpu_stacks(); | ||
456 | |||
457 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | ||
458 | /* Rewrite if needed in case of slow consoles. */ | ||
459 | if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall))) | ||
460 | WRITE_ONCE(rcu_state.jiffies_stall, | ||
461 | jiffies + 3 * rcu_jiffies_till_stall_check() + 3); | ||
462 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | ||
463 | |||
464 | panic_on_rcu_stall(); | ||
465 | |||
466 | /* | ||
467 | * Attempt to revive the RCU machinery by forcing a context switch. | ||
468 | * | ||
469 | * A context switch would normally allow the RCU state machine to make | ||
470 | * progress and it could be we're stuck in kernel space without context | ||
471 | * switches for an entirely unreasonable amount of time. | ||
472 | */ | ||
473 | set_tsk_need_resched(current); | ||
474 | set_preempt_need_resched(); | ||
475 | } | ||
476 | |||
477 | static void check_cpu_stall(struct rcu_data *rdp) | ||
478 | { | ||
479 | unsigned long gs1; | ||
480 | unsigned long gs2; | ||
481 | unsigned long gps; | ||
482 | unsigned long j; | ||
483 | unsigned long jn; | ||
484 | unsigned long js; | ||
485 | struct rcu_node *rnp; | ||
486 | |||
487 | if ((rcu_cpu_stall_suppress && !rcu_kick_kthreads) || | ||
488 | !rcu_gp_in_progress()) | ||
489 | return; | ||
490 | rcu_stall_kick_kthreads(); | ||
491 | j = jiffies; | ||
492 | |||
493 | /* | ||
494 | * Lots of memory barriers to reject false positives. | ||
495 | * | ||
496 | * The idea is to pick up rcu_state.gp_seq, then | ||
497 | * rcu_state.jiffies_stall, then rcu_state.gp_start, and finally | ||
498 | * another copy of rcu_state.gp_seq. These values are updated in | ||
499 | * the opposite order with memory barriers (or equivalent) during | ||
500 | * grace-period initialization and cleanup. Now, a false positive | ||
502 | * can occur if we get a new value of rcu_state.gp_start and an old | ||
502 | * value of rcu_state.jiffies_stall. But given the memory barriers, | ||
503 | * the only way that this can happen is if one grace period ends | ||
504 | * and another starts between these two fetches. This is detected | ||
505 | * by comparing the second fetch of rcu_state.gp_seq with the | ||
506 | * previous fetch from rcu_state.gp_seq. | ||
507 | * | ||
508 | * Given this check, comparisons of jiffies, rcu_state.jiffies_stall, | ||
509 | * and rcu_state.gp_start suffice to forestall false positives. | ||
510 | */ | ||
511 | gs1 = READ_ONCE(rcu_state.gp_seq); | ||
512 | smp_rmb(); /* Pick up ->gp_seq first... */ | ||
513 | js = READ_ONCE(rcu_state.jiffies_stall); | ||
514 | smp_rmb(); /* ...then ->jiffies_stall before the rest... */ | ||
515 | gps = READ_ONCE(rcu_state.gp_start); | ||
516 | smp_rmb(); /* ...and finally ->gp_start before ->gp_seq again. */ | ||
517 | gs2 = READ_ONCE(rcu_state.gp_seq); | ||
518 | if (gs1 != gs2 || | ||
519 | ULONG_CMP_LT(j, js) || | ||
520 | ULONG_CMP_GE(gps, js)) | ||
521 | return; /* No stall or GP completed since entering function. */ | ||
522 | rnp = rdp->mynode; | ||
523 | jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; | ||
524 | if (rcu_gp_in_progress() && | ||
525 | (READ_ONCE(rnp->qsmask) & rdp->grpmask) && | ||
526 | cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { | ||
527 | |||
528 | /* We haven't checked in, so go dump stack. */ | ||
529 | print_cpu_stall(); | ||
530 | |||
531 | } else if (rcu_gp_in_progress() && | ||
532 | ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) && | ||
533 | cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { | ||
534 | |||
535 | /* They had a few time units to dump stack, so complain. */ | ||
536 | print_other_cpu_stall(gs2); | ||
537 | } | ||
538 | } | ||
539 | |||
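check_cpu_stall() rejects false positives by re-reading ->gp_seq around the other samples and lets only one CPU report by swapping ->jiffies_stall forward. A hedged userspace analogue using C11 acquire loads and a compare-and-swap in place of smp_rmb() and cmpxchg(), and omitting the ->gp_start check for brevity:

/* stall_check.c - simplified analogue of the sample/re-check/claim pattern. */
#include <stdatomic.h>
#include <stdio.h>

static atomic_ulong gp_seq;
static atomic_ulong jiffies_stall;

static int check_stall(unsigned long jiffies_now, unsigned long next_deadline)
{
        unsigned long gs1, gs2, js;

        gs1 = atomic_load_explicit(&gp_seq, memory_order_acquire);
        js  = atomic_load_explicit(&jiffies_stall, memory_order_acquire);
        gs2 = atomic_load_explicit(&gp_seq, memory_order_acquire);

        if (gs1 != gs2 || (long)(jiffies_now - js) < 0)
                return 0;       /* GP turned over, or deadline not reached */

        /* Only the thread that swaps the deadline gets to print the report,
         * mirroring cmpxchg(&rcu_state.jiffies_stall, js, jn) == js. */
        if (atomic_compare_exchange_strong(&jiffies_stall, &js, next_deadline)) {
                printf("INFO: stall detected at %lu\n", jiffies_now);
                return 1;
        }
        return 0;
}

int main(void)
{
        atomic_store(&gp_seq, 8);
        atomic_store(&jiffies_stall, 1000);

        check_stall(900, 2000);         /* too early: nothing printed    */
        check_stall(1500, 2000);        /* past the deadline: reported   */
        check_stall(1600, 2000);        /* deadline already moved: quiet */
        return 0;
}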
540 | ////////////////////////////////////////////////////////////////////////////// | ||
541 | // | ||
542 | // RCU forward-progress mechanisms, including of callback invocation. | ||
543 | |||
544 | |||
545 | /* | ||
546 | * Show the state of the grace-period kthreads. | ||
547 | */ | ||
548 | void show_rcu_gp_kthreads(void) | ||
549 | { | ||
550 | int cpu; | ||
551 | unsigned long j; | ||
552 | unsigned long ja; | ||
553 | unsigned long jr; | ||
554 | unsigned long jw; | ||
555 | struct rcu_data *rdp; | ||
556 | struct rcu_node *rnp; | ||
557 | |||
558 | j = jiffies; | ||
559 | ja = j - READ_ONCE(rcu_state.gp_activity); | ||
560 | jr = j - READ_ONCE(rcu_state.gp_req_activity); | ||
561 | jw = j - READ_ONCE(rcu_state.gp_wake_time); | ||
562 | pr_info("%s: wait state: %s(%d) ->state: %#lx delta ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_flags %#x\n", | ||
563 | rcu_state.name, gp_state_getname(rcu_state.gp_state), | ||
564 | rcu_state.gp_state, | ||
565 | rcu_state.gp_kthread ? rcu_state.gp_kthread->state : 0x1ffffL, | ||
566 | ja, jr, jw, (long)READ_ONCE(rcu_state.gp_wake_seq), | ||
567 | (long)READ_ONCE(rcu_state.gp_seq), | ||
568 | (long)READ_ONCE(rcu_get_root()->gp_seq_needed), | ||
569 | READ_ONCE(rcu_state.gp_flags)); | ||
570 | rcu_for_each_node_breadth_first(rnp) { | ||
571 | if (ULONG_CMP_GE(rcu_state.gp_seq, rnp->gp_seq_needed)) | ||
572 | continue; | ||
573 | pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld\n", | ||
574 | rnp->grplo, rnp->grphi, (long)rnp->gp_seq, | ||
575 | (long)rnp->gp_seq_needed); | ||
576 | if (!rcu_is_leaf_node(rnp)) | ||
577 | continue; | ||
578 | for_each_leaf_node_possible_cpu(rnp, cpu) { | ||
579 | rdp = per_cpu_ptr(&rcu_data, cpu); | ||
580 | if (rdp->gpwrap || | ||
581 | ULONG_CMP_GE(rcu_state.gp_seq, | ||
582 | rdp->gp_seq_needed)) | ||
583 | continue; | ||
584 | pr_info("\tcpu %d ->gp_seq_needed %ld\n", | ||
585 | cpu, (long)rdp->gp_seq_needed); | ||
586 | } | ||
587 | } | ||
588 | /* sched_show_task(rcu_state.gp_kthread); */ | ||
589 | } | ||
590 | EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads); | ||
591 | |||
592 | /* | ||
593 | * This function checks for grace-period requests that fail to motivate | ||
594 | * RCU to come out of its idle mode. | ||
595 | */ | ||
596 | static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp, | ||
597 | const unsigned long gpssdelay) | ||
598 | { | ||
599 | unsigned long flags; | ||
600 | unsigned long j; | ||
601 | struct rcu_node *rnp_root = rcu_get_root(); | ||
602 | static atomic_t warned = ATOMIC_INIT(0); | ||
603 | |||
604 | if (!IS_ENABLED(CONFIG_PROVE_RCU) || rcu_gp_in_progress() || | ||
605 | ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed)) | ||
606 | return; | ||
607 | j = jiffies; /* Expensive access, and in common case don't get here. */ | ||
608 | if (time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) || | ||
609 | time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) || | ||
610 | atomic_read(&warned)) | ||
611 | return; | ||
612 | |||
613 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | ||
614 | j = jiffies; | ||
615 | if (rcu_gp_in_progress() || | ||
616 | ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) || | ||
617 | time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) || | ||
618 | time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) || | ||
619 | atomic_read(&warned)) { | ||
620 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | ||
621 | return; | ||
622 | } | ||
623 | /* Hold onto the leaf lock to make others see warned==1. */ | ||
624 | |||
625 | if (rnp_root != rnp) | ||
626 | raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */ | ||
627 | j = jiffies; | ||
628 | if (rcu_gp_in_progress() || | ||
629 | ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) || | ||
630 | time_before(j, rcu_state.gp_req_activity + gpssdelay) || | ||
631 | time_before(j, rcu_state.gp_activity + gpssdelay) || | ||
632 | atomic_xchg(&warned, 1)) { | ||
633 | raw_spin_unlock_rcu_node(rnp_root); /* irqs remain disabled. */ | ||
634 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | ||
635 | return; | ||
636 | } | ||
637 | WARN_ON(1); | ||
638 | if (rnp_root != rnp) | ||
639 | raw_spin_unlock_rcu_node(rnp_root); | ||
640 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | ||
641 | show_rcu_gp_kthreads(); | ||
642 | } | ||
643 | |||
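rcu_check_gp_start_stall() combines a cheap early test with a locked re-check and an atomic_xchg() on a static flag so the WARN fires at most once even when several CPUs race into it. A stripped-down sketch of that warn-once pattern:

/* warn_once.c - check, re-check, then exchange the flag; prints exactly once. */
#include <stdatomic.h>
#include <stdio.h>

static atomic_int warned;

static void maybe_warn(const char *who)
{
        if (atomic_load(&warned))
                return;                         /* cheap early exit           */
        /* ...locks taken and conditions re-evaluated here in the kernel...   */
        if (atomic_exchange(&warned, 1))
                return;                         /* somebody else beat us      */
        printf("WARNING raised by %s\n", who);  /* reached by only one caller */
}

int main(void)
{
        maybe_warn("cpu0");
        maybe_warn("cpu1");
        return 0;
}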
644 | /* | ||
645 | * Do a forward-progress check for rcutorture. This is normally invoked | ||
646 | * due to an OOM event. The argument "j" gives the time period during | ||
647 | * which rcutorture would like progress to have been made. | ||
648 | */ | ||
649 | void rcu_fwd_progress_check(unsigned long j) | ||
650 | { | ||
651 | unsigned long cbs; | ||
652 | int cpu; | ||
653 | unsigned long max_cbs = 0; | ||
654 | int max_cpu = -1; | ||
655 | struct rcu_data *rdp; | ||
656 | |||
657 | if (rcu_gp_in_progress()) { | ||
658 | pr_info("%s: GP age %lu jiffies\n", | ||
659 | __func__, jiffies - rcu_state.gp_start); | ||
660 | show_rcu_gp_kthreads(); | ||
661 | } else { | ||
662 | pr_info("%s: Last GP end %lu jiffies ago\n", | ||
663 | __func__, jiffies - rcu_state.gp_end); | ||
664 | preempt_disable(); | ||
665 | rdp = this_cpu_ptr(&rcu_data); | ||
666 | rcu_check_gp_start_stall(rdp->mynode, rdp, j); | ||
667 | preempt_enable(); | ||
668 | } | ||
669 | for_each_possible_cpu(cpu) { | ||
670 | cbs = rcu_get_n_cbs_cpu(cpu); | ||
671 | if (!cbs) | ||
672 | continue; | ||
673 | if (max_cpu < 0) | ||
674 | pr_info("%s: callbacks", __func__); | ||
675 | pr_cont(" %d: %lu", cpu, cbs); | ||
676 | if (cbs <= max_cbs) | ||
677 | continue; | ||
678 | max_cbs = cbs; | ||
679 | max_cpu = cpu; | ||
680 | } | ||
681 | if (max_cpu >= 0) | ||
682 | pr_cont("\n"); | ||
683 | } | ||
684 | EXPORT_SYMBOL_GPL(rcu_fwd_progress_check); | ||
685 | |||
686 | /* Commandeer a sysrq key to dump RCU's tree. */ | ||
687 | static bool sysrq_rcu; | ||
688 | module_param(sysrq_rcu, bool, 0444); | ||
689 | |||
690 | /* Dump grace-period-request information due to commandeered sysrq. */ | ||
691 | static void sysrq_show_rcu(int key) | ||
692 | { | ||
693 | show_rcu_gp_kthreads(); | ||
694 | } | ||
695 | |||
696 | static struct sysrq_key_op sysrq_rcudump_op = { | ||
697 | .handler = sysrq_show_rcu, | ||
698 | .help_msg = "show-rcu(y)", | ||
699 | .action_msg = "Show RCU tree", | ||
700 | .enable_mask = SYSRQ_ENABLE_DUMP, | ||
701 | }; | ||
702 | |||
703 | static int __init rcu_sysrq_init(void) | ||
704 | { | ||
705 | if (sysrq_rcu) | ||
706 | return register_sysrq_key('y', &sysrq_rcudump_op); | ||
707 | return 0; | ||
708 | } | ||
709 | early_initcall(rcu_sysrq_init); | ||
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index cbaa976c5945..c3bf44ba42e5 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c | |||
@@ -424,68 +424,11 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read); | |||
424 | #endif | 424 | #endif |
425 | 425 | ||
426 | #ifdef CONFIG_RCU_STALL_COMMON | 426 | #ifdef CONFIG_RCU_STALL_COMMON |
427 | |||
428 | #ifdef CONFIG_PROVE_RCU | ||
429 | #define RCU_STALL_DELAY_DELTA (5 * HZ) | ||
430 | #else | ||
431 | #define RCU_STALL_DELAY_DELTA 0 | ||
432 | #endif | ||
433 | |||
434 | int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ | 427 | int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ |
435 | EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress); | 428 | EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress); |
436 | static int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; | ||
437 | |||
438 | module_param(rcu_cpu_stall_suppress, int, 0644); | 429 | module_param(rcu_cpu_stall_suppress, int, 0644); |
430 | int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; | ||
439 | module_param(rcu_cpu_stall_timeout, int, 0644); | 431 | module_param(rcu_cpu_stall_timeout, int, 0644); |
440 | |||
441 | int rcu_jiffies_till_stall_check(void) | ||
442 | { | ||
443 | int till_stall_check = READ_ONCE(rcu_cpu_stall_timeout); | ||
444 | |||
445 | /* | ||
446 | * Limit check must be consistent with the Kconfig limits | ||
447 | * for CONFIG_RCU_CPU_STALL_TIMEOUT. | ||
448 | */ | ||
449 | if (till_stall_check < 3) { | ||
450 | WRITE_ONCE(rcu_cpu_stall_timeout, 3); | ||
451 | till_stall_check = 3; | ||
452 | } else if (till_stall_check > 300) { | ||
453 | WRITE_ONCE(rcu_cpu_stall_timeout, 300); | ||
454 | till_stall_check = 300; | ||
455 | } | ||
456 | return till_stall_check * HZ + RCU_STALL_DELAY_DELTA; | ||
457 | } | ||
458 | EXPORT_SYMBOL_GPL(rcu_jiffies_till_stall_check); | ||
459 | |||
460 | void rcu_sysrq_start(void) | ||
461 | { | ||
462 | if (!rcu_cpu_stall_suppress) | ||
463 | rcu_cpu_stall_suppress = 2; | ||
464 | } | ||
465 | |||
466 | void rcu_sysrq_end(void) | ||
467 | { | ||
468 | if (rcu_cpu_stall_suppress == 2) | ||
469 | rcu_cpu_stall_suppress = 0; | ||
470 | } | ||
471 | |||
472 | static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) | ||
473 | { | ||
474 | rcu_cpu_stall_suppress = 1; | ||
475 | return NOTIFY_DONE; | ||
476 | } | ||
477 | |||
478 | static struct notifier_block rcu_panic_block = { | ||
479 | .notifier_call = rcu_panic, | ||
480 | }; | ||
481 | |||
482 | static int __init check_cpu_stall_init(void) | ||
483 | { | ||
484 | atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block); | ||
485 | return 0; | ||
486 | } | ||
487 | early_initcall(check_cpu_stall_init); | ||
488 | |||
489 | #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ | 432 | #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ |
490 | 433 | ||
491 | #ifdef CONFIG_TASKS_RCU | 434 | #ifdef CONFIG_TASKS_RCU |
diff --git a/kernel/resource.c b/kernel/resource.c index 92190f62ebc5..8c15f846e8ef 100644 --- a/kernel/resource.c +++ b/kernel/resource.c | |||
@@ -520,21 +520,20 @@ EXPORT_SYMBOL_GPL(page_is_ram); | |||
520 | int region_intersects(resource_size_t start, size_t size, unsigned long flags, | 520 | int region_intersects(resource_size_t start, size_t size, unsigned long flags, |
521 | unsigned long desc) | 521 | unsigned long desc) |
522 | { | 522 | { |
523 | resource_size_t end = start + size - 1; | 523 | struct resource res; |
524 | int type = 0; int other = 0; | 524 | int type = 0; int other = 0; |
525 | struct resource *p; | 525 | struct resource *p; |
526 | 526 | ||
527 | res.start = start; | ||
528 | res.end = start + size - 1; | ||
529 | |||
527 | read_lock(&resource_lock); | 530 | read_lock(&resource_lock); |
528 | for (p = iomem_resource.child; p ; p = p->sibling) { | 531 | for (p = iomem_resource.child; p ; p = p->sibling) { |
529 | bool is_type = (((p->flags & flags) == flags) && | 532 | bool is_type = (((p->flags & flags) == flags) && |
530 | ((desc == IORES_DESC_NONE) || | 533 | ((desc == IORES_DESC_NONE) || |
531 | (desc == p->desc))); | 534 | (desc == p->desc))); |
532 | 535 | ||
533 | if (start >= p->start && start <= p->end) | 536 | if (resource_overlaps(p, &res)) |
534 | is_type ? type++ : other++; | ||
535 | if (end >= p->start && end <= p->end) | ||
536 | is_type ? type++ : other++; | ||
537 | if (p->start >= start && p->end <= end) | ||
538 | is_type ? type++ : other++; | 537 | is_type ? type++ : other++; |
539 | } | 538 | } |
540 | read_unlock(&resource_lock); | 539 | read_unlock(&resource_lock); |
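The new region_intersects() replaces three separate range tests with a single resource_overlaps()-style check; as the removed lines show, the old tests could also bump the counters more than once for a region nested inside a resource, whereas the new test counts each overlapping resource exactly once. An illustrative comparison of the two predicates over inclusive ranges:

/* overlap.c - sketch, not kernel code: one overlap test covers all three
 * old cases (start inside p, end inside p, p entirely inside the region). */
#include <stdio.h>

struct range { unsigned long start, end; };     /* inclusive bounds */

static int overlaps(const struct range *a, const struct range *b)
{
        return a->start <= b->end && b->start <= a->end;
}

static int old_style(const struct range *p, const struct range *r)
{
        return (r->start >= p->start && r->start <= p->end) ||
               (r->end   >= p->start && r->end   <= p->end) ||
               (p->start >= r->start && p->end   <= r->end);
}

int main(void)
{
        struct range p = { 100, 200 };
        struct range cases[] = { {50, 150}, {150, 250}, {50, 250},
                                 {120, 130}, {10, 20}, {300, 400} };

        for (unsigned i = 0; i < sizeof(cases) / sizeof(cases[0]); i++)
                printf("case %u: old=%d new=%d\n", i,
                       old_style(&p, &cases[i]), overlaps(&p, &cases[i]));
        return 0;
}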
diff --git a/kernel/rseq.c b/kernel/rseq.c index 25e9a7b60eba..9424ee90589e 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c | |||
@@ -254,8 +254,7 @@ static int rseq_ip_fixup(struct pt_regs *regs) | |||
254 | * - signal delivery, | 254 | * - signal delivery, |
255 | * and return to user-space. | 255 | * and return to user-space. |
256 | * | 256 | * |
257 | * This is how we can ensure that the entire rseq critical section, | 257 | * This is how we can ensure that the entire rseq critical section |
258 | * consisting of both the C part and the assembly instruction sequence, | ||
259 | * will issue the commit instruction only if executed atomically with | 258 | * will issue the commit instruction only if executed atomically with |
260 | * respect to other threads scheduled on the same CPU, and with respect | 259 | * respect to other threads scheduled on the same CPU, and with respect |
261 | * to signal handlers. | 260 | * to signal handlers. |
@@ -314,7 +313,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, | |||
314 | /* Unregister rseq for current thread. */ | 313 | /* Unregister rseq for current thread. */ |
315 | if (current->rseq != rseq || !current->rseq) | 314 | if (current->rseq != rseq || !current->rseq) |
316 | return -EINVAL; | 315 | return -EINVAL; |
317 | if (current->rseq_len != rseq_len) | 316 | if (rseq_len != sizeof(*rseq)) |
318 | return -EINVAL; | 317 | return -EINVAL; |
319 | if (current->rseq_sig != sig) | 318 | if (current->rseq_sig != sig) |
320 | return -EPERM; | 319 | return -EPERM; |
@@ -322,7 +321,6 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, | |||
322 | if (ret) | 321 | if (ret) |
323 | return ret; | 322 | return ret; |
324 | current->rseq = NULL; | 323 | current->rseq = NULL; |
325 | current->rseq_len = 0; | ||
326 | current->rseq_sig = 0; | 324 | current->rseq_sig = 0; |
327 | return 0; | 325 | return 0; |
328 | } | 326 | } |
@@ -336,7 +334,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, | |||
336 | * the provided address differs from the prior | 334 | * the provided address differs from the prior |
337 | * one. | 335 | * one. |
338 | */ | 336 | */ |
339 | if (current->rseq != rseq || current->rseq_len != rseq_len) | 337 | if (current->rseq != rseq || rseq_len != sizeof(*rseq)) |
340 | return -EINVAL; | 338 | return -EINVAL; |
341 | if (current->rseq_sig != sig) | 339 | if (current->rseq_sig != sig) |
342 | return -EPERM; | 340 | return -EPERM; |
@@ -354,7 +352,6 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, | |||
354 | if (!access_ok(rseq, rseq_len)) | 352 | if (!access_ok(rseq, rseq_len)) |
355 | return -EFAULT; | 353 | return -EFAULT; |
356 | current->rseq = rseq; | 354 | current->rseq = rseq; |
357 | current->rseq_len = rseq_len; | ||
358 | current->rseq_sig = sig; | 355 | current->rseq_sig = sig; |
359 | /* | 356 | /* |
360 | * If rseq was previously inactive, and has just been | 357 | * If rseq was previously inactive, and has just been |
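With the per-thread rseq_len field gone, both the unregister and re-register paths above now require the length argument to equal sizeof(struct rseq). A hedged sketch of that validation; the structure layout and helper below are placeholders for illustration, not the real ABI or syscall:

/* rseq_check.c - illustrative length check only; not the kernel's rseq code. */
#include <stdint.h>
#include <stdio.h>
#include <errno.h>

struct rseq_abi {                       /* placeholder layout, assumption */
        uint32_t cpu_id_start;
        uint32_t cpu_id;
        uint64_t rseq_cs;
        uint32_t flags;
} __attribute__((aligned(32)));

static int rseq_register(const void *uaddr, uint32_t rseq_len)
{
        if (!uaddr)
                return -EINVAL;
        if (rseq_len != sizeof(struct rseq_abi))
                return -EINVAL;         /* was: compared against ->rseq_len */
        return 0;
}

int main(void)
{
        struct rseq_abi area;

        printf("%d\n", rseq_register(&area, sizeof(area)));    /* 0       */
        printf("%d\n", rseq_register(&area, 16));               /* -EINVAL */
        return 0;
}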
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ead464a0f2e5..102dfcf0a29a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -792,10 +792,14 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags) | |||
792 | rq->nr_uninterruptible--; | 792 | rq->nr_uninterruptible--; |
793 | 793 | ||
794 | enqueue_task(rq, p, flags); | 794 | enqueue_task(rq, p, flags); |
795 | |||
796 | p->on_rq = TASK_ON_RQ_QUEUED; | ||
795 | } | 797 | } |
796 | 798 | ||
797 | void deactivate_task(struct rq *rq, struct task_struct *p, int flags) | 799 | void deactivate_task(struct rq *rq, struct task_struct *p, int flags) |
798 | { | 800 | { |
801 | p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING; | ||
802 | |||
799 | if (task_contributes_to_load(p)) | 803 | if (task_contributes_to_load(p)) |
800 | rq->nr_uninterruptible++; | 804 | rq->nr_uninterruptible++; |
801 | 805 | ||
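activate_task() and deactivate_task() now own the ->on_rq transitions themselves, so call sites such as __migrate_swap_task() below no longer flip the state by hand. A small illustrative sketch of hoisting the state update into the helpers, using stand-in types rather than the scheduler's:

/* on_rq.c - sketch: callers cannot forget or misorder the state flip. */
#include <stdio.h>

enum { TASK_OFF_RQ = 0, TASK_ON_RQ_QUEUED = 1, TASK_ON_RQ_MIGRATING = 2 };

struct task { int on_rq; };

static void activate_task(struct task *p)
{
        /* ...enqueue into the runqueue structures... */
        p->on_rq = TASK_ON_RQ_QUEUED;           /* now done by the helper */
}

static void deactivate_task(struct task *p, int sleeping)
{
        p->on_rq = sleeping ? TASK_OFF_RQ : TASK_ON_RQ_MIGRATING;
        /* ...dequeue from the runqueue structures... */
}

int main(void)
{
        struct task p = { TASK_OFF_RQ };

        activate_task(&p);
        printf("%d\n", p.on_rq);        /* 1: queued    */
        deactivate_task(&p, 0);
        printf("%d\n", p.on_rq);        /* 2: migrating */
        return 0;
}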
@@ -920,7 +924,7 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) | |||
920 | } | 924 | } |
921 | 925 | ||
922 | /* | 926 | /* |
923 | * Per-CPU kthreads are allowed to run on !actie && online CPUs, see | 927 | * Per-CPU kthreads are allowed to run on !active && online CPUs, see |
924 | * __set_cpus_allowed_ptr() and select_fallback_rq(). | 928 | * __set_cpus_allowed_ptr() and select_fallback_rq(). |
925 | */ | 929 | */ |
926 | static inline bool is_cpu_allowed(struct task_struct *p, int cpu) | 930 | static inline bool is_cpu_allowed(struct task_struct *p, int cpu) |
@@ -1151,7 +1155,6 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, | |||
1151 | /* Need help from migration thread: drop lock and wait. */ | 1155 | /* Need help from migration thread: drop lock and wait. */ |
1152 | task_rq_unlock(rq, p, &rf); | 1156 | task_rq_unlock(rq, p, &rf); |
1153 | stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); | 1157 | stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); |
1154 | tlb_migrate_finish(p->mm); | ||
1155 | return 0; | 1158 | return 0; |
1156 | } else if (task_on_rq_queued(p)) { | 1159 | } else if (task_on_rq_queued(p)) { |
1157 | /* | 1160 | /* |
@@ -1237,11 +1240,9 @@ static void __migrate_swap_task(struct task_struct *p, int cpu) | |||
1237 | rq_pin_lock(src_rq, &srf); | 1240 | rq_pin_lock(src_rq, &srf); |
1238 | rq_pin_lock(dst_rq, &drf); | 1241 | rq_pin_lock(dst_rq, &drf); |
1239 | 1242 | ||
1240 | p->on_rq = TASK_ON_RQ_MIGRATING; | ||
1241 | deactivate_task(src_rq, p, 0); | 1243 | deactivate_task(src_rq, p, 0); |
1242 | set_task_cpu(p, cpu); | 1244 | set_task_cpu(p, cpu); |
1243 | activate_task(dst_rq, p, 0); | 1245 | activate_task(dst_rq, p, 0); |
1244 | p->on_rq = TASK_ON_RQ_QUEUED; | ||
1245 | check_preempt_curr(dst_rq, p, 0); | 1246 | check_preempt_curr(dst_rq, p, 0); |
1246 | 1247 | ||
1247 | rq_unpin_lock(dst_rq, &drf); | 1248 | rq_unpin_lock(dst_rq, &drf); |
@@ -1681,16 +1682,6 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) | |||
1681 | __schedstat_inc(p->se.statistics.nr_wakeups_sync); | 1682 | __schedstat_inc(p->se.statistics.nr_wakeups_sync); |
1682 | } | 1683 | } |
1683 | 1684 | ||
1684 | static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) | ||
1685 | { | ||
1686 | activate_task(rq, p, en_flags); | ||
1687 | p->on_rq = TASK_ON_RQ_QUEUED; | ||
1688 | |||
1689 | /* If a worker is waking up, notify the workqueue: */ | ||
1690 | if (p->flags & PF_WQ_WORKER) | ||
1691 | wq_worker_waking_up(p, cpu_of(rq)); | ||
1692 | } | ||
1693 | |||
1694 | /* | 1685 | /* |
1695 | * Mark the task runnable and perform wakeup-preemption. | 1686 | * Mark the task runnable and perform wakeup-preemption. |
1696 | */ | 1687 | */ |
@@ -1742,7 +1733,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, | |||
1742 | en_flags |= ENQUEUE_MIGRATED; | 1733 | en_flags |= ENQUEUE_MIGRATED; |
1743 | #endif | 1734 | #endif |
1744 | 1735 | ||
1745 | ttwu_activate(rq, p, en_flags); | 1736 | activate_task(rq, p, en_flags); |
1746 | ttwu_do_wakeup(rq, p, wake_flags, rf); | 1737 | ttwu_do_wakeup(rq, p, wake_flags, rf); |
1747 | } | 1738 | } |
1748 | 1739 | ||
@@ -2107,56 +2098,6 @@ out: | |||
2107 | } | 2098 | } |
2108 | 2099 | ||
2109 | /** | 2100 | /** |
2110 | * try_to_wake_up_local - try to wake up a local task with rq lock held | ||
2111 | * @p: the thread to be awakened | ||
2112 | * @rf: request-queue flags for pinning | ||
2113 | * | ||
2114 | * Put @p on the run-queue if it's not already there. The caller must | ||
2115 | * ensure that this_rq() is locked, @p is bound to this_rq() and not | ||
2116 | * the current task. | ||
2117 | */ | ||
2118 | static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf) | ||
2119 | { | ||
2120 | struct rq *rq = task_rq(p); | ||
2121 | |||
2122 | if (WARN_ON_ONCE(rq != this_rq()) || | ||
2123 | WARN_ON_ONCE(p == current)) | ||
2124 | return; | ||
2125 | |||
2126 | lockdep_assert_held(&rq->lock); | ||
2127 | |||
2128 | if (!raw_spin_trylock(&p->pi_lock)) { | ||
2129 | /* | ||
2130 | * This is OK, because current is on_cpu, which avoids it being | ||
2131 | * picked for load-balance and preemption/IRQs are still | ||
2132 | * disabled avoiding further scheduler activity on it and we've | ||
2133 | * not yet picked a replacement task. | ||
2134 | */ | ||
2135 | rq_unlock(rq, rf); | ||
2136 | raw_spin_lock(&p->pi_lock); | ||
2137 | rq_relock(rq, rf); | ||
2138 | } | ||
2139 | |||
2140 | if (!(p->state & TASK_NORMAL)) | ||
2141 | goto out; | ||
2142 | |||
2143 | trace_sched_waking(p); | ||
2144 | |||
2145 | if (!task_on_rq_queued(p)) { | ||
2146 | if (p->in_iowait) { | ||
2147 | delayacct_blkio_end(p); | ||
2148 | atomic_dec(&rq->nr_iowait); | ||
2149 | } | ||
2150 | ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK); | ||
2151 | } | ||
2152 | |||
2153 | ttwu_do_wakeup(rq, p, 0, rf); | ||
2154 | ttwu_stat(p, smp_processor_id(), 0); | ||
2155 | out: | ||
2156 | raw_spin_unlock(&p->pi_lock); | ||
2157 | } | ||
2158 | |||
2159 | /** | ||
2160 | * wake_up_process - Wake up a specific process | 2101 | * wake_up_process - Wake up a specific process |
2161 | * @p: The process to be woken up. | 2102 | * @p: The process to be woken up. |
2162 | * | 2103 | * |
@@ -2467,7 +2408,6 @@ void wake_up_new_task(struct task_struct *p) | |||
2467 | post_init_entity_util_avg(p); | 2408 | post_init_entity_util_avg(p); |
2468 | 2409 | ||
2469 | activate_task(rq, p, ENQUEUE_NOCLOCK); | 2410 | activate_task(rq, p, ENQUEUE_NOCLOCK); |
2470 | p->on_rq = TASK_ON_RQ_QUEUED; | ||
2471 | trace_sched_wakeup_new(p); | 2411 | trace_sched_wakeup_new(p); |
2472 | check_preempt_curr(rq, p, WF_FORK); | 2412 | check_preempt_curr(rq, p, WF_FORK); |
2473 | #ifdef CONFIG_SMP | 2413 | #ifdef CONFIG_SMP |
@@ -3466,25 +3406,11 @@ static void __sched notrace __schedule(bool preempt) | |||
3466 | prev->state = TASK_RUNNING; | 3406 | prev->state = TASK_RUNNING; |
3467 | } else { | 3407 | } else { |
3468 | deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK); | 3408 | deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK); |
3469 | prev->on_rq = 0; | ||
3470 | 3409 | ||
3471 | if (prev->in_iowait) { | 3410 | if (prev->in_iowait) { |
3472 | atomic_inc(&rq->nr_iowait); | 3411 | atomic_inc(&rq->nr_iowait); |
3473 | delayacct_blkio_start(); | 3412 | delayacct_blkio_start(); |
3474 | } | 3413 | } |
3475 | |||
3476 | /* | ||
3477 | * If a worker went to sleep, notify and ask workqueue | ||
3478 | * whether it wants to wake up a task to maintain | ||
3479 | * concurrency. | ||
3480 | */ | ||
3481 | if (prev->flags & PF_WQ_WORKER) { | ||
3482 | struct task_struct *to_wakeup; | ||
3483 | |||
3484 | to_wakeup = wq_worker_sleeping(prev); | ||
3485 | if (to_wakeup) | ||
3486 | try_to_wake_up_local(to_wakeup, &rf); | ||
3487 | } | ||
3488 | } | 3414 | } |
3489 | switch_count = &prev->nvcsw; | 3415 | switch_count = &prev->nvcsw; |
3490 | } | 3416 | } |
@@ -3544,6 +3470,20 @@ static inline void sched_submit_work(struct task_struct *tsk) | |||
3544 | { | 3470 | { |
3545 | if (!tsk->state || tsk_is_pi_blocked(tsk)) | 3471 | if (!tsk->state || tsk_is_pi_blocked(tsk)) |
3546 | return; | 3472 | return; |
3473 | |||
3474 | /* | ||
3475 | * If a worker went to sleep, notify and ask workqueue whether | ||
3476 | * it wants to wake up a task to maintain concurrency. | ||
3477 | * As this function is called inside the schedule() context, | ||
3478 | * we disable preemption to avoid it calling schedule() again | ||
3479 | * in the possible wakeup of a kworker. | ||
3480 | */ | ||
3481 | if (tsk->flags & PF_WQ_WORKER) { | ||
3482 | preempt_disable(); | ||
3483 | wq_worker_sleeping(tsk); | ||
3484 | preempt_enable_no_resched(); | ||
3485 | } | ||
3486 | |||
3547 | /* | 3487 | /* |
3548 | * If we are going to sleep and we have plugged IO queued, | 3488 | * If we are going to sleep and we have plugged IO queued, |
3549 | * make sure to submit it to avoid deadlocks. | 3489 | * make sure to submit it to avoid deadlocks. |
@@ -3552,6 +3492,12 @@ static inline void sched_submit_work(struct task_struct *tsk) | |||
3552 | blk_schedule_flush_plug(tsk); | 3492 | blk_schedule_flush_plug(tsk); |
3553 | } | 3493 | } |
3554 | 3494 | ||
3495 | static void sched_update_worker(struct task_struct *tsk) | ||
3496 | { | ||
3497 | if (tsk->flags & PF_WQ_WORKER) | ||
3498 | wq_worker_running(tsk); | ||
3499 | } | ||
3500 | |||
3555 | asmlinkage __visible void __sched schedule(void) | 3501 | asmlinkage __visible void __sched schedule(void) |
3556 | { | 3502 | { |
3557 | struct task_struct *tsk = current; | 3503 | struct task_struct *tsk = current; |
@@ -3562,6 +3508,7 @@ asmlinkage __visible void __sched schedule(void) | |||
3562 | __schedule(false); | 3508 | __schedule(false); |
3563 | sched_preempt_enable_no_resched(); | 3509 | sched_preempt_enable_no_resched(); |
3564 | } while (need_resched()); | 3510 | } while (need_resched()); |
3511 | sched_update_worker(tsk); | ||
3565 | } | 3512 | } |
3566 | EXPORT_SYMBOL(schedule); | 3513 | EXPORT_SYMBOL(schedule); |
3567 | 3514 | ||
@@ -5918,7 +5865,7 @@ void __init sched_init_smp(void) | |||
5918 | 5865 | ||
5919 | static int __init migration_init(void) | 5866 | static int __init migration_init(void) |
5920 | { | 5867 | { |
5921 | sched_rq_cpu_starting(smp_processor_id()); | 5868 | sched_cpu_starting(smp_processor_id()); |
5922 | return 0; | 5869 | return 0; |
5923 | } | 5870 | } |
5924 | early_initcall(migration_init); | 5871 | early_initcall(migration_init); |
@@ -6559,6 +6506,8 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset) | |||
6559 | static int cpu_shares_write_u64(struct cgroup_subsys_state *css, | 6506 | static int cpu_shares_write_u64(struct cgroup_subsys_state *css, |
6560 | struct cftype *cftype, u64 shareval) | 6507 | struct cftype *cftype, u64 shareval) |
6561 | { | 6508 | { |
6509 | if (shareval > scale_load_down(ULONG_MAX)) | ||
6510 | shareval = MAX_SHARES; | ||
6562 | return sched_group_set_shares(css_tg(css), scale_load(shareval)); | 6511 | return sched_group_set_shares(css_tg(css), scale_load(shareval)); |
6563 | } | 6512 | } |
6564 | 6513 | ||
@@ -6574,7 +6523,7 @@ static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css, | |||
6574 | static DEFINE_MUTEX(cfs_constraints_mutex); | 6523 | static DEFINE_MUTEX(cfs_constraints_mutex); |
6575 | 6524 | ||
6576 | const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */ | 6525 | const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */ |
6577 | const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ | 6526 | static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ |
6578 | 6527 | ||
6579 | static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); | 6528 | static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); |
6580 | 6529 | ||
@@ -6654,20 +6603,22 @@ out_unlock: | |||
6654 | return ret; | 6603 | return ret; |
6655 | } | 6604 | } |
6656 | 6605 | ||
6657 | int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) | 6606 | static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) |
6658 | { | 6607 | { |
6659 | u64 quota, period; | 6608 | u64 quota, period; |
6660 | 6609 | ||
6661 | period = ktime_to_ns(tg->cfs_bandwidth.period); | 6610 | period = ktime_to_ns(tg->cfs_bandwidth.period); |
6662 | if (cfs_quota_us < 0) | 6611 | if (cfs_quota_us < 0) |
6663 | quota = RUNTIME_INF; | 6612 | quota = RUNTIME_INF; |
6664 | else | 6613 | else if ((u64)cfs_quota_us <= U64_MAX / NSEC_PER_USEC) |
6665 | quota = (u64)cfs_quota_us * NSEC_PER_USEC; | 6614 | quota = (u64)cfs_quota_us * NSEC_PER_USEC; |
6615 | else | ||
6616 | return -EINVAL; | ||
6666 | 6617 | ||
6667 | return tg_set_cfs_bandwidth(tg, period, quota); | 6618 | return tg_set_cfs_bandwidth(tg, period, quota); |
6668 | } | 6619 | } |
6669 | 6620 | ||
6670 | long tg_get_cfs_quota(struct task_group *tg) | 6621 | static long tg_get_cfs_quota(struct task_group *tg) |
6671 | { | 6622 | { |
6672 | u64 quota_us; | 6623 | u64 quota_us; |
6673 | 6624 | ||
@@ -6680,17 +6631,20 @@ long tg_get_cfs_quota(struct task_group *tg) | |||
6680 | return quota_us; | 6631 | return quota_us; |
6681 | } | 6632 | } |
6682 | 6633 | ||
6683 | int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) | 6634 | static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) |
6684 | { | 6635 | { |
6685 | u64 quota, period; | 6636 | u64 quota, period; |
6686 | 6637 | ||
6638 | if ((u64)cfs_period_us > U64_MAX / NSEC_PER_USEC) | ||
6639 | return -EINVAL; | ||
6640 | |||
6687 | period = (u64)cfs_period_us * NSEC_PER_USEC; | 6641 | period = (u64)cfs_period_us * NSEC_PER_USEC; |
6688 | quota = tg->cfs_bandwidth.quota; | 6642 | quota = tg->cfs_bandwidth.quota; |
6689 | 6643 | ||
6690 | return tg_set_cfs_bandwidth(tg, period, quota); | 6644 | return tg_set_cfs_bandwidth(tg, period, quota); |
6691 | } | 6645 | } |
6692 | 6646 | ||
6693 | long tg_get_cfs_period(struct task_group *tg) | 6647 | static long tg_get_cfs_period(struct task_group *tg) |
6694 | { | 6648 | { |
6695 | u64 cfs_period_us; | 6649 | u64 cfs_period_us; |
6696 | 6650 | ||
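
The two hunks above reject microsecond values whose conversion to nanoseconds would overflow a u64. A minimal standalone sketch of that guard (plain userspace C; the helper us_to_ns_checked() is hypothetical and only stands in for the tg_set_cfs_quota()/tg_set_cfs_period() paths):

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_USEC 1000ULL

/* Convert microseconds to nanoseconds, refusing values that would
 * overflow a 64-bit counter -- the same check the hunks above add
 * before calling tg_set_cfs_bandwidth(). */
static int us_to_ns_checked(uint64_t us, uint64_t *ns)
{
	if (us > UINT64_MAX / NSEC_PER_USEC)
		return -1;		/* would overflow: reject, like -EINVAL */
	*ns = us * NSEC_PER_USEC;
	return 0;
}

int main(void)
{
	uint64_t ns = 0;

	printf("100000 us accepted? %d\n", us_to_ns_checked(100000, &ns));
	/* One more than U64_MAX / 1000 microseconds cannot be represented. */
	printf("huge value accepted? %d\n",
	       us_to_ns_checked(UINT64_MAX / NSEC_PER_USEC + 1, &ns));
	return 0;
}
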
@@ -6998,7 +6952,7 @@ static int __maybe_unused cpu_period_quota_parse(char *buf, | |||
6998 | { | 6952 | { |
6999 | char tok[21]; /* U64_MAX */ | 6953 | char tok[21]; /* U64_MAX */ |
7000 | 6954 | ||
7001 | if (!sscanf(buf, "%s %llu", tok, periodp)) | 6955 | if (sscanf(buf, "%20s %llu", tok, periodp) < 1) |
7002 | return -EINVAL; | 6956 | return -EINVAL; |
7003 | 6957 | ||
7004 | *periodp *= NSEC_PER_USEC; | 6958 | *periodp *= NSEC_PER_USEC; |
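
The parser fix above bounds the %s conversion to the size of tok[21] and checks how many fields actually matched. A small userspace illustration of the same pattern (not the kernel code itself):

#include <stdio.h>

/* Parse "<token> <number>" the way cpu_period_quota_parse() now does:
 * the %20s width keeps sscanf() from writing past tok[21], and the
 * return value (number of fields converted) is checked explicitly. */
int main(void)
{
	char tok[21];
	unsigned long long period;
	int n;

	n = sscanf("max 100000", "%20s %llu", tok, &period);
	if (n < 1) {
		fprintf(stderr, "parse error\n");
		return 1;
	}
	printf("tok=%s fields=%d period=%llu\n",
	       tok, n, n >= 2 ? period : 0ULL);
	return 0;
}
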
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c index 835671f0f917..b5dcd1d83c7f 100644 --- a/kernel/sched/cpufreq.c +++ b/kernel/sched/cpufreq.c | |||
@@ -7,7 +7,7 @@ | |||
7 | */ | 7 | */ |
8 | #include "sched.h" | 8 | #include "sched.h" |
9 | 9 | ||
10 | DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); | 10 | DEFINE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); |
11 | 11 | ||
12 | /** | 12 | /** |
13 | * cpufreq_add_update_util_hook - Populate the CPU's update_util_data pointer. | 13 | * cpufreq_add_update_util_hook - Populate the CPU's update_util_data pointer. |
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 2efe629425be..962cf343f798 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c | |||
@@ -13,6 +13,8 @@ | |||
13 | #include <linux/sched/cpufreq.h> | 13 | #include <linux/sched/cpufreq.h> |
14 | #include <trace/events/power.h> | 14 | #include <trace/events/power.h> |
15 | 15 | ||
16 | #define IOWAIT_BOOST_MIN (SCHED_CAPACITY_SCALE / 8) | ||
17 | |||
16 | struct sugov_tunables { | 18 | struct sugov_tunables { |
17 | struct gov_attr_set attr_set; | 19 | struct gov_attr_set attr_set; |
18 | unsigned int rate_limit_us; | 20 | unsigned int rate_limit_us; |
@@ -48,7 +50,6 @@ struct sugov_cpu { | |||
48 | 50 | ||
49 | bool iowait_boost_pending; | 51 | bool iowait_boost_pending; |
50 | unsigned int iowait_boost; | 52 | unsigned int iowait_boost; |
51 | unsigned int iowait_boost_max; | ||
52 | u64 last_update; | 53 | u64 last_update; |
53 | 54 | ||
54 | unsigned long bw_dl; | 55 | unsigned long bw_dl; |
@@ -291,8 +292,8 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) | |||
291 | * | 292 | * |
292 | * The IO wait boost of a task is disabled after a tick since the last update | 293 | * The IO wait boost of a task is disabled after a tick since the last update |
293 | * of a CPU. If a new IO wait boost is requested after more then a tick, then | 294 | * of a CPU. If a new IO wait boost is requested after more then a tick, then |
294 | * we enable the boost starting from the minimum frequency, which improves | 295 | * we enable the boost starting from IOWAIT_BOOST_MIN, which improves energy |
295 | * energy efficiency by ignoring sporadic wakeups from IO. | 296 | * efficiency by ignoring sporadic wakeups from IO. |
296 | */ | 297 | */ |
297 | static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time, | 298 | static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time, |
298 | bool set_iowait_boost) | 299 | bool set_iowait_boost) |
@@ -303,8 +304,7 @@ static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time, | |||
303 | if (delta_ns <= TICK_NSEC) | 304 | if (delta_ns <= TICK_NSEC) |
304 | return false; | 305 | return false; |
305 | 306 | ||
306 | sg_cpu->iowait_boost = set_iowait_boost | 307 | sg_cpu->iowait_boost = set_iowait_boost ? IOWAIT_BOOST_MIN : 0; |
307 | ? sg_cpu->sg_policy->policy->min : 0; | ||
308 | sg_cpu->iowait_boost_pending = set_iowait_boost; | 308 | sg_cpu->iowait_boost_pending = set_iowait_boost; |
309 | 309 | ||
310 | return true; | 310 | return true; |
@@ -318,8 +318,9 @@ static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time, | |||
318 | * | 318 | * |
319 | * Each time a task wakes up after an IO operation, the CPU utilization can be | 319 | * Each time a task wakes up after an IO operation, the CPU utilization can be |
320 | * boosted to a certain utilization which doubles at each "frequent and | 320 | * boosted to a certain utilization which doubles at each "frequent and |
321 | * successive" wakeup from IO, ranging from the utilization of the minimum | 321 | * successive" wakeup from IO, ranging from IOWAIT_BOOST_MIN to the utilization |
322 | * OPP to the utilization of the maximum OPP. | 322 | * of the maximum OPP. |
323 | * | ||
323 | * To keep doubling, an IO boost has to be requested at least once per tick, | 324 | * To keep doubling, an IO boost has to be requested at least once per tick, |
324 | * otherwise we restart from the utilization of the minimum OPP. | 325 | * otherwise we restart from the utilization of the minimum OPP. |
325 | */ | 326 | */ |
@@ -344,14 +345,13 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, | |||
344 | 345 | ||
345 | /* Double the boost at each request */ | 346 | /* Double the boost at each request */ |
346 | if (sg_cpu->iowait_boost) { | 347 | if (sg_cpu->iowait_boost) { |
347 | sg_cpu->iowait_boost <<= 1; | 348 | sg_cpu->iowait_boost = |
348 | if (sg_cpu->iowait_boost > sg_cpu->iowait_boost_max) | 349 | min_t(unsigned int, sg_cpu->iowait_boost << 1, SCHED_CAPACITY_SCALE); |
349 | sg_cpu->iowait_boost = sg_cpu->iowait_boost_max; | ||
350 | return; | 350 | return; |
351 | } | 351 | } |
352 | 352 | ||
353 | /* First wakeup after IO: start with minimum boost */ | 353 | /* First wakeup after IO: start with minimum boost */ |
354 | sg_cpu->iowait_boost = sg_cpu->sg_policy->policy->min; | 354 | sg_cpu->iowait_boost = IOWAIT_BOOST_MIN; |
355 | } | 355 | } |
356 | 356 | ||
357 | /** | 357 | /** |
@@ -373,47 +373,38 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, | |||
373 | * This mechanism is designed to boost high frequently IO waiting tasks, while | 373 | * This mechanism is designed to boost high frequently IO waiting tasks, while |
374 | * being more conservative on tasks which does sporadic IO operations. | 374 | * being more conservative on tasks which does sporadic IO operations. |
375 | */ | 375 | */ |
376 | static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, | 376 | static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, |
377 | unsigned long *util, unsigned long *max) | 377 | unsigned long util, unsigned long max) |
378 | { | 378 | { |
379 | unsigned int boost_util, boost_max; | 379 | unsigned long boost; |
380 | 380 | ||
381 | /* No boost currently required */ | 381 | /* No boost currently required */ |
382 | if (!sg_cpu->iowait_boost) | 382 | if (!sg_cpu->iowait_boost) |
383 | return; | 383 | return util; |
384 | 384 | ||
385 | /* Reset boost if the CPU appears to have been idle enough */ | 385 | /* Reset boost if the CPU appears to have been idle enough */ |
386 | if (sugov_iowait_reset(sg_cpu, time, false)) | 386 | if (sugov_iowait_reset(sg_cpu, time, false)) |
387 | return; | 387 | return util; |
388 | 388 | ||
389 | /* | 389 | if (!sg_cpu->iowait_boost_pending) { |
390 | * An IO waiting task has just woken up: | ||
391 | * allow to further double the boost value | ||
392 | */ | ||
393 | if (sg_cpu->iowait_boost_pending) { | ||
394 | sg_cpu->iowait_boost_pending = false; | ||
395 | } else { | ||
396 | /* | 390 | /* |
397 | * Otherwise: reduce the boost value and disable it when we | 391 | * No boost pending; reduce the boost value. |
398 | * reach the minimum. | ||
399 | */ | 392 | */ |
400 | sg_cpu->iowait_boost >>= 1; | 393 | sg_cpu->iowait_boost >>= 1; |
401 | if (sg_cpu->iowait_boost < sg_cpu->sg_policy->policy->min) { | 394 | if (sg_cpu->iowait_boost < IOWAIT_BOOST_MIN) { |
402 | sg_cpu->iowait_boost = 0; | 395 | sg_cpu->iowait_boost = 0; |
403 | return; | 396 | return util; |
404 | } | 397 | } |
405 | } | 398 | } |
406 | 399 | ||
400 | sg_cpu->iowait_boost_pending = false; | ||
401 | |||
407 | /* | 402 | /* |
408 | * Apply the current boost value: a CPU is boosted only if its current | 403 | * @util is already in capacity scale; convert iowait_boost |
409 | * utilization is smaller then the current IO boost level. | 404 | * into the same scale so we can compare. |
410 | */ | 405 | */ |
411 | boost_util = sg_cpu->iowait_boost; | 406 | boost = (sg_cpu->iowait_boost * max) >> SCHED_CAPACITY_SHIFT; |
412 | boost_max = sg_cpu->iowait_boost_max; | 407 | return max(boost, util); |
413 | if (*util * boost_max < *max * boost_util) { | ||
414 | *util = boost_util; | ||
415 | *max = boost_max; | ||
416 | } | ||
417 | } | 408 | } |
418 | 409 | ||
419 | #ifdef CONFIG_NO_HZ_COMMON | 410 | #ifdef CONFIG_NO_HZ_COMMON |
@@ -460,7 +451,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, | |||
460 | 451 | ||
461 | util = sugov_get_util(sg_cpu); | 452 | util = sugov_get_util(sg_cpu); |
462 | max = sg_cpu->max; | 453 | max = sg_cpu->max; |
463 | sugov_iowait_apply(sg_cpu, time, &util, &max); | 454 | util = sugov_iowait_apply(sg_cpu, time, util, max); |
464 | next_f = get_next_freq(sg_policy, util, max); | 455 | next_f = get_next_freq(sg_policy, util, max); |
465 | /* | 456 | /* |
466 | * Do not reduce the frequency if the CPU has not been idle | 457 | * Do not reduce the frequency if the CPU has not been idle |
@@ -500,7 +491,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) | |||
500 | 491 | ||
501 | j_util = sugov_get_util(j_sg_cpu); | 492 | j_util = sugov_get_util(j_sg_cpu); |
502 | j_max = j_sg_cpu->max; | 493 | j_max = j_sg_cpu->max; |
503 | sugov_iowait_apply(j_sg_cpu, time, &j_util, &j_max); | 494 | j_util = sugov_iowait_apply(j_sg_cpu, time, j_util, j_max); |
504 | 495 | ||
505 | if (j_util * max > j_max * util) { | 496 | if (j_util * max > j_max * util) { |
506 | util = j_util; | 497 | util = j_util; |
@@ -609,13 +600,14 @@ rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, size_t count | |||
609 | 600 | ||
610 | static struct governor_attr rate_limit_us = __ATTR_RW(rate_limit_us); | 601 | static struct governor_attr rate_limit_us = __ATTR_RW(rate_limit_us); |
611 | 602 | ||
612 | static struct attribute *sugov_attributes[] = { | 603 | static struct attribute *sugov_attrs[] = { |
613 | &rate_limit_us.attr, | 604 | &rate_limit_us.attr, |
614 | NULL | 605 | NULL |
615 | }; | 606 | }; |
607 | ATTRIBUTE_GROUPS(sugov); | ||
616 | 608 | ||
617 | static struct kobj_type sugov_tunables_ktype = { | 609 | static struct kobj_type sugov_tunables_ktype = { |
618 | .default_attrs = sugov_attributes, | 610 | .default_groups = sugov_groups, |
619 | .sysfs_ops = &governor_sysfs_ops, | 611 | .sysfs_ops = &governor_sysfs_ops, |
620 | }; | 612 | }; |
621 | 613 | ||
@@ -782,6 +774,7 @@ out: | |||
782 | return 0; | 774 | return 0; |
783 | 775 | ||
784 | fail: | 776 | fail: |
777 | kobject_put(&tunables->attr_set.kobj); | ||
785 | policy->governor_data = NULL; | 778 | policy->governor_data = NULL; |
786 | sugov_tunables_free(tunables); | 779 | sugov_tunables_free(tunables); |
787 | 780 | ||
@@ -837,7 +830,6 @@ static int sugov_start(struct cpufreq_policy *policy) | |||
837 | memset(sg_cpu, 0, sizeof(*sg_cpu)); | 830 | memset(sg_cpu, 0, sizeof(*sg_cpu)); |
838 | sg_cpu->cpu = cpu; | 831 | sg_cpu->cpu = cpu; |
839 | sg_cpu->sg_policy = sg_policy; | 832 | sg_cpu->sg_policy = sg_policy; |
840 | sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; | ||
841 | } | 833 | } |
842 | 834 | ||
843 | for_each_cpu(cpu, policy->cpus) { | 835 | for_each_cpu(cpu, policy->cpus) { |
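
A condensed userspace sketch of the reworked iowait boost bookkeeping above, assuming SCHED_CAPACITY_SCALE is 1024 as in the kernel: the boost now lives in capacity scale, doubling per request up to full capacity, halving when no request is pending, and being compared against utilization only after scaling by the CPU's max capacity. This is a simplification of sugov_iowait_boost()/sugov_iowait_apply(), not the functions themselves:

#include <stdio.h>

#define SCHED_CAPACITY_SHIFT	10
#define SCHED_CAPACITY_SCALE	(1UL << SCHED_CAPACITY_SHIFT)
#define IOWAIT_BOOST_MIN	(SCHED_CAPACITY_SCALE / 8)

static unsigned long boost;	/* stand-in for sg_cpu->iowait_boost */

/* A task woke from IO: start at IOWAIT_BOOST_MIN, then double per request. */
static void iowait_boost_request(void)
{
	if (!boost) {
		boost = IOWAIT_BOOST_MIN;	/* first wakeup after IO */
		return;
	}
	boost <<= 1;
	if (boost > SCHED_CAPACITY_SCALE)
		boost = SCHED_CAPACITY_SCALE;
}

/* No pending request: decay, and drop the boost once below the minimum. */
static void iowait_boost_decay(void)
{
	boost >>= 1;
	if (boost < IOWAIT_BOOST_MIN)
		boost = 0;
}

/* Apply the boost: scale it by the CPU's max capacity and take the
 * larger of the boosted value and the measured utilization. */
static unsigned long iowait_boost_apply(unsigned long util, unsigned long max)
{
	unsigned long b = (boost * max) >> SCHED_CAPACITY_SHIFT;

	return b > util ? b : util;
}

int main(void)
{
	unsigned long util = 100, max = 1024;
	int i;

	for (i = 0; i < 5; i++) {
		iowait_boost_request();
		printf("request %d: boost=%lu effective util=%lu\n",
		       i, boost, iowait_boost_apply(util, max));
	}
	for (i = 0; i < 5; i++) {
		iowait_boost_decay();
		printf("decay %d: boost=%lu\n", i, boost);
	}
	return 0;
}
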
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 6a73e41a2016..43901fa3f269 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c | |||
@@ -252,7 +252,6 @@ static void task_non_contending(struct task_struct *p) | |||
252 | if (dl_entity_is_special(dl_se)) | 252 | if (dl_entity_is_special(dl_se)) |
253 | return; | 253 | return; |
254 | 254 | ||
255 | WARN_ON(hrtimer_active(&dl_se->inactive_timer)); | ||
256 | WARN_ON(dl_se->dl_non_contending); | 255 | WARN_ON(dl_se->dl_non_contending); |
257 | 256 | ||
258 | zerolag_time = dl_se->deadline - | 257 | zerolag_time = dl_se->deadline - |
@@ -269,7 +268,7 @@ static void task_non_contending(struct task_struct *p) | |||
269 | * If the "0-lag time" already passed, decrease the active | 268 | * If the "0-lag time" already passed, decrease the active |
270 | * utilization now, instead of starting a timer | 269 | * utilization now, instead of starting a timer |
271 | */ | 270 | */ |
272 | if (zerolag_time < 0) { | 271 | if ((zerolag_time < 0) || hrtimer_active(&dl_se->inactive_timer)) { |
273 | if (dl_task(p)) | 272 | if (dl_task(p)) |
274 | sub_running_bw(dl_se, dl_rq); | 273 | sub_running_bw(dl_se, dl_rq); |
275 | if (!dl_task(p) || p->state == TASK_DEAD) { | 274 | if (!dl_task(p) || p->state == TASK_DEAD) { |
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 8039d62ae36e..678bfb9bd87f 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c | |||
@@ -702,7 +702,7 @@ do { \ | |||
702 | 702 | ||
703 | static const char *sched_tunable_scaling_names[] = { | 703 | static const char *sched_tunable_scaling_names[] = { |
704 | "none", | 704 | "none", |
705 | "logaritmic", | 705 | "logarithmic", |
706 | "linear" | 706 | "linear" |
707 | }; | 707 | }; |
708 | 708 | ||
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ea74d43924b2..f35930f5e528 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
@@ -2007,6 +2007,10 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) | |||
2007 | if (p->last_task_numa_placement) { | 2007 | if (p->last_task_numa_placement) { |
2008 | delta = runtime - p->last_sum_exec_runtime; | 2008 | delta = runtime - p->last_sum_exec_runtime; |
2009 | *period = now - p->last_task_numa_placement; | 2009 | *period = now - p->last_task_numa_placement; |
2010 | |||
2011 | /* Avoid time going backwards, prevent potential divide error: */ | ||
2012 | if (unlikely((s64)*period < 0)) | ||
2013 | *period = 0; | ||
2010 | } else { | 2014 | } else { |
2011 | delta = p->se.avg.load_sum; | 2015 | delta = p->se.avg.load_sum; |
2012 | *period = LOAD_AVG_MAX; | 2016 | *period = LOAD_AVG_MAX; |
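
The clamp added above protects a later division when the measured period goes backwards. A tiny standalone illustration of why the unsigned difference has to be checked as a signed value:

#include <stdint.h>
#include <stdio.h>

/* now < last: the unsigned difference wraps to a huge value, but viewed
 * as a signed 64-bit number it is negative, which is what the new check
 * in numa_get_avg_runtime() detects before *period is used as a divisor. */
int main(void)
{
	uint64_t last = 1000, now = 900;
	uint64_t period = now - last;

	printf("raw period     = %llu\n", (unsigned long long)period);
	if ((int64_t)period < 0)
		period = 0;
	printf("clamped period = %llu\n", (unsigned long long)period);
	return 0;
}
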
@@ -2593,7 +2597,7 @@ out: | |||
2593 | /* | 2597 | /* |
2594 | * Drive the periodic memory faults.. | 2598 | * Drive the periodic memory faults.. |
2595 | */ | 2599 | */ |
2596 | void task_tick_numa(struct rq *rq, struct task_struct *curr) | 2600 | static void task_tick_numa(struct rq *rq, struct task_struct *curr) |
2597 | { | 2601 | { |
2598 | struct callback_head *work = &curr->numa_work; | 2602 | struct callback_head *work = &curr->numa_work; |
2599 | u64 period, now; | 2603 | u64 period, now; |
@@ -3567,7 +3571,7 @@ static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) | |||
3567 | * Synchronize entity load avg of dequeued entity without locking | 3571 | * Synchronize entity load avg of dequeued entity without locking |
3568 | * the previous rq. | 3572 | * the previous rq. |
3569 | */ | 3573 | */ |
3570 | void sync_entity_load_avg(struct sched_entity *se) | 3574 | static void sync_entity_load_avg(struct sched_entity *se) |
3571 | { | 3575 | { |
3572 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 3576 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
3573 | u64 last_update_time; | 3577 | u64 last_update_time; |
@@ -3580,7 +3584,7 @@ void sync_entity_load_avg(struct sched_entity *se) | |||
3580 | * Task first catches up with cfs_rq, and then subtract | 3584 | * Task first catches up with cfs_rq, and then subtract |
3581 | * itself from the cfs_rq (task must be off the queue now). | 3585 | * itself from the cfs_rq (task must be off the queue now). |
3582 | */ | 3586 | */ |
3583 | void remove_entity_load_avg(struct sched_entity *se) | 3587 | static void remove_entity_load_avg(struct sched_entity *se) |
3584 | { | 3588 | { |
3585 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 3589 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
3586 | unsigned long flags; | 3590 | unsigned long flags; |
@@ -4885,6 +4889,8 @@ static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) | |||
4885 | return HRTIMER_NORESTART; | 4889 | return HRTIMER_NORESTART; |
4886 | } | 4890 | } |
4887 | 4891 | ||
4892 | extern const u64 max_cfs_quota_period; | ||
4893 | |||
4888 | static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) | 4894 | static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) |
4889 | { | 4895 | { |
4890 | struct cfs_bandwidth *cfs_b = | 4896 | struct cfs_bandwidth *cfs_b = |
@@ -4892,6 +4898,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) | |||
4892 | unsigned long flags; | 4898 | unsigned long flags; |
4893 | int overrun; | 4899 | int overrun; |
4894 | int idle = 0; | 4900 | int idle = 0; |
4901 | int count = 0; | ||
4895 | 4902 | ||
4896 | raw_spin_lock_irqsave(&cfs_b->lock, flags); | 4903 | raw_spin_lock_irqsave(&cfs_b->lock, flags); |
4897 | for (;;) { | 4904 | for (;;) { |
@@ -4899,6 +4906,28 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) | |||
4899 | if (!overrun) | 4906 | if (!overrun) |
4900 | break; | 4907 | break; |
4901 | 4908 | ||
4909 | if (++count > 3) { | ||
4910 | u64 new, old = ktime_to_ns(cfs_b->period); | ||
4911 | |||
4912 | new = (old * 147) / 128; /* ~115% */ | ||
4913 | new = min(new, max_cfs_quota_period); | ||
4914 | |||
4915 | cfs_b->period = ns_to_ktime(new); | ||
4916 | |||
4917 | /* since max is 1s, this is limited to 1e9^2, which fits in u64 */ | ||
4918 | cfs_b->quota *= new; | ||
4919 | cfs_b->quota = div64_u64(cfs_b->quota, old); | ||
4920 | |||
4921 | pr_warn_ratelimited( | ||
4922 | "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us %lld, cfs_quota_us = %lld)\n", | ||
4923 | smp_processor_id(), | ||
4924 | div_u64(new, NSEC_PER_USEC), | ||
4925 | div_u64(cfs_b->quota, NSEC_PER_USEC)); | ||
4926 | |||
4927 | /* reset count so we don't come right back in here */ | ||
4928 | count = 0; | ||
4929 | } | ||
4930 | |||
4902 | idle = do_sched_cfs_period_timer(cfs_b, overrun, flags); | 4931 | idle = do_sched_cfs_period_timer(cfs_b, overrun, flags); |
4903 | } | 4932 | } |
4904 | if (idle) | 4933 | if (idle) |
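
When the period timer keeps overrunning, the hunk above grows the period by roughly 15% (old * 147 / 128, capped at max_cfs_quota_period, which is 1 s) and scales the quota by the same factor so the configured bandwidth ratio is preserved. A small userspace sketch of that arithmetic, not the kernel path itself:

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC	1000000000ULL
#define MAX_PERIOD_NS	(1 * NSEC_PER_SEC)	/* max_cfs_quota_period */

/* Grow the period by ~115% (147/128) and scale the quota by the same
 * factor, mirroring the sched_cfs_period_timer() escape hatch above. */
static void scale_bandwidth(uint64_t *period, uint64_t *quota)
{
	uint64_t old = *period;
	uint64_t new = old * 147 / 128;

	if (new > MAX_PERIOD_NS)
		new = MAX_PERIOD_NS;

	*quota = *quota * new / old;	/* keep quota/period constant */
	*period = new;
}

int main(void)
{
	uint64_t period = 100000;	/* 100 us, in ns */
	uint64_t quota = 50000;		/*  50 us, in ns */
	int i;

	for (i = 0; i < 3; i++) {
		scale_bandwidth(&period, &quota);
		printf("step %d: period=%llu ns quota=%llu ns (ratio %.2f)\n",
		       i, (unsigned long long)period,
		       (unsigned long long)quota, (double)quota / period);
	}
	return 0;
}
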
@@ -5116,7 +5145,6 @@ static inline void hrtick_update(struct rq *rq) | |||
5116 | 5145 | ||
5117 | #ifdef CONFIG_SMP | 5146 | #ifdef CONFIG_SMP |
5118 | static inline unsigned long cpu_util(int cpu); | 5147 | static inline unsigned long cpu_util(int cpu); |
5119 | static unsigned long capacity_of(int cpu); | ||
5120 | 5148 | ||
5121 | static inline bool cpu_overutilized(int cpu) | 5149 | static inline bool cpu_overutilized(int cpu) |
5122 | { | 5150 | { |
@@ -7492,7 +7520,6 @@ static void detach_task(struct task_struct *p, struct lb_env *env) | |||
7492 | { | 7520 | { |
7493 | lockdep_assert_held(&env->src_rq->lock); | 7521 | lockdep_assert_held(&env->src_rq->lock); |
7494 | 7522 | ||
7495 | p->on_rq = TASK_ON_RQ_MIGRATING; | ||
7496 | deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK); | 7523 | deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK); |
7497 | set_task_cpu(p, env->dst_cpu); | 7524 | set_task_cpu(p, env->dst_cpu); |
7498 | } | 7525 | } |
@@ -7628,7 +7655,6 @@ static void attach_task(struct rq *rq, struct task_struct *p) | |||
7628 | 7655 | ||
7629 | BUG_ON(task_rq(p) != rq); | 7656 | BUG_ON(task_rq(p) != rq); |
7630 | activate_task(rq, p, ENQUEUE_NOCLOCK); | 7657 | activate_task(rq, p, ENQUEUE_NOCLOCK); |
7631 | p->on_rq = TASK_ON_RQ_QUEUED; | ||
7632 | check_preempt_curr(rq, p, 0); | 7658 | check_preempt_curr(rq, p, 0); |
7633 | } | 7659 | } |
7634 | 7660 | ||
@@ -7784,10 +7810,10 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq) | |||
7784 | if (cfs_rq->last_h_load_update == now) | 7810 | if (cfs_rq->last_h_load_update == now) |
7785 | return; | 7811 | return; |
7786 | 7812 | ||
7787 | cfs_rq->h_load_next = NULL; | 7813 | WRITE_ONCE(cfs_rq->h_load_next, NULL); |
7788 | for_each_sched_entity(se) { | 7814 | for_each_sched_entity(se) { |
7789 | cfs_rq = cfs_rq_of(se); | 7815 | cfs_rq = cfs_rq_of(se); |
7790 | cfs_rq->h_load_next = se; | 7816 | WRITE_ONCE(cfs_rq->h_load_next, se); |
7791 | if (cfs_rq->last_h_load_update == now) | 7817 | if (cfs_rq->last_h_load_update == now) |
7792 | break; | 7818 | break; |
7793 | } | 7819 | } |
@@ -7797,7 +7823,7 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq) | |||
7797 | cfs_rq->last_h_load_update = now; | 7823 | cfs_rq->last_h_load_update = now; |
7798 | } | 7824 | } |
7799 | 7825 | ||
7800 | while ((se = cfs_rq->h_load_next) != NULL) { | 7826 | while ((se = READ_ONCE(cfs_rq->h_load_next)) != NULL) { |
7801 | load = cfs_rq->h_load; | 7827 | load = cfs_rq->h_load; |
7802 | load = div64_ul(load * se->avg.load_avg, | 7828 | load = div64_ul(load * se->avg.load_avg, |
7803 | cfs_rq_load_avg(cfs_rq) + 1); | 7829 | cfs_rq_load_avg(cfs_rq) + 1); |
@@ -8060,6 +8086,18 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd) | |||
8060 | } | 8086 | } |
8061 | 8087 | ||
8062 | /* | 8088 | /* |
8089 | * Check whether a rq has a misfit task and if it looks like we can actually | ||
8090 | * help that task: we can migrate the task to a CPU of higher capacity, or | ||
8091 | * the task's current CPU is heavily pressured. | ||
8092 | */ | ||
8093 | static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd) | ||
8094 | { | ||
8095 | return rq->misfit_task_load && | ||
8096 | (rq->cpu_capacity_orig < rq->rd->max_cpu_capacity || | ||
8097 | check_cpu_capacity(rq, sd)); | ||
8098 | } | ||
8099 | |||
8100 | /* | ||
8063 | * Group imbalance indicates (and tries to solve) the problem where balancing | 8101 | * Group imbalance indicates (and tries to solve) the problem where balancing |
8064 | * groups is inadequate due to ->cpus_allowed constraints. | 8102 | * groups is inadequate due to ->cpus_allowed constraints. |
8065 | * | 8103 | * |
@@ -9510,22 +9548,26 @@ static inline int on_null_domain(struct rq *rq) | |||
9510 | * - When one of the busy CPUs notice that there may be an idle rebalancing | 9548 | * - When one of the busy CPUs notice that there may be an idle rebalancing |
9511 | * needed, they will kick the idle load balancer, which then does idle | 9549 | * needed, they will kick the idle load balancer, which then does idle |
9512 | * load balancing for all the idle CPUs. | 9550 | * load balancing for all the idle CPUs. |
9551 | * - HK_FLAG_MISC CPUs are used for this task, because HK_FLAG_SCHED not set | ||
9552 | * anywhere yet. | ||
9513 | */ | 9553 | */ |
9514 | 9554 | ||
9515 | static inline int find_new_ilb(void) | 9555 | static inline int find_new_ilb(void) |
9516 | { | 9556 | { |
9517 | int ilb = cpumask_first(nohz.idle_cpus_mask); | 9557 | int ilb; |
9518 | 9558 | ||
9519 | if (ilb < nr_cpu_ids && idle_cpu(ilb)) | 9559 | for_each_cpu_and(ilb, nohz.idle_cpus_mask, |
9520 | return ilb; | 9560 | housekeeping_cpumask(HK_FLAG_MISC)) { |
9561 | if (idle_cpu(ilb)) | ||
9562 | return ilb; | ||
9563 | } | ||
9521 | 9564 | ||
9522 | return nr_cpu_ids; | 9565 | return nr_cpu_ids; |
9523 | } | 9566 | } |
9524 | 9567 | ||
9525 | /* | 9568 | /* |
9526 | * Kick a CPU to do the nohz balancing, if it is time for it. We pick the | 9569 | * Kick a CPU to do the nohz balancing, if it is time for it. We pick any |
9527 | * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle | 9570 | * idle CPU in the HK_FLAG_MISC housekeeping set (if there is one). |
9528 | * CPU (if there is one). | ||
9529 | */ | 9571 | */ |
9530 | static void kick_ilb(unsigned int flags) | 9572 | static void kick_ilb(unsigned int flags) |
9531 | { | 9573 | { |
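
find_new_ilb() now only considers idle CPUs that are also HK_FLAG_MISC housekeeping CPUs. A minimal bitmask sketch of that intersection (plain C, with made-up masks standing in for nohz.idle_cpus_mask and housekeeping_cpumask()):

#include <stdio.h>

#define NR_CPUS 8

/* Pick the first CPU that is both idle and allowed to do housekeeping
 * work -- the same intersection the reworked find_new_ilb() walks. */
static int pick_ilb(unsigned int idle_mask, unsigned int housekeeping_mask)
{
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		if ((idle_mask & housekeeping_mask) & (1u << cpu))
			return cpu;
	}
	return NR_CPUS;	/* no suitable CPU, like returning nr_cpu_ids */
}

int main(void)
{
	unsigned int idle = 0x0b;		/* CPUs 0, 1, 3 are idle      */
	unsigned int housekeeping = 0xfc;	/* CPUs 0, 1 are isolated     */

	printf("ilb CPU = %d\n", pick_ilb(idle, housekeeping));	/* prints 3 */
	return 0;
}
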
@@ -9586,35 +9628,21 @@ static void nohz_balancer_kick(struct rq *rq) | |||
9586 | if (time_before(now, nohz.next_balance)) | 9628 | if (time_before(now, nohz.next_balance)) |
9587 | goto out; | 9629 | goto out; |
9588 | 9630 | ||
9589 | if (rq->nr_running >= 2 || rq->misfit_task_load) { | 9631 | if (rq->nr_running >= 2) { |
9590 | flags = NOHZ_KICK_MASK; | 9632 | flags = NOHZ_KICK_MASK; |
9591 | goto out; | 9633 | goto out; |
9592 | } | 9634 | } |
9593 | 9635 | ||
9594 | rcu_read_lock(); | 9636 | rcu_read_lock(); |
9595 | sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); | ||
9596 | if (sds) { | ||
9597 | /* | ||
9598 | * If there is an imbalance between LLC domains (IOW we could | ||
9599 | * increase the overall cache use), we need some less-loaded LLC | ||
9600 | * domain to pull some load. Likewise, we may need to spread | ||
9601 | * load within the current LLC domain (e.g. packed SMT cores but | ||
9602 | * other CPUs are idle). We can't really know from here how busy | ||
9603 | * the others are - so just get a nohz balance going if it looks | ||
9604 | * like this LLC domain has tasks we could move. | ||
9605 | */ | ||
9606 | nr_busy = atomic_read(&sds->nr_busy_cpus); | ||
9607 | if (nr_busy > 1) { | ||
9608 | flags = NOHZ_KICK_MASK; | ||
9609 | goto unlock; | ||
9610 | } | ||
9611 | |||
9612 | } | ||
9613 | 9637 | ||
9614 | sd = rcu_dereference(rq->sd); | 9638 | sd = rcu_dereference(rq->sd); |
9615 | if (sd) { | 9639 | if (sd) { |
9616 | if ((rq->cfs.h_nr_running >= 1) && | 9640 | /* |
9617 | check_cpu_capacity(rq, sd)) { | 9641 | * If there's a CFS task and the current CPU has reduced |
9642 | * capacity; kick the ILB to see if there's a better CPU to run | ||
9643 | * on. | ||
9644 | */ | ||
9645 | if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) { | ||
9618 | flags = NOHZ_KICK_MASK; | 9646 | flags = NOHZ_KICK_MASK; |
9619 | goto unlock; | 9647 | goto unlock; |
9620 | } | 9648 | } |
@@ -9622,6 +9650,11 @@ static void nohz_balancer_kick(struct rq *rq) | |||
9622 | 9650 | ||
9623 | sd = rcu_dereference(per_cpu(sd_asym_packing, cpu)); | 9651 | sd = rcu_dereference(per_cpu(sd_asym_packing, cpu)); |
9624 | if (sd) { | 9652 | if (sd) { |
9653 | /* | ||
9654 | * When ASYM_PACKING; see if there's a more preferred CPU | ||
9655 | * currently idle; in which case, kick the ILB to move tasks | ||
9656 | * around. | ||
9657 | */ | ||
9625 | for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) { | 9658 | for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) { |
9626 | if (sched_asym_prefer(i, cpu)) { | 9659 | if (sched_asym_prefer(i, cpu)) { |
9627 | flags = NOHZ_KICK_MASK; | 9660 | flags = NOHZ_KICK_MASK; |
@@ -9629,6 +9662,45 @@ static void nohz_balancer_kick(struct rq *rq) | |||
9629 | } | 9662 | } |
9630 | } | 9663 | } |
9631 | } | 9664 | } |
9665 | |||
9666 | sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu)); | ||
9667 | if (sd) { | ||
9668 | /* | ||
9669 | * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU | ||
9670 | * to run the misfit task on. | ||
9671 | */ | ||
9672 | if (check_misfit_status(rq, sd)) { | ||
9673 | flags = NOHZ_KICK_MASK; | ||
9674 | goto unlock; | ||
9675 | } | ||
9676 | |||
9677 | /* | ||
9678 | * For asymmetric systems, we do not want to nicely balance | ||
9679 | * cache use, instead we want to embrace asymmetry and only | ||
9680 | * ensure tasks have enough CPU capacity. | ||
9681 | * | ||
9682 | * Skip the LLC logic because it's not relevant in that case. | ||
9683 | */ | ||
9684 | goto unlock; | ||
9685 | } | ||
9686 | |||
9687 | sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); | ||
9688 | if (sds) { | ||
9689 | /* | ||
9690 | * If there is an imbalance between LLC domains (IOW we could | ||
9691 | * increase the overall cache use), we need some less-loaded LLC | ||
9692 | * domain to pull some load. Likewise, we may need to spread | ||
9693 | * load within the current LLC domain (e.g. packed SMT cores but | ||
9694 | * other CPUs are idle). We can't really know from here how busy | ||
9695 | * the others are - so just get a nohz balance going if it looks | ||
9696 | * like this LLC domain has tasks we could move. | ||
9697 | */ | ||
9698 | nr_busy = atomic_read(&sds->nr_busy_cpus); | ||
9699 | if (nr_busy > 1) { | ||
9700 | flags = NOHZ_KICK_MASK; | ||
9701 | goto unlock; | ||
9702 | } | ||
9703 | } | ||
9632 | unlock: | 9704 | unlock: |
9633 | rcu_read_unlock(); | 9705 | rcu_read_unlock(); |
9634 | out: | 9706 | out: |
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index b02d148e7672..687302051a27 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c | |||
@@ -65,6 +65,7 @@ void __init housekeeping_init(void) | |||
65 | static int __init housekeeping_setup(char *str, enum hk_flags flags) | 65 | static int __init housekeeping_setup(char *str, enum hk_flags flags) |
66 | { | 66 | { |
67 | cpumask_var_t non_housekeeping_mask; | 67 | cpumask_var_t non_housekeeping_mask; |
68 | cpumask_var_t tmp; | ||
68 | int err; | 69 | int err; |
69 | 70 | ||
70 | alloc_bootmem_cpumask_var(&non_housekeeping_mask); | 71 | alloc_bootmem_cpumask_var(&non_housekeeping_mask); |
@@ -75,16 +76,23 @@ static int __init housekeeping_setup(char *str, enum hk_flags flags) | |||
75 | return 0; | 76 | return 0; |
76 | } | 77 | } |
77 | 78 | ||
79 | alloc_bootmem_cpumask_var(&tmp); | ||
78 | if (!housekeeping_flags) { | 80 | if (!housekeeping_flags) { |
79 | alloc_bootmem_cpumask_var(&housekeeping_mask); | 81 | alloc_bootmem_cpumask_var(&housekeeping_mask); |
80 | cpumask_andnot(housekeeping_mask, | 82 | cpumask_andnot(housekeeping_mask, |
81 | cpu_possible_mask, non_housekeeping_mask); | 83 | cpu_possible_mask, non_housekeeping_mask); |
82 | if (cpumask_empty(housekeeping_mask)) | 84 | |
85 | cpumask_andnot(tmp, cpu_present_mask, non_housekeeping_mask); | ||
86 | if (cpumask_empty(tmp)) { | ||
87 | pr_warn("Housekeeping: must include one present CPU, " | ||
88 | "using boot CPU:%d\n", smp_processor_id()); | ||
83 | __cpumask_set_cpu(smp_processor_id(), housekeeping_mask); | 89 | __cpumask_set_cpu(smp_processor_id(), housekeeping_mask); |
90 | __cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask); | ||
91 | } | ||
84 | } else { | 92 | } else { |
85 | cpumask_var_t tmp; | 93 | cpumask_andnot(tmp, cpu_present_mask, non_housekeeping_mask); |
86 | 94 | if (cpumask_empty(tmp)) | |
87 | alloc_bootmem_cpumask_var(&tmp); | 95 | __cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask); |
88 | cpumask_andnot(tmp, cpu_possible_mask, non_housekeeping_mask); | 96 | cpumask_andnot(tmp, cpu_possible_mask, non_housekeeping_mask); |
89 | if (!cpumask_equal(tmp, housekeeping_mask)) { | 97 | if (!cpumask_equal(tmp, housekeeping_mask)) { |
90 | pr_warn("Housekeeping: nohz_full= must match isolcpus=\n"); | 98 | pr_warn("Housekeeping: nohz_full= must match isolcpus=\n"); |
@@ -92,8 +100,8 @@ static int __init housekeeping_setup(char *str, enum hk_flags flags) | |||
92 | free_bootmem_cpumask_var(non_housekeeping_mask); | 100 | free_bootmem_cpumask_var(non_housekeeping_mask); |
93 | return 0; | 101 | return 0; |
94 | } | 102 | } |
95 | free_bootmem_cpumask_var(tmp); | ||
96 | } | 103 | } |
104 | free_bootmem_cpumask_var(tmp); | ||
97 | 105 | ||
98 | if ((flags & HK_FLAG_TICK) && !(housekeeping_flags & HK_FLAG_TICK)) { | 106 | if ((flags & HK_FLAG_TICK) && !(housekeeping_flags & HK_FLAG_TICK)) { |
99 | if (IS_ENABLED(CONFIG_NO_HZ_FULL)) { | 107 | if (IS_ENABLED(CONFIG_NO_HZ_FULL)) { |
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 90fa23d36565..1e6b909dca36 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
@@ -2555,6 +2555,8 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) | |||
2555 | rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; | 2555 | rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; |
2556 | if (rt_runtime_us < 0) | 2556 | if (rt_runtime_us < 0) |
2557 | rt_runtime = RUNTIME_INF; | 2557 | rt_runtime = RUNTIME_INF; |
2558 | else if ((u64)rt_runtime_us > U64_MAX / NSEC_PER_USEC) | ||
2559 | return -EINVAL; | ||
2558 | 2560 | ||
2559 | return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); | 2561 | return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); |
2560 | } | 2562 | } |
@@ -2575,6 +2577,9 @@ int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us) | |||
2575 | { | 2577 | { |
2576 | u64 rt_runtime, rt_period; | 2578 | u64 rt_runtime, rt_period; |
2577 | 2579 | ||
2580 | if (rt_period_us > U64_MAX / NSEC_PER_USEC) | ||
2581 | return -EINVAL; | ||
2582 | |||
2578 | rt_period = rt_period_us * NSEC_PER_USEC; | 2583 | rt_period = rt_period_us * NSEC_PER_USEC; |
2579 | rt_runtime = tg->rt_bandwidth.rt_runtime; | 2584 | rt_runtime = tg->rt_bandwidth.rt_runtime; |
2580 | 2585 | ||
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index efa686eeff26..b52ed1ada0be 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -780,7 +780,7 @@ struct root_domain { | |||
780 | * NULL-terminated list of performance domains intersecting with the | 780 | * NULL-terminated list of performance domains intersecting with the |
781 | * CPUs of the rd. Protected by RCU. | 781 | * CPUs of the rd. Protected by RCU. |
782 | */ | 782 | */ |
783 | struct perf_domain *pd; | 783 | struct perf_domain __rcu *pd; |
784 | }; | 784 | }; |
785 | 785 | ||
786 | extern struct root_domain def_root_domain; | 786 | extern struct root_domain def_root_domain; |
@@ -869,8 +869,8 @@ struct rq { | |||
869 | atomic_t nr_iowait; | 869 | atomic_t nr_iowait; |
870 | 870 | ||
871 | #ifdef CONFIG_SMP | 871 | #ifdef CONFIG_SMP |
872 | struct root_domain *rd; | 872 | struct root_domain *rd; |
873 | struct sched_domain *sd; | 873 | struct sched_domain __rcu *sd; |
874 | 874 | ||
875 | unsigned long cpu_capacity; | 875 | unsigned long cpu_capacity; |
876 | unsigned long cpu_capacity_orig; | 876 | unsigned long cpu_capacity_orig; |
@@ -1324,13 +1324,13 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) | |||
1324 | return sd; | 1324 | return sd; |
1325 | } | 1325 | } |
1326 | 1326 | ||
1327 | DECLARE_PER_CPU(struct sched_domain *, sd_llc); | 1327 | DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc); |
1328 | DECLARE_PER_CPU(int, sd_llc_size); | 1328 | DECLARE_PER_CPU(int, sd_llc_size); |
1329 | DECLARE_PER_CPU(int, sd_llc_id); | 1329 | DECLARE_PER_CPU(int, sd_llc_id); |
1330 | DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); | 1330 | DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); |
1331 | DECLARE_PER_CPU(struct sched_domain *, sd_numa); | 1331 | DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa); |
1332 | DECLARE_PER_CPU(struct sched_domain *, sd_asym_packing); | 1332 | DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); |
1333 | DECLARE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity); | 1333 | DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); |
1334 | extern struct static_key_false sched_asym_cpucapacity; | 1334 | extern struct static_key_false sched_asym_cpucapacity; |
1335 | 1335 | ||
1336 | struct sched_group_capacity { | 1336 | struct sched_group_capacity { |
@@ -2185,7 +2185,7 @@ static inline u64 irq_time_read(int cpu) | |||
2185 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ | 2185 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ |
2186 | 2186 | ||
2187 | #ifdef CONFIG_CPU_FREQ | 2187 | #ifdef CONFIG_CPU_FREQ |
2188 | DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); | 2188 | DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); |
2189 | 2189 | ||
2190 | /** | 2190 | /** |
2191 | * cpufreq_update_util - Take a note about CPU utilization changes. | 2191 | * cpufreq_update_util - Take a note about CPU utilization changes. |
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index ab7f371a3a17..f53f89df837d 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c | |||
@@ -615,13 +615,13 @@ static void destroy_sched_domains(struct sched_domain *sd) | |||
615 | * the cpumask of the domain), this allows us to quickly tell if | 615 | * the cpumask of the domain), this allows us to quickly tell if |
616 | * two CPUs are in the same cache domain, see cpus_share_cache(). | 616 | * two CPUs are in the same cache domain, see cpus_share_cache(). |
617 | */ | 617 | */ |
618 | DEFINE_PER_CPU(struct sched_domain *, sd_llc); | 618 | DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc); |
619 | DEFINE_PER_CPU(int, sd_llc_size); | 619 | DEFINE_PER_CPU(int, sd_llc_size); |
620 | DEFINE_PER_CPU(int, sd_llc_id); | 620 | DEFINE_PER_CPU(int, sd_llc_id); |
621 | DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); | 621 | DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); |
622 | DEFINE_PER_CPU(struct sched_domain *, sd_numa); | 622 | DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa); |
623 | DEFINE_PER_CPU(struct sched_domain *, sd_asym_packing); | 623 | DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); |
624 | DEFINE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity); | 624 | DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); |
625 | DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity); | 625 | DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity); |
626 | 626 | ||
627 | static void update_top_cache_domain(int cpu) | 627 | static void update_top_cache_domain(int cpu) |
@@ -1059,6 +1059,7 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd) | |||
1059 | struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); | 1059 | struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); |
1060 | struct sched_domain *child = sd->child; | 1060 | struct sched_domain *child = sd->child; |
1061 | struct sched_group *sg; | 1061 | struct sched_group *sg; |
1062 | bool already_visited; | ||
1062 | 1063 | ||
1063 | if (child) | 1064 | if (child) |
1064 | cpu = cpumask_first(sched_domain_span(child)); | 1065 | cpu = cpumask_first(sched_domain_span(child)); |
@@ -1066,9 +1067,14 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd) | |||
1066 | sg = *per_cpu_ptr(sdd->sg, cpu); | 1067 | sg = *per_cpu_ptr(sdd->sg, cpu); |
1067 | sg->sgc = *per_cpu_ptr(sdd->sgc, cpu); | 1068 | sg->sgc = *per_cpu_ptr(sdd->sgc, cpu); |
1068 | 1069 | ||
1069 | /* For claim_allocations: */ | 1070 | /* Increase refcounts for claim_allocations: */ |
1070 | atomic_inc(&sg->ref); | 1071 | already_visited = atomic_inc_return(&sg->ref) > 1; |
1071 | atomic_inc(&sg->sgc->ref); | 1072 | /* sgc visits should follow a similar trend as sg */ |
1073 | WARN_ON(already_visited != (atomic_inc_return(&sg->sgc->ref) > 1)); | ||
1074 | |||
1075 | /* If we have already visited that group, it's already initialized. */ | ||
1076 | if (already_visited) | ||
1077 | return sg; | ||
1072 | 1078 | ||
1073 | if (child) { | 1079 | if (child) { |
1074 | cpumask_copy(sched_group_span(sg), sched_domain_span(child)); | 1080 | cpumask_copy(sched_group_span(sg), sched_domain_span(child)); |
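
get_group() now derives "already visited" from the reference count it was taking anyway: if the post-increment value is greater than one, the group was initialized by an earlier call and can be returned as-is. A standalone sketch of the pattern using C11 atomics in place of the kernel's atomic_t:

#include <stdatomic.h>
#include <stdio.h>

struct group {
	atomic_int ref;
	int initialized;
};

/* Take a reference; if this was not the first reference, someone else
 * already set the group up -- the same trick get_group() plays with
 * atomic_inc_return(&sg->ref) > 1. */
static void get_group(struct group *g)
{
	int already_visited = atomic_fetch_add(&g->ref, 1) > 0;

	if (already_visited)
		return;

	g->initialized = 1;	/* expensive one-time initialization */
}

int main(void)
{
	struct group g = { .ref = 0, .initialized = 0 };

	get_group(&g);
	get_group(&g);
	printf("ref=%d initialized=%d\n",
	       atomic_load(&g.ref), g.initialized);	/* ref=2 initialized=1 */
	return 0;
}
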
@@ -1087,8 +1093,8 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd) | |||
1087 | 1093 | ||
1088 | /* | 1094 | /* |
1089 | * build_sched_groups will build a circular linked list of the groups | 1095 | * build_sched_groups will build a circular linked list of the groups |
1090 | * covered by the given span, and will set each group's ->cpumask correctly, | 1096 | * covered by the given span, will set each group's ->cpumask correctly, |
1091 | * and ->cpu_capacity to 0. | 1097 | * and will initialize their ->sgc. |
1092 | * | 1098 | * |
1093 | * Assumes the sched_domain tree is fully constructed | 1099 | * Assumes the sched_domain tree is fully constructed |
1094 | */ | 1100 | */ |
@@ -2075,9 +2081,8 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) | |||
2075 | } | 2081 | } |
2076 | 2082 | ||
2077 | /* | 2083 | /* |
2078 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. | 2084 | * Set up scheduler domains and groups. For now this just excludes isolated |
2079 | * For now this just excludes isolated CPUs, but could be used to | 2085 | * CPUs, but could be used to exclude other special cases in the future. |
2080 | * exclude other special cases in the future. | ||
2081 | */ | 2086 | */ |
2082 | int sched_init_domains(const struct cpumask *cpu_map) | 2087 | int sched_init_domains(const struct cpumask *cpu_map) |
2083 | { | 2088 | { |
diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 54a0347ca812..a635ecba6fe2 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c | |||
@@ -149,7 +149,7 @@ static void populate_seccomp_data(struct seccomp_data *sd) | |||
149 | 149 | ||
150 | sd->nr = syscall_get_nr(task, regs); | 150 | sd->nr = syscall_get_nr(task, regs); |
151 | sd->arch = syscall_get_arch(); | 151 | sd->arch = syscall_get_arch(); |
152 | syscall_get_arguments(task, regs, 0, 6, args); | 152 | syscall_get_arguments(task, regs, args); |
153 | sd->args[0] = args[0]; | 153 | sd->args[0] = args[0]; |
154 | sd->args[1] = args[1]; | 154 | sd->args[1] = args[1]; |
155 | sd->args[2] = args[2]; | 155 | sd->args[2] = args[2]; |
@@ -331,7 +331,7 @@ static int is_ancestor(struct seccomp_filter *parent, | |||
331 | * Expects sighand and cred_guard_mutex locks to be held. | 331 | * Expects sighand and cred_guard_mutex locks to be held. |
332 | * | 332 | * |
333 | * Returns 0 on success, -ve on error, or the pid of a thread which was | 333 | * Returns 0 on success, -ve on error, or the pid of a thread which was |
334 | * either not in the correct seccomp mode or it did not have an ancestral | 334 | * either not in the correct seccomp mode or did not have an ancestral |
335 | * seccomp filter. | 335 | * seccomp filter. |
336 | */ | 336 | */ |
337 | static inline pid_t seccomp_can_sync_threads(void) | 337 | static inline pid_t seccomp_can_sync_threads(void) |
@@ -502,7 +502,10 @@ out: | |||
502 | * | 502 | * |
503 | * Caller must be holding current->sighand->siglock lock. | 503 | * Caller must be holding current->sighand->siglock lock. |
504 | * | 504 | * |
505 | * Returns 0 on success, -ve on error. | 505 | * Returns 0 on success, -ve on error, or |
506 | * - in TSYNC mode: the pid of a thread which was either not in the correct | ||
507 | * seccomp mode or did not have an ancestral seccomp filter | ||
508 | * - in NEW_LISTENER mode: the fd of the new listener | ||
506 | */ | 509 | */ |
507 | static long seccomp_attach_filter(unsigned int flags, | 510 | static long seccomp_attach_filter(unsigned int flags, |
508 | struct seccomp_filter *filter) | 511 | struct seccomp_filter *filter) |
@@ -1258,6 +1261,16 @@ static long seccomp_set_mode_filter(unsigned int flags, | |||
1258 | if (flags & ~SECCOMP_FILTER_FLAG_MASK) | 1261 | if (flags & ~SECCOMP_FILTER_FLAG_MASK) |
1259 | return -EINVAL; | 1262 | return -EINVAL; |
1260 | 1263 | ||
1264 | /* | ||
1265 | * In the successful case, NEW_LISTENER returns the new listener fd. | ||
1266 | * But in the failure case, TSYNC returns the thread that died. If you | ||
1267 | * combine these two flags, there's no way to tell whether something | ||
1268 | * succeeded or failed. So, let's disallow this combination. | ||
1269 | */ | ||
1270 | if ((flags & SECCOMP_FILTER_FLAG_TSYNC) && | ||
1271 | (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER)) | ||
1272 | return -EINVAL; | ||
1273 | |||
1261 | /* Prepare the new filter before holding any locks. */ | 1274 | /* Prepare the new filter before holding any locks. */ |
1262 | prepared = seccomp_prepare_user_filter(filter); | 1275 | prepared = seccomp_prepare_user_filter(filter); |
1263 | if (IS_ERR(prepared)) | 1276 | if (IS_ERR(prepared)) |
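
The new check above rejects SECCOMP_FILTER_FLAG_TSYNC combined with SECCOMP_FILTER_FLAG_NEW_LISTENER, since a TSYNC failure returns a pid while a NEW_LISTENER success returns an fd, and the two would be indistinguishable. A hedged userspace probe (assuming seccomp(2) is reachable via syscall() and these flags are defined in <linux/seccomp.h>); on a kernel with this change the call fails with EINVAL before the deliberately NULL filter pointer is even examined:

#include <errno.h>
#include <linux/seccomp.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef SECCOMP_FILTER_FLAG_NEW_LISTENER
#define SECCOMP_FILTER_FLAG_NEW_LISTENER	(1UL << 3)	/* older headers */
#endif

int main(void)
{
	/* Asking for TSYNC and NEW_LISTENER at the same time is ambiguous,
	 * so the kernel now returns -EINVAL up front. */
	long ret = syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER,
			   SECCOMP_FILTER_FLAG_TSYNC |
			   SECCOMP_FILTER_FLAG_NEW_LISTENER, NULL);

	printf("seccomp() = %ld (%s)\n", ret,
	       ret < 0 ? strerror(errno) : "ok");
	return 0;
}
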
@@ -1304,7 +1317,7 @@ out: | |||
1304 | mutex_unlock(¤t->signal->cred_guard_mutex); | 1317 | mutex_unlock(¤t->signal->cred_guard_mutex); |
1305 | out_put_fd: | 1318 | out_put_fd: |
1306 | if (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) { | 1319 | if (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) { |
1307 | if (ret < 0) { | 1320 | if (ret) { |
1308 | listener_f->private_data = NULL; | 1321 | listener_f->private_data = NULL; |
1309 | fput(listener_f); | 1322 | fput(listener_f); |
1310 | put_unused_fd(listener); | 1323 | put_unused_fd(listener); |
diff --git a/kernel/signal.c b/kernel/signal.c index b7953934aa99..cd83cc376767 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -3513,7 +3513,6 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) | |||
3513 | return kill_something_info(sig, &info, pid); | 3513 | return kill_something_info(sig, &info, pid); |
3514 | } | 3514 | } |
3515 | 3515 | ||
3516 | #ifdef CONFIG_PROC_FS | ||
3517 | /* | 3516 | /* |
3518 | * Verify that the signaler and signalee either are in the same pid namespace | 3517 | * Verify that the signaler and signalee either are in the same pid namespace |
3519 | * or that the signaler's pid namespace is an ancestor of the signalee's pid | 3518 | * or that the signaler's pid namespace is an ancestor of the signalee's pid |
@@ -3550,6 +3549,14 @@ static int copy_siginfo_from_user_any(kernel_siginfo_t *kinfo, siginfo_t *info) | |||
3550 | return copy_siginfo_from_user(kinfo, info); | 3549 | return copy_siginfo_from_user(kinfo, info); |
3551 | } | 3550 | } |
3552 | 3551 | ||
3552 | static struct pid *pidfd_to_pid(const struct file *file) | ||
3553 | { | ||
3554 | if (file->f_op == &pidfd_fops) | ||
3555 | return file->private_data; | ||
3556 | |||
3557 | return tgid_pidfd_to_pid(file); | ||
3558 | } | ||
3559 | |||
3553 | /** | 3560 | /** |
3554 | * sys_pidfd_send_signal - send a signal to a process through a task file | 3561 | * sys_pidfd_send_signal - send a signal to a process through a task file |
3555 | * descriptor | 3562 | * descriptor |
@@ -3581,12 +3588,12 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, | |||
3581 | if (flags) | 3588 | if (flags) |
3582 | return -EINVAL; | 3589 | return -EINVAL; |
3583 | 3590 | ||
3584 | f = fdget_raw(pidfd); | 3591 | f = fdget(pidfd); |
3585 | if (!f.file) | 3592 | if (!f.file) |
3586 | return -EBADF; | 3593 | return -EBADF; |
3587 | 3594 | ||
3588 | /* Is this a pidfd? */ | 3595 | /* Is this a pidfd? */ |
3589 | pid = tgid_pidfd_to_pid(f.file); | 3596 | pid = pidfd_to_pid(f.file); |
3590 | if (IS_ERR(pid)) { | 3597 | if (IS_ERR(pid)) { |
3591 | ret = PTR_ERR(pid); | 3598 | ret = PTR_ERR(pid); |
3592 | goto err; | 3599 | goto err; |
@@ -3605,16 +3612,11 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, | |||
3605 | if (unlikely(sig != kinfo.si_signo)) | 3612 | if (unlikely(sig != kinfo.si_signo)) |
3606 | goto err; | 3613 | goto err; |
3607 | 3614 | ||
3615 | /* Only allow sending arbitrary signals to yourself. */ | ||
3616 | ret = -EPERM; | ||
3608 | if ((task_pid(current) != pid) && | 3617 | if ((task_pid(current) != pid) && |
3609 | (kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL)) { | 3618 | (kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL)) |
3610 | /* Only allow sending arbitrary signals to yourself. */ | 3619 | goto err; |
3611 | ret = -EPERM; | ||
3612 | if (kinfo.si_code != SI_USER) | ||
3613 | goto err; | ||
3614 | |||
3615 | /* Turn this into a regular kill signal. */ | ||
3616 | prepare_kill_siginfo(sig, &kinfo); | ||
3617 | } | ||
3618 | } else { | 3620 | } else { |
3619 | prepare_kill_siginfo(sig, &kinfo); | 3621 | prepare_kill_siginfo(sig, &kinfo); |
3620 | } | 3622 | } |
@@ -3625,7 +3627,6 @@ err: | |||
3625 | fdput(f); | 3627 | fdput(f); |
3626 | return ret; | 3628 | return ret; |
3627 | } | 3629 | } |
3628 | #endif /* CONFIG_PROC_FS */ | ||
3629 | 3630 | ||
3630 | static int | 3631 | static int |
3631 | do_send_specific(pid_t tgid, pid_t pid, int sig, struct kernel_siginfo *info) | 3632 | do_send_specific(pid_t tgid, pid_t pid, int sig, struct kernel_siginfo *info) |
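With pidfd_to_pid() the syscall accepts both a CLONE_PIDFD file descriptor and the older /proc/<pid> directory fd, and the forged-siginfo check collapses to a single EPERM bail-out. A hedged userspace sketch, assuming the x86-64 syscall number for pidfd_send_signal; the fallback define and the /proc path are illustrative only:

#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_pidfd_send_signal
#define __NR_pidfd_send_signal 424	/* assumption: x86-64 number */
#endif

int main(int argc, char **argv)
{
	char path[64];
	int pidfd;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <pid>\n", argv[0]);
		return 1;
	}

	/* A /proc/<pid> directory fd is still accepted as a pidfd here. */
	snprintf(path, sizeof(path), "/proc/%s", argv[1]);
	pidfd = open(path, O_DIRECTORY | O_CLOEXEC);
	if (pidfd < 0) {
		perror("open");
		return 1;
	}

	/* info == NULL: the kernel builds an SI_USER siginfo on our behalf. */
	if (syscall(__NR_pidfd_send_signal, pidfd, SIGTERM, NULL, 0) < 0)
		perror("pidfd_send_signal");

	close(pidfd);
	return 0;
}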
diff --git a/kernel/softirq.c b/kernel/softirq.c index 10277429ed84..2c3382378d94 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
@@ -573,57 +573,6 @@ void tasklet_kill(struct tasklet_struct *t) | |||
573 | } | 573 | } |
574 | EXPORT_SYMBOL(tasklet_kill); | 574 | EXPORT_SYMBOL(tasklet_kill); |
575 | 575 | ||
576 | /* | ||
577 | * tasklet_hrtimer | ||
578 | */ | ||
579 | |||
580 | /* | ||
581 | * The trampoline is called when the hrtimer expires. It schedules a tasklet | ||
582 | * to run __tasklet_hrtimer_trampoline() which in turn will call the intended | ||
583 | * hrtimer callback, but from softirq context. | ||
584 | */ | ||
585 | static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer) | ||
586 | { | ||
587 | struct tasklet_hrtimer *ttimer = | ||
588 | container_of(timer, struct tasklet_hrtimer, timer); | ||
589 | |||
590 | tasklet_hi_schedule(&ttimer->tasklet); | ||
591 | return HRTIMER_NORESTART; | ||
592 | } | ||
593 | |||
594 | /* | ||
595 | * Helper function which calls the hrtimer callback from | ||
596 | * tasklet/softirq context | ||
597 | */ | ||
598 | static void __tasklet_hrtimer_trampoline(unsigned long data) | ||
599 | { | ||
600 | struct tasklet_hrtimer *ttimer = (void *)data; | ||
601 | enum hrtimer_restart restart; | ||
602 | |||
603 | restart = ttimer->function(&ttimer->timer); | ||
604 | if (restart != HRTIMER_NORESTART) | ||
605 | hrtimer_restart(&ttimer->timer); | ||
606 | } | ||
607 | |||
608 | /** | ||
609 | * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks | ||
610 | * @ttimer: tasklet_hrtimer which is initialized | ||
611 | * @function: hrtimer callback function which gets called from softirq context | ||
612 | * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME) | ||
613 | * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL) | ||
614 | */ | ||
615 | void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer, | ||
616 | enum hrtimer_restart (*function)(struct hrtimer *), | ||
617 | clockid_t which_clock, enum hrtimer_mode mode) | ||
618 | { | ||
619 | hrtimer_init(&ttimer->timer, which_clock, mode); | ||
620 | ttimer->timer.function = __hrtimer_tasklet_trampoline; | ||
621 | tasklet_init(&ttimer->tasklet, __tasklet_hrtimer_trampoline, | ||
622 | (unsigned long)ttimer); | ||
623 | ttimer->function = function; | ||
624 | } | ||
625 | EXPORT_SYMBOL_GPL(tasklet_hrtimer_init); | ||
626 | |||
627 | void __init softirq_init(void) | 576 | void __init softirq_init(void) |
628 | { | 577 | { |
629 | int cpu; | 578 | int cpu; |
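The tasklet_hrtimer machinery can go because hrtimers are now able to expire their callbacks directly in softirq context. A hedged sketch of the conversion pattern this removal implies, with made-up timer and callback names; the last in-tree users were converted in their own subsystems ahead of the removal:

#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct hrtimer demo_timer;	/* hypothetical former tasklet_hrtimer user */

static enum hrtimer_restart demo_timer_fn(struct hrtimer *t)
{
	/* Runs in softirq context, like the old tasklet trampoline did. */
	return HRTIMER_NORESTART;
}

static void demo_timer_start(void)
{
	hrtimer_init(&demo_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
	demo_timer.function = demo_timer_fn;
	hrtimer_start(&demo_timer, ms_to_ktime(10), HRTIMER_MODE_REL_SOFT);
}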
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index f8edee9c792d..27bafc1e271e 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c | |||
@@ -5,41 +5,56 @@ | |||
5 | * | 5 | * |
6 | * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | 6 | * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> |
7 | */ | 7 | */ |
8 | #include <linux/sched/task_stack.h> | ||
9 | #include <linux/sched/debug.h> | ||
8 | #include <linux/sched.h> | 10 | #include <linux/sched.h> |
9 | #include <linux/kernel.h> | 11 | #include <linux/kernel.h> |
10 | #include <linux/export.h> | 12 | #include <linux/export.h> |
11 | #include <linux/kallsyms.h> | 13 | #include <linux/kallsyms.h> |
12 | #include <linux/stacktrace.h> | 14 | #include <linux/stacktrace.h> |
13 | 15 | ||
14 | void print_stack_trace(struct stack_trace *trace, int spaces) | 16 | /** |
17 | * stack_trace_print - Print the entries in the stack trace | ||
18 | * @entries: Pointer to storage array | ||
19 | * @nr_entries: Number of entries in the storage array | ||
20 | * @spaces: Number of leading spaces to print | ||
21 | */ | ||
22 | void stack_trace_print(unsigned long *entries, unsigned int nr_entries, | ||
23 | int spaces) | ||
15 | { | 24 | { |
16 | int i; | 25 | unsigned int i; |
17 | 26 | ||
18 | if (WARN_ON(!trace->entries)) | 27 | if (WARN_ON(!entries)) |
19 | return; | 28 | return; |
20 | 29 | ||
21 | for (i = 0; i < trace->nr_entries; i++) | 30 | for (i = 0; i < nr_entries; i++) |
22 | printk("%*c%pS\n", 1 + spaces, ' ', (void *)trace->entries[i]); | 31 | printk("%*c%pS\n", 1 + spaces, ' ', (void *)entries[i]); |
23 | } | 32 | } |
24 | EXPORT_SYMBOL_GPL(print_stack_trace); | 33 | EXPORT_SYMBOL_GPL(stack_trace_print); |
25 | 34 | ||
26 | int snprint_stack_trace(char *buf, size_t size, | 35 | /** |
27 | struct stack_trace *trace, int spaces) | 36 | * stack_trace_snprint - Print the entries in the stack trace into a buffer |
37 | * @buf: Pointer to the print buffer | ||
38 | * @size: Size of the print buffer | ||
39 | * @entries: Pointer to storage array | ||
40 | * @nr_entries: Number of entries in the storage array | ||
41 | * @spaces: Number of leading spaces to print | ||
42 | * | ||
43 | * Return: Number of bytes printed. | ||
44 | */ | ||
45 | int stack_trace_snprint(char *buf, size_t size, unsigned long *entries, | ||
46 | unsigned int nr_entries, int spaces) | ||
28 | { | 47 | { |
29 | int i; | 48 | unsigned int generated, i, total = 0; |
30 | int generated; | ||
31 | int total = 0; | ||
32 | 49 | ||
33 | if (WARN_ON(!trace->entries)) | 50 | if (WARN_ON(!entries)) |
34 | return 0; | 51 | return 0; |
35 | 52 | ||
36 | for (i = 0; i < trace->nr_entries; i++) { | 53 | for (i = 0; i < nr_entries && size; i++) { |
37 | generated = snprintf(buf, size, "%*c%pS\n", 1 + spaces, ' ', | 54 | generated = snprintf(buf, size, "%*c%pS\n", 1 + spaces, ' ', |
38 | (void *)trace->entries[i]); | 55 | (void *)entries[i]); |
39 | 56 | ||
40 | total += generated; | 57 | total += generated; |
41 | |||
42 | /* Assume that generated isn't a negative number */ | ||
43 | if (generated >= size) { | 58 | if (generated >= size) { |
44 | buf += size; | 59 | buf += size; |
45 | size = 0; | 60 | size = 0; |
@@ -51,7 +66,176 @@ int snprint_stack_trace(char *buf, size_t size, | |||
51 | 66 | ||
52 | return total; | 67 | return total; |
53 | } | 68 | } |
54 | EXPORT_SYMBOL_GPL(snprint_stack_trace); | 69 | EXPORT_SYMBOL_GPL(stack_trace_snprint); |
70 | |||
71 | #ifdef CONFIG_ARCH_STACKWALK | ||
72 | |||
73 | struct stacktrace_cookie { | ||
74 | unsigned long *store; | ||
75 | unsigned int size; | ||
76 | unsigned int skip; | ||
77 | unsigned int len; | ||
78 | }; | ||
79 | |||
80 | static bool stack_trace_consume_entry(void *cookie, unsigned long addr, | ||
81 | bool reliable) | ||
82 | { | ||
83 | struct stacktrace_cookie *c = cookie; | ||
84 | |||
85 | if (c->len >= c->size) | ||
86 | return false; | ||
87 | |||
88 | if (c->skip > 0) { | ||
89 | c->skip--; | ||
90 | return true; | ||
91 | } | ||
92 | c->store[c->len++] = addr; | ||
93 | return c->len < c->size; | ||
94 | } | ||
95 | |||
96 | static bool stack_trace_consume_entry_nosched(void *cookie, unsigned long addr, | ||
97 | bool reliable) | ||
98 | { | ||
99 | if (in_sched_functions(addr)) | ||
100 | return true; | ||
101 | return stack_trace_consume_entry(cookie, addr, reliable); | ||
102 | } | ||
103 | |||
104 | /** | ||
105 | * stack_trace_save - Save a stack trace into a storage array | ||
106 | * @store: Pointer to storage array | ||
107 | * @size: Size of the storage array | ||
108 | * @skipnr: Number of entries to skip at the start of the stack trace | ||
109 | * | ||
110 | * Return: Number of trace entries stored. | ||
111 | */ | ||
112 | unsigned int stack_trace_save(unsigned long *store, unsigned int size, | ||
113 | unsigned int skipnr) | ||
114 | { | ||
115 | stack_trace_consume_fn consume_entry = stack_trace_consume_entry; | ||
116 | struct stacktrace_cookie c = { | ||
117 | .store = store, | ||
118 | .size = size, | ||
119 | .skip = skipnr + 1, | ||
120 | }; | ||
121 | |||
122 | arch_stack_walk(consume_entry, &c, current, NULL); | ||
123 | return c.len; | ||
124 | } | ||
125 | EXPORT_SYMBOL_GPL(stack_trace_save); | ||
126 | |||
127 | /** | ||
128 | * stack_trace_save_tsk - Save a task stack trace into a storage array | ||
129 | * @task: The task to examine | ||
130 | * @store: Pointer to storage array | ||
131 | * @size: Size of the storage array | ||
132 | * @skipnr: Number of entries to skip at the start of the stack trace | ||
133 | * | ||
134 | * Return: Number of trace entries stored. | ||
135 | */ | ||
136 | unsigned int stack_trace_save_tsk(struct task_struct *tsk, unsigned long *store, | ||
137 | unsigned int size, unsigned int skipnr) | ||
138 | { | ||
139 | stack_trace_consume_fn consume_entry = stack_trace_consume_entry_nosched; | ||
140 | struct stacktrace_cookie c = { | ||
141 | .store = store, | ||
142 | .size = size, | ||
143 | .skip = skipnr + 1, | ||
144 | }; | ||
145 | |||
146 | if (!try_get_task_stack(tsk)) | ||
147 | return 0; | ||
148 | |||
149 | arch_stack_walk(consume_entry, &c, tsk, NULL); | ||
150 | put_task_stack(tsk); | ||
151 | return c.len; | ||
152 | } | ||
153 | |||
154 | /** | ||
155 | * stack_trace_save_regs - Save a stack trace based on pt_regs into a storage array | ||
156 | * @regs: Pointer to pt_regs to examine | ||
157 | * @store: Pointer to storage array | ||
158 | * @size: Size of the storage array | ||
159 | * @skipnr: Number of entries to skip at the start of the stack trace | ||
160 | * | ||
161 | * Return: Number of trace entries stored. | ||
162 | */ | ||
163 | unsigned int stack_trace_save_regs(struct pt_regs *regs, unsigned long *store, | ||
164 | unsigned int size, unsigned int skipnr) | ||
165 | { | ||
166 | stack_trace_consume_fn consume_entry = stack_trace_consume_entry; | ||
167 | struct stacktrace_cookie c = { | ||
168 | .store = store, | ||
169 | .size = size, | ||
170 | .skip = skipnr, | ||
171 | }; | ||
172 | |||
173 | arch_stack_walk(consume_entry, &c, current, regs); | ||
174 | return c.len; | ||
175 | } | ||
176 | |||
177 | #ifdef CONFIG_HAVE_RELIABLE_STACKTRACE | ||
178 | /** | ||
179 | * stack_trace_save_tsk_reliable - Save task stack with verification | ||
180 | * @tsk: Pointer to the task to examine | ||
181 | * @store: Pointer to storage array | ||
182 | * @size: Size of the storage array | ||
183 | * | ||
184 | * Return: An error if it detects any unreliable features of the | ||
185 | * stack. Otherwise it guarantees that the stack trace is | ||
186 | * reliable and returns the number of entries stored. | ||
187 | * | ||
188 | * If the task is not 'current', the caller *must* ensure the task is inactive. | ||
189 | */ | ||
190 | int stack_trace_save_tsk_reliable(struct task_struct *tsk, unsigned long *store, | ||
191 | unsigned int size) | ||
192 | { | ||
193 | stack_trace_consume_fn consume_entry = stack_trace_consume_entry; | ||
194 | struct stacktrace_cookie c = { | ||
195 | .store = store, | ||
196 | .size = size, | ||
197 | }; | ||
198 | int ret; | ||
199 | |||
200 | /* | ||
201 | * If the task doesn't have a stack (e.g., a zombie), the stack is | ||
202 | * "reliably" empty. | ||
203 | */ | ||
204 | if (!try_get_task_stack(tsk)) | ||
205 | return 0; | ||
206 | |||
207 | ret = arch_stack_walk_reliable(consume_entry, &c, tsk); | ||
208 | put_task_stack(tsk); | ||
209 | return ret; | ||
210 | } | ||
211 | #endif | ||
212 | |||
213 | #ifdef CONFIG_USER_STACKTRACE_SUPPORT | ||
214 | /** | ||
215 | * stack_trace_save_user - Save a user space stack trace into a storage array | ||
216 | * @store: Pointer to storage array | ||
217 | * @size: Size of the storage array | ||
218 | * | ||
219 | * Return: Number of trace entries stored. | ||
220 | */ | ||
221 | unsigned int stack_trace_save_user(unsigned long *store, unsigned int size) | ||
222 | { | ||
223 | stack_trace_consume_fn consume_entry = stack_trace_consume_entry; | ||
224 | struct stacktrace_cookie c = { | ||
225 | .store = store, | ||
226 | .size = size, | ||
227 | }; | ||
228 | |||
229 | /* Trace user stack if not a kernel thread */ | ||
230 | if (!current->mm) | ||
231 | return 0; | ||
232 | |||
233 | arch_stack_walk_user(consume_entry, &c, task_pt_regs(current)); | ||
234 | return c.len; | ||
235 | } | ||
236 | #endif | ||
237 | |||
238 | #else /* CONFIG_ARCH_STACKWALK */ | ||
55 | 239 | ||
56 | /* | 240 | /* |
57 | * Architectures that do not implement save_stack_trace_*() | 241 | * Architectures that do not implement save_stack_trace_*() |
@@ -77,3 +261,118 @@ save_stack_trace_tsk_reliable(struct task_struct *tsk, | |||
77 | WARN_ONCE(1, KERN_INFO "save_stack_tsk_reliable() not implemented yet.\n"); | 261 | WARN_ONCE(1, KERN_INFO "save_stack_tsk_reliable() not implemented yet.\n"); |
78 | return -ENOSYS; | 262 | return -ENOSYS; |
79 | } | 263 | } |
264 | |||
265 | /** | ||
266 | * stack_trace_save - Save a stack trace into a storage array | ||
267 | * @store: Pointer to storage array | ||
268 | * @size: Size of the storage array | ||
269 | * @skipnr: Number of entries to skip at the start of the stack trace | ||
270 | * | ||
271 | * Return: Number of trace entries stored | ||
272 | */ | ||
273 | unsigned int stack_trace_save(unsigned long *store, unsigned int size, | ||
274 | unsigned int skipnr) | ||
275 | { | ||
276 | struct stack_trace trace = { | ||
277 | .entries = store, | ||
278 | .max_entries = size, | ||
279 | .skip = skipnr + 1, | ||
280 | }; | ||
281 | |||
282 | save_stack_trace(&trace); | ||
283 | return trace.nr_entries; | ||
284 | } | ||
285 | EXPORT_SYMBOL_GPL(stack_trace_save); | ||
286 | |||
287 | /** | ||
288 | * stack_trace_save_tsk - Save a task stack trace into a storage array | ||
289 | * @task: The task to examine | ||
290 | * @store: Pointer to storage array | ||
291 | * @size: Size of the storage array | ||
292 | * @skipnr: Number of entries to skip at the start of the stack trace | ||
293 | * | ||
294 | * Return: Number of trace entries stored | ||
295 | */ | ||
296 | unsigned int stack_trace_save_tsk(struct task_struct *task, | ||
297 | unsigned long *store, unsigned int size, | ||
298 | unsigned int skipnr) | ||
299 | { | ||
300 | struct stack_trace trace = { | ||
301 | .entries = store, | ||
302 | .max_entries = size, | ||
303 | .skip = skipnr + 1, | ||
304 | }; | ||
305 | |||
306 | save_stack_trace_tsk(task, &trace); | ||
307 | return trace.nr_entries; | ||
308 | } | ||
309 | |||
310 | /** | ||
311 | * stack_trace_save_regs - Save a stack trace based on pt_regs into a storage array | ||
312 | * @regs: Pointer to pt_regs to examine | ||
313 | * @store: Pointer to storage array | ||
314 | * @size: Size of the storage array | ||
315 | * @skipnr: Number of entries to skip at the start of the stack trace | ||
316 | * | ||
317 | * Return: Number of trace entries stored | ||
318 | */ | ||
319 | unsigned int stack_trace_save_regs(struct pt_regs *regs, unsigned long *store, | ||
320 | unsigned int size, unsigned int skipnr) | ||
321 | { | ||
322 | struct stack_trace trace = { | ||
323 | .entries = store, | ||
324 | .max_entries = size, | ||
325 | .skip = skipnr, | ||
326 | }; | ||
327 | |||
328 | save_stack_trace_regs(regs, &trace); | ||
329 | return trace.nr_entries; | ||
330 | } | ||
331 | |||
332 | #ifdef CONFIG_HAVE_RELIABLE_STACKTRACE | ||
333 | /** | ||
334 | * stack_trace_save_tsk_reliable - Save task stack with verification | ||
335 | * @tsk: Pointer to the task to examine | ||
336 | * @store: Pointer to storage array | ||
337 | * @size: Size of the storage array | ||
338 | * | ||
339 | * Return: An error if it detects any unreliable features of the | ||
340 | * stack. Otherwise it guarantees that the stack trace is | ||
341 | * reliable and returns the number of entries stored. | ||
342 | * | ||
343 | * If the task is not 'current', the caller *must* ensure the task is inactive. | ||
344 | */ | ||
345 | int stack_trace_save_tsk_reliable(struct task_struct *tsk, unsigned long *store, | ||
346 | unsigned int size) | ||
347 | { | ||
348 | struct stack_trace trace = { | ||
349 | .entries = store, | ||
350 | .max_entries = size, | ||
351 | }; | ||
352 | int ret = save_stack_trace_tsk_reliable(tsk, &trace); | ||
353 | |||
354 | return ret ? ret : trace.nr_entries; | ||
355 | } | ||
356 | #endif | ||
357 | |||
358 | #ifdef CONFIG_USER_STACKTRACE_SUPPORT | ||
359 | /** | ||
360 | * stack_trace_save_user - Save a user space stack trace into a storage array | ||
361 | * @store: Pointer to storage array | ||
362 | * @size: Size of the storage array | ||
363 | * | ||
364 | * Return: Number of trace entries stored | ||
365 | */ | ||
366 | unsigned int stack_trace_save_user(unsigned long *store, unsigned int size) | ||
367 | { | ||
368 | struct stack_trace trace = { | ||
369 | .entries = store, | ||
370 | .max_entries = size, | ||
371 | }; | ||
372 | |||
373 | save_stack_trace_user(&trace); | ||
374 | return trace.nr_entries; | ||
375 | } | ||
376 | #endif /* CONFIG_USER_STACKTRACE_SUPPORT */ | ||
377 | |||
378 | #endif /* !CONFIG_ARCH_STACKWALK */ | ||
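Whichever side of CONFIG_ARCH_STACKWALK is built, callers now see the same simplified interface: a plain array of entries instead of a struct stack_trace. A hedged sketch of a caller, with a hypothetical helper name:

#include <linux/kernel.h>
#include <linux/stacktrace.h>

static void dump_my_stack(void)
{
	unsigned long entries[16];
	unsigned int nr;

	/* skipnr = 1 drops this helper's own frame from the trace. */
	nr = stack_trace_save(entries, ARRAY_SIZE(entries), 1);
	stack_trace_print(entries, nr, 0);
}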
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 067cb83f37ea..7231fb5953fc 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c | |||
@@ -513,7 +513,7 @@ repeat: | |||
513 | } | 513 | } |
514 | preempt_count_dec(); | 514 | preempt_count_dec(); |
515 | WARN_ONCE(preempt_count(), | 515 | WARN_ONCE(preempt_count(), |
516 | "cpu_stop: %pf(%p) leaked preempt count\n", fn, arg); | 516 | "cpu_stop: %ps(%p) leaked preempt count\n", fn, arg); |
517 | goto repeat; | 517 | goto repeat; |
518 | } | 518 | } |
519 | } | 519 | } |
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index d21f4befaea4..4d9ae5ea6caf 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c | |||
@@ -167,9 +167,6 @@ COND_SYSCALL(syslog); | |||
167 | 167 | ||
168 | /* kernel/sched/core.c */ | 168 | /* kernel/sched/core.c */ |
169 | 169 | ||
170 | /* kernel/signal.c */ | ||
171 | COND_SYSCALL(pidfd_send_signal); | ||
172 | |||
173 | /* kernel/sys.c */ | 170 | /* kernel/sys.c */ |
174 | COND_SYSCALL(setregid); | 171 | COND_SYSCALL(setregid); |
175 | COND_SYSCALL(setgid); | 172 | COND_SYSCALL(setgid); |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e5da394d1ca3..c9ec050bcf46 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -128,6 +128,7 @@ static int zero; | |||
128 | static int __maybe_unused one = 1; | 128 | static int __maybe_unused one = 1; |
129 | static int __maybe_unused two = 2; | 129 | static int __maybe_unused two = 2; |
130 | static int __maybe_unused four = 4; | 130 | static int __maybe_unused four = 4; |
131 | static unsigned long zero_ul; | ||
131 | static unsigned long one_ul = 1; | 132 | static unsigned long one_ul = 1; |
132 | static unsigned long long_max = LONG_MAX; | 133 | static unsigned long long_max = LONG_MAX; |
133 | static int one_hundred = 100; | 134 | static int one_hundred = 100; |
@@ -1750,7 +1751,7 @@ static struct ctl_table fs_table[] = { | |||
1750 | .maxlen = sizeof(files_stat.max_files), | 1751 | .maxlen = sizeof(files_stat.max_files), |
1751 | .mode = 0644, | 1752 | .mode = 0644, |
1752 | .proc_handler = proc_doulongvec_minmax, | 1753 | .proc_handler = proc_doulongvec_minmax, |
1753 | .extra1 = &zero, | 1754 | .extra1 = &zero_ul, |
1754 | .extra2 = &long_max, | 1755 | .extra2 = &long_max, |
1755 | }, | 1756 | }, |
1756 | { | 1757 | { |
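fs.file-max is handled by proc_doulongvec_minmax(), which dereferences extra1/extra2 as unsigned long; pointing extra1 at the int zero could read past the variable on 64-bit builds, hence the new zero_ul. A hedged sketch of a correctly typed entry, with made-up names:

#include <linux/sysctl.h>

static unsigned long demo_min;		/* 0 */
static unsigned long demo_max = 1024;
static unsigned long demo_value = 64;	/* hypothetical tunable */

static struct ctl_table demo_table[] = {
	{
		.procname	= "demo_value",
		.data		= &demo_value,
		.maxlen		= sizeof(demo_value),
		.mode		= 0644,
		.proc_handler	= proc_doulongvec_minmax,
		.extra1		= &demo_min,	/* must be unsigned long, not int */
		.extra2		= &demo_max,
	},
	{ }
};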
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 2c97e8c2d29f..0519a8805aab 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c | |||
@@ -594,7 +594,7 @@ static ktime_t alarm_timer_remaining(struct k_itimer *timr, ktime_t now) | |||
594 | { | 594 | { |
595 | struct alarm *alarm = &timr->it.alarm.alarmtimer; | 595 | struct alarm *alarm = &timr->it.alarm.alarmtimer; |
596 | 596 | ||
597 | return ktime_sub(now, alarm->node.expires); | 597 | return ktime_sub(alarm->node.expires, now); |
598 | } | 598 | } |
599 | 599 | ||
600 | /** | 600 | /** |
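The one-liner fixes an inverted subtraction: the time remaining on an alarm is its expiry minus now, so a pending alarm no longer reports a negative remaining time. A trivial hedged illustration of the intended sign:

#include <linux/ktime.h>

/* remaining > 0 while the alarm is still in the future */
static ktime_t alarm_remaining_example(ktime_t expires, ktime_t now)
{
	return ktime_sub(expires, now);
}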
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 5e77662dd2d9..f5490222e134 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c | |||
@@ -611,6 +611,22 @@ void clockevents_resume(void) | |||
611 | } | 611 | } |
612 | 612 | ||
613 | #ifdef CONFIG_HOTPLUG_CPU | 613 | #ifdef CONFIG_HOTPLUG_CPU |
614 | |||
615 | # ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST | ||
616 | /** | ||
617 | * tick_offline_cpu - Take CPU out of the broadcast mechanism | ||
618 | * @cpu: The outgoing CPU | ||
619 | * | ||
620 | * Called on the outgoing CPU after it took itself offline. | ||
621 | */ | ||
622 | void tick_offline_cpu(unsigned int cpu) | ||
623 | { | ||
624 | raw_spin_lock(&clockevents_lock); | ||
625 | tick_broadcast_offline(cpu); | ||
626 | raw_spin_unlock(&clockevents_lock); | ||
627 | } | ||
628 | # endif | ||
629 | |||
614 | /** | 630 | /** |
615 | * tick_cleanup_dead_cpu - Cleanup the tick and clockevents of a dead cpu | 631 | * tick_cleanup_dead_cpu - Cleanup the tick and clockevents of a dead cpu |
616 | */ | 632 | */ |
@@ -621,8 +637,6 @@ void tick_cleanup_dead_cpu(int cpu) | |||
621 | 637 | ||
622 | raw_spin_lock_irqsave(&clockevents_lock, flags); | 638 | raw_spin_lock_irqsave(&clockevents_lock, flags); |
623 | 639 | ||
624 | tick_shutdown_broadcast_oneshot(cpu); | ||
625 | tick_shutdown_broadcast(cpu); | ||
626 | tick_shutdown(cpu); | 640 | tick_shutdown(cpu); |
627 | /* | 641 | /* |
628 | * Unregister the clock event devices which were | 642 | * Unregister the clock event devices which were |
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index dc1b6f1929f9..d23b434c2ca7 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c | |||
@@ -63,7 +63,7 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock); | |||
63 | #if (BITS_PER_LONG < 64) | 63 | #if (BITS_PER_LONG < 64) |
64 | u64 get_jiffies_64(void) | 64 | u64 get_jiffies_64(void) |
65 | { | 65 | { |
66 | unsigned long seq; | 66 | unsigned int seq; |
67 | u64 ret; | 67 | u64 ret; |
68 | 68 | ||
69 | do { | 69 | do { |
@@ -89,7 +89,7 @@ struct clocksource * __init __weak clocksource_default_clock(void) | |||
89 | return &clocksource_jiffies; | 89 | return &clocksource_jiffies; |
90 | } | 90 | } |
91 | 91 | ||
92 | struct clocksource refined_jiffies; | 92 | static struct clocksource refined_jiffies; |
93 | 93 | ||
94 | int register_refined_jiffies(long cycles_per_second) | 94 | int register_refined_jiffies(long cycles_per_second) |
95 | { | 95 | { |
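A sequence count is only ever 32 bits wide, so unsigned int is the natural type for seq; the same change repeats across the readers touched below. A hedged, self-contained sketch of the retry-loop pattern with demo names (not the jiffies code itself):

#include <linux/seqlock.h>
#include <linux/types.h>

static DEFINE_SEQLOCK(demo_lock);
static u64 demo_value;

static u64 demo_read(void)
{
	unsigned int seq;
	u64 val;

	do {
		seq = read_seqbegin(&demo_lock);
		val = demo_value;
	} while (read_seqretry(&demo_lock, seq));

	return val;
}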
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 094b82ca95e5..142b07619918 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c | |||
@@ -94,7 +94,7 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) | |||
94 | unsigned long long notrace sched_clock(void) | 94 | unsigned long long notrace sched_clock(void) |
95 | { | 95 | { |
96 | u64 cyc, res; | 96 | u64 cyc, res; |
97 | unsigned long seq; | 97 | unsigned int seq; |
98 | struct clock_read_data *rd; | 98 | struct clock_read_data *rd; |
99 | 99 | ||
100 | do { | 100 | do { |
@@ -231,7 +231,7 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) | |||
231 | if (irqtime > 0 || (irqtime == -1 && rate >= 1000000)) | 231 | if (irqtime > 0 || (irqtime == -1 && rate >= 1000000)) |
232 | enable_sched_clock_irqtime(); | 232 | enable_sched_clock_irqtime(); |
233 | 233 | ||
234 | pr_debug("Registered %pF as sched_clock source\n", read); | 234 | pr_debug("Registered %pS as sched_clock source\n", read); |
235 | } | 235 | } |
236 | 236 | ||
237 | void __init generic_sched_clock_init(void) | 237 | void __init generic_sched_clock_init(void) |
@@ -267,12 +267,12 @@ void __init generic_sched_clock_init(void) | |||
267 | */ | 267 | */ |
268 | static u64 notrace suspended_sched_clock_read(void) | 268 | static u64 notrace suspended_sched_clock_read(void) |
269 | { | 269 | { |
270 | unsigned long seq = raw_read_seqcount(&cd.seq); | 270 | unsigned int seq = raw_read_seqcount(&cd.seq); |
271 | 271 | ||
272 | return cd.read_data[seq & 1].epoch_cyc; | 272 | return cd.read_data[seq & 1].epoch_cyc; |
273 | } | 273 | } |
274 | 274 | ||
275 | static int sched_clock_suspend(void) | 275 | int sched_clock_suspend(void) |
276 | { | 276 | { |
277 | struct clock_read_data *rd = &cd.read_data[0]; | 277 | struct clock_read_data *rd = &cd.read_data[0]; |
278 | 278 | ||
@@ -283,7 +283,7 @@ static int sched_clock_suspend(void) | |||
283 | return 0; | 283 | return 0; |
284 | } | 284 | } |
285 | 285 | ||
286 | static void sched_clock_resume(void) | 286 | void sched_clock_resume(void) |
287 | { | 287 | { |
288 | struct clock_read_data *rd = &cd.read_data[0]; | 288 | struct clock_read_data *rd = &cd.read_data[0]; |
289 | 289 | ||
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index ee834d4fb814..e51778c312f1 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c | |||
@@ -36,10 +36,16 @@ static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(tick_broadcast_lock); | |||
36 | static void tick_broadcast_setup_oneshot(struct clock_event_device *bc); | 36 | static void tick_broadcast_setup_oneshot(struct clock_event_device *bc); |
37 | static void tick_broadcast_clear_oneshot(int cpu); | 37 | static void tick_broadcast_clear_oneshot(int cpu); |
38 | static void tick_resume_broadcast_oneshot(struct clock_event_device *bc); | 38 | static void tick_resume_broadcast_oneshot(struct clock_event_device *bc); |
39 | # ifdef CONFIG_HOTPLUG_CPU | ||
40 | static void tick_broadcast_oneshot_offline(unsigned int cpu); | ||
41 | # endif | ||
39 | #else | 42 | #else |
40 | static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); } | 43 | static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); } |
41 | static inline void tick_broadcast_clear_oneshot(int cpu) { } | 44 | static inline void tick_broadcast_clear_oneshot(int cpu) { } |
42 | static inline void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { } | 45 | static inline void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { } |
46 | # ifdef CONFIG_HOTPLUG_CPU | ||
47 | static inline void tick_broadcast_oneshot_offline(unsigned int cpu) { } | ||
48 | # endif | ||
43 | #endif | 49 | #endif |
44 | 50 | ||
45 | /* | 51 | /* |
@@ -433,27 +439,29 @@ void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast) | |||
433 | } | 439 | } |
434 | 440 | ||
435 | #ifdef CONFIG_HOTPLUG_CPU | 441 | #ifdef CONFIG_HOTPLUG_CPU |
436 | /* | 442 | static void tick_shutdown_broadcast(void) |
437 | * Remove a CPU from broadcasting | ||
438 | */ | ||
439 | void tick_shutdown_broadcast(unsigned int cpu) | ||
440 | { | 443 | { |
441 | struct clock_event_device *bc; | 444 | struct clock_event_device *bc = tick_broadcast_device.evtdev; |
442 | unsigned long flags; | ||
443 | |||
444 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); | ||
445 | |||
446 | bc = tick_broadcast_device.evtdev; | ||
447 | cpumask_clear_cpu(cpu, tick_broadcast_mask); | ||
448 | cpumask_clear_cpu(cpu, tick_broadcast_on); | ||
449 | 445 | ||
450 | if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { | 446 | if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { |
451 | if (bc && cpumask_empty(tick_broadcast_mask)) | 447 | if (bc && cpumask_empty(tick_broadcast_mask)) |
452 | clockevents_shutdown(bc); | 448 | clockevents_shutdown(bc); |
453 | } | 449 | } |
450 | } | ||
454 | 451 | ||
455 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | 452 | /* |
453 | * Remove a CPU from broadcasting | ||
454 | */ | ||
455 | void tick_broadcast_offline(unsigned int cpu) | ||
456 | { | ||
457 | raw_spin_lock(&tick_broadcast_lock); | ||
458 | cpumask_clear_cpu(cpu, tick_broadcast_mask); | ||
459 | cpumask_clear_cpu(cpu, tick_broadcast_on); | ||
460 | tick_broadcast_oneshot_offline(cpu); | ||
461 | tick_shutdown_broadcast(); | ||
462 | raw_spin_unlock(&tick_broadcast_lock); | ||
456 | } | 463 | } |
464 | |||
457 | #endif | 465 | #endif |
458 | 466 | ||
459 | void tick_suspend_broadcast(void) | 467 | void tick_suspend_broadcast(void) |
@@ -801,13 +809,13 @@ int __tick_broadcast_oneshot_control(enum tick_broadcast_state state) | |||
801 | * either the CPU handling the broadcast | 809 | * either the CPU handling the broadcast |
802 | * interrupt or we got woken by something else. | 810 | * interrupt or we got woken by something else. |
803 | * | 811 | * |
804 | * We are not longer in the broadcast mask, so | 812 | * We are no longer in the broadcast mask, so |
805 | * if the cpu local expiry time is already | 813 | * if the cpu local expiry time is already |
806 | * reached, we would reprogram the cpu local | 814 | * reached, we would reprogram the cpu local |
807 | * timer with an already expired event. | 815 | * timer with an already expired event. |
808 | * | 816 | * |
809 | * This can lead to a ping-pong when we return | 817 | * This can lead to a ping-pong when we return |
810 | * to idle and therefor rearm the broadcast | 818 | * to idle and therefore rearm the broadcast |
811 | * timer before the cpu local timer was able | 819 | * timer before the cpu local timer was able |
812 | * to fire. This happens because the forced | 820 | * to fire. This happens because the forced |
813 | * reprogramming makes sure that the event | 821 | * reprogramming makes sure that the event |
@@ -950,14 +958,10 @@ void hotplug_cpu__broadcast_tick_pull(int deadcpu) | |||
950 | } | 958 | } |
951 | 959 | ||
952 | /* | 960 | /* |
953 | * Remove a dead CPU from broadcasting | 961 | * Remove a dying CPU from broadcasting |
954 | */ | 962 | */ |
955 | void tick_shutdown_broadcast_oneshot(unsigned int cpu) | 963 | static void tick_broadcast_oneshot_offline(unsigned int cpu) |
956 | { | 964 | { |
957 | unsigned long flags; | ||
958 | |||
959 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); | ||
960 | |||
961 | /* | 965 | /* |
962 | * Clear the broadcast masks for the dead cpu, but do not stop | 966 | * Clear the broadcast masks for the dead cpu, but do not stop |
963 | * the broadcast device! | 967 | * the broadcast device! |
@@ -965,8 +969,6 @@ void tick_shutdown_broadcast_oneshot(unsigned int cpu) | |||
965 | cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); | 969 | cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); |
966 | cpumask_clear_cpu(cpu, tick_broadcast_pending_mask); | 970 | cpumask_clear_cpu(cpu, tick_broadcast_pending_mask); |
967 | cpumask_clear_cpu(cpu, tick_broadcast_force_mask); | 971 | cpumask_clear_cpu(cpu, tick_broadcast_force_mask); |
968 | |||
969 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | ||
970 | } | 972 | } |
971 | #endif | 973 | #endif |
972 | 974 | ||
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 529143b4c8d2..59225b484e4e 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c | |||
@@ -46,6 +46,14 @@ ktime_t tick_period; | |||
46 | * procedure also covers cpu hotplug. | 46 | * procedure also covers cpu hotplug. |
47 | */ | 47 | */ |
48 | int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; | 48 | int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; |
49 | #ifdef CONFIG_NO_HZ_FULL | ||
50 | /* | ||
51 | * tick_do_timer_boot_cpu indicates the boot CPU temporarily owns | ||
52 | * tick_do_timer_cpu and it should be taken over by an eligible secondary | ||
53 | * when one comes online. | ||
54 | */ | ||
55 | static int tick_do_timer_boot_cpu __read_mostly = -1; | ||
56 | #endif | ||
49 | 57 | ||
50 | /* | 58 | /* |
51 | * Debugging: see timer_list.c | 59 | * Debugging: see timer_list.c |
@@ -149,7 +157,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) | |||
149 | !tick_broadcast_oneshot_active()) { | 157 | !tick_broadcast_oneshot_active()) { |
150 | clockevents_switch_state(dev, CLOCK_EVT_STATE_PERIODIC); | 158 | clockevents_switch_state(dev, CLOCK_EVT_STATE_PERIODIC); |
151 | } else { | 159 | } else { |
152 | unsigned long seq; | 160 | unsigned int seq; |
153 | ktime_t next; | 161 | ktime_t next; |
154 | 162 | ||
155 | do { | 163 | do { |
@@ -167,6 +175,26 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) | |||
167 | } | 175 | } |
168 | } | 176 | } |
169 | 177 | ||
178 | #ifdef CONFIG_NO_HZ_FULL | ||
179 | static void giveup_do_timer(void *info) | ||
180 | { | ||
181 | int cpu = *(unsigned int *)info; | ||
182 | |||
183 | WARN_ON(tick_do_timer_cpu != smp_processor_id()); | ||
184 | |||
185 | tick_do_timer_cpu = cpu; | ||
186 | } | ||
187 | |||
188 | static void tick_take_do_timer_from_boot(void) | ||
189 | { | ||
190 | int cpu = smp_processor_id(); | ||
191 | int from = tick_do_timer_boot_cpu; | ||
192 | |||
193 | if (from >= 0 && from != cpu) | ||
194 | smp_call_function_single(from, giveup_do_timer, &cpu, 1); | ||
195 | } | ||
196 | #endif | ||
197 | |||
170 | /* | 198 | /* |
171 | * Setup the tick device | 199 | * Setup the tick device |
172 | */ | 200 | */ |
@@ -186,12 +214,26 @@ static void tick_setup_device(struct tick_device *td, | |||
186 | * this cpu: | 214 | * this cpu: |
187 | */ | 215 | */ |
188 | if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) { | 216 | if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) { |
189 | if (!tick_nohz_full_cpu(cpu)) | 217 | tick_do_timer_cpu = cpu; |
190 | tick_do_timer_cpu = cpu; | 218 | |
191 | else | ||
192 | tick_do_timer_cpu = TICK_DO_TIMER_NONE; | ||
193 | tick_next_period = ktime_get(); | 219 | tick_next_period = ktime_get(); |
194 | tick_period = NSEC_PER_SEC / HZ; | 220 | tick_period = NSEC_PER_SEC / HZ; |
221 | #ifdef CONFIG_NO_HZ_FULL | ||
222 | /* | ||
223 | * The boot CPU may be nohz_full, in which case set | ||
224 | * tick_do_timer_boot_cpu so the first housekeeping | ||
225 | * secondary that comes up will take do_timer from | ||
226 | * us. | ||
227 | */ | ||
228 | if (tick_nohz_full_cpu(cpu)) | ||
229 | tick_do_timer_boot_cpu = cpu; | ||
230 | |||
231 | } else if (tick_do_timer_boot_cpu != -1 && | ||
232 | !tick_nohz_full_cpu(cpu)) { | ||
233 | tick_take_do_timer_from_boot(); | ||
234 | tick_do_timer_boot_cpu = -1; | ||
235 | WARN_ON(tick_do_timer_cpu != cpu); | ||
236 | #endif | ||
195 | } | 237 | } |
196 | 238 | ||
197 | /* | 239 | /* |
@@ -487,6 +529,7 @@ void tick_freeze(void) | |||
487 | trace_suspend_resume(TPS("timekeeping_freeze"), | 529 | trace_suspend_resume(TPS("timekeeping_freeze"), |
488 | smp_processor_id(), true); | 530 | smp_processor_id(), true); |
489 | system_state = SYSTEM_SUSPEND; | 531 | system_state = SYSTEM_SUSPEND; |
532 | sched_clock_suspend(); | ||
490 | timekeeping_suspend(); | 533 | timekeeping_suspend(); |
491 | } else { | 534 | } else { |
492 | tick_suspend_local(); | 535 | tick_suspend_local(); |
@@ -510,6 +553,7 @@ void tick_unfreeze(void) | |||
510 | 553 | ||
511 | if (tick_freeze_depth == num_online_cpus()) { | 554 | if (tick_freeze_depth == num_online_cpus()) { |
512 | timekeeping_resume(); | 555 | timekeeping_resume(); |
556 | sched_clock_resume(); | ||
513 | system_state = SYSTEM_RUNNING; | 557 | system_state = SYSTEM_RUNNING; |
514 | trace_suspend_resume(TPS("timekeeping_freeze"), | 558 | trace_suspend_resume(TPS("timekeeping_freeze"), |
515 | smp_processor_id(), false); | 559 | smp_processor_id(), false); |
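Taken together, the tick-common.c changes allow the boot CPU itself to appear in nohz_full=: it keeps the do_timer duty only until the first housekeeping secondary comes online and takes it over via the smp_call_function_single() handover above, and tick_freeze()/tick_unfreeze() now bracket timekeeping with sched_clock_suspend()/sched_clock_resume(). A hedged example command line this makes acceptable on a 4-CPU machine, assuming CONFIG_PM_SLEEP_SMP_NONZERO_CPU is set (or CONFIG_PM_SLEEP_SMP is off); the CPU numbering is purely illustrative:

	nohz_full=0-2

CPU 3 then remains the housekeeping CPU and ends up owning tick_do_timer_cpu once it boots.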
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index e277284c2831..7b2496136729 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h | |||
@@ -64,7 +64,6 @@ extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); | |||
64 | extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); | 64 | extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); |
65 | extern void tick_install_broadcast_device(struct clock_event_device *dev); | 65 | extern void tick_install_broadcast_device(struct clock_event_device *dev); |
66 | extern int tick_is_broadcast_device(struct clock_event_device *dev); | 66 | extern int tick_is_broadcast_device(struct clock_event_device *dev); |
67 | extern void tick_shutdown_broadcast(unsigned int cpu); | ||
68 | extern void tick_suspend_broadcast(void); | 67 | extern void tick_suspend_broadcast(void); |
69 | extern void tick_resume_broadcast(void); | 68 | extern void tick_resume_broadcast(void); |
70 | extern bool tick_resume_check_broadcast(void); | 69 | extern bool tick_resume_check_broadcast(void); |
@@ -78,7 +77,6 @@ static inline void tick_install_broadcast_device(struct clock_event_device *dev) | |||
78 | static inline int tick_is_broadcast_device(struct clock_event_device *dev) { return 0; } | 77 | static inline int tick_is_broadcast_device(struct clock_event_device *dev) { return 0; } |
79 | static inline int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) { return 0; } | 78 | static inline int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) { return 0; } |
80 | static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { } | 79 | static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { } |
81 | static inline void tick_shutdown_broadcast(unsigned int cpu) { } | ||
82 | static inline void tick_suspend_broadcast(void) { } | 80 | static inline void tick_suspend_broadcast(void) { } |
83 | static inline void tick_resume_broadcast(void) { } | 81 | static inline void tick_resume_broadcast(void) { } |
84 | static inline bool tick_resume_check_broadcast(void) { return false; } | 82 | static inline bool tick_resume_check_broadcast(void) { return false; } |
@@ -128,19 +126,23 @@ static inline int tick_check_oneshot_change(int allow_nohz) { return 0; } | |||
128 | /* Functions related to oneshot broadcasting */ | 126 | /* Functions related to oneshot broadcasting */ |
129 | #if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT) | 127 | #if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT) |
130 | extern void tick_broadcast_switch_to_oneshot(void); | 128 | extern void tick_broadcast_switch_to_oneshot(void); |
131 | extern void tick_shutdown_broadcast_oneshot(unsigned int cpu); | ||
132 | extern int tick_broadcast_oneshot_active(void); | 129 | extern int tick_broadcast_oneshot_active(void); |
133 | extern void tick_check_oneshot_broadcast_this_cpu(void); | 130 | extern void tick_check_oneshot_broadcast_this_cpu(void); |
134 | bool tick_broadcast_oneshot_available(void); | 131 | bool tick_broadcast_oneshot_available(void); |
135 | extern struct cpumask *tick_get_broadcast_oneshot_mask(void); | 132 | extern struct cpumask *tick_get_broadcast_oneshot_mask(void); |
136 | #else /* !(BROADCAST && ONESHOT): */ | 133 | #else /* !(BROADCAST && ONESHOT): */ |
137 | static inline void tick_broadcast_switch_to_oneshot(void) { } | 134 | static inline void tick_broadcast_switch_to_oneshot(void) { } |
138 | static inline void tick_shutdown_broadcast_oneshot(unsigned int cpu) { } | ||
139 | static inline int tick_broadcast_oneshot_active(void) { return 0; } | 135 | static inline int tick_broadcast_oneshot_active(void) { return 0; } |
140 | static inline void tick_check_oneshot_broadcast_this_cpu(void) { } | 136 | static inline void tick_check_oneshot_broadcast_this_cpu(void) { } |
141 | static inline bool tick_broadcast_oneshot_available(void) { return tick_oneshot_possible(); } | 137 | static inline bool tick_broadcast_oneshot_available(void) { return tick_oneshot_possible(); } |
142 | #endif /* !(BROADCAST && ONESHOT) */ | 138 | #endif /* !(BROADCAST && ONESHOT) */ |
143 | 139 | ||
140 | #if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_HOTPLUG_CPU) | ||
141 | extern void tick_broadcast_offline(unsigned int cpu); | ||
142 | #else | ||
143 | static inline void tick_broadcast_offline(unsigned int cpu) { } | ||
144 | #endif | ||
145 | |||
144 | /* NO_HZ_FULL internal */ | 146 | /* NO_HZ_FULL internal */ |
145 | #ifdef CONFIG_NO_HZ_FULL | 147 | #ifdef CONFIG_NO_HZ_FULL |
146 | extern void tick_nohz_init(void); | 148 | extern void tick_nohz_init(void); |
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 6fa52cd6df0b..f4ee1a3428ae 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -121,10 +121,16 @@ static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) | |||
121 | * into a long sleep. If two CPUs happen to assign themselves to | 121 | * into a long sleep. If two CPUs happen to assign themselves to |
122 | * this duty, then the jiffies update is still serialized by | 122 | * this duty, then the jiffies update is still serialized by |
123 | * jiffies_lock. | 123 | * jiffies_lock. |
124 | * | ||
125 | * If nohz_full is enabled, this should not happen because the | ||
126 | * tick_do_timer_cpu never relinquishes. | ||
124 | */ | 127 | */ |
125 | if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE) | 128 | if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) { |
126 | && !tick_nohz_full_cpu(cpu)) | 129 | #ifdef CONFIG_NO_HZ_FULL |
130 | WARN_ON(tick_nohz_full_running); | ||
131 | #endif | ||
127 | tick_do_timer_cpu = cpu; | 132 | tick_do_timer_cpu = cpu; |
133 | } | ||
128 | #endif | 134 | #endif |
129 | 135 | ||
130 | /* Check, if the jiffies need an update */ | 136 | /* Check, if the jiffies need an update */ |
@@ -395,8 +401,8 @@ void __init tick_nohz_full_setup(cpumask_var_t cpumask) | |||
395 | static int tick_nohz_cpu_down(unsigned int cpu) | 401 | static int tick_nohz_cpu_down(unsigned int cpu) |
396 | { | 402 | { |
397 | /* | 403 | /* |
398 | * The boot CPU handles housekeeping duty (unbound timers, | 404 | * The tick_do_timer_cpu CPU handles housekeeping duty (unbound |
399 | * workqueues, timekeeping, ...) on behalf of full dynticks | 405 | * timers, workqueues, timekeeping, ...) on behalf of full dynticks |
400 | * CPUs. It must remain online when nohz full is enabled. | 406 | * CPUs. It must remain online when nohz full is enabled. |
401 | */ | 407 | */ |
402 | if (tick_nohz_full_running && tick_do_timer_cpu == cpu) | 408 | if (tick_nohz_full_running && tick_do_timer_cpu == cpu) |
@@ -423,12 +429,15 @@ void __init tick_nohz_init(void) | |||
423 | return; | 429 | return; |
424 | } | 430 | } |
425 | 431 | ||
426 | cpu = smp_processor_id(); | 432 | if (IS_ENABLED(CONFIG_PM_SLEEP_SMP) && |
433 | !IS_ENABLED(CONFIG_PM_SLEEP_SMP_NONZERO_CPU)) { | ||
434 | cpu = smp_processor_id(); | ||
427 | 435 | ||
428 | if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { | 436 | if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { |
429 | pr_warn("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", | 437 | pr_warn("NO_HZ: Clearing %d from nohz_full range " |
430 | cpu); | 438 | "for timekeeping\n", cpu); |
431 | cpumask_clear_cpu(cpu, tick_nohz_full_mask); | 439 | cpumask_clear_cpu(cpu, tick_nohz_full_mask); |
440 | } | ||
432 | } | 441 | } |
433 | 442 | ||
434 | for_each_cpu(cpu, tick_nohz_full_mask) | 443 | for_each_cpu(cpu, tick_nohz_full_mask) |
@@ -645,7 +654,8 @@ static inline bool local_timer_softirq_pending(void) | |||
645 | static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) | 654 | static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) |
646 | { | 655 | { |
647 | u64 basemono, next_tick, next_tmr, next_rcu, delta, expires; | 656 | u64 basemono, next_tick, next_tmr, next_rcu, delta, expires; |
648 | unsigned long seq, basejiff; | 657 | unsigned long basejiff; |
658 | unsigned int seq; | ||
649 | 659 | ||
650 | /* Read jiffies and the time when jiffies were updated last */ | 660 | /* Read jiffies and the time when jiffies were updated last */ |
651 | do { | 661 | do { |
@@ -904,8 +914,13 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) | |||
904 | /* | 914 | /* |
905 | * Boot safety: make sure the timekeeping duty has been | 915 | * Boot safety: make sure the timekeeping duty has been |
906 | * assigned before entering dyntick-idle mode, | 916 | * assigned before entering dyntick-idle mode, |
917 | * i.e. while tick_do_timer_cpu is still TICK_DO_TIMER_BOOT | ||
907 | */ | 918 | */ |
908 | if (tick_do_timer_cpu == TICK_DO_TIMER_NONE) | 919 | if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_BOOT)) |
920 | return false; | ||
921 | |||
922 | /* Should not happen for nohz-full */ | ||
923 | if (WARN_ON_ONCE(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) | ||
909 | return false; | 924 | return false; |
910 | } | 925 | } |
911 | 926 | ||
@@ -1023,6 +1038,18 @@ bool tick_nohz_idle_got_tick(void) | |||
1023 | } | 1038 | } |
1024 | 1039 | ||
1025 | /** | 1040 | /** |
1041 | * tick_nohz_get_next_hrtimer - return the next expiration time for the hrtimer | ||
1042 | * or the tick, whatever that expires first. Note that, if the tick has been | ||
1043 | * stopped, it returns the next hrtimer. | ||
1044 | * | ||
1045 | * Called from power state control code with interrupts disabled | ||
1046 | */ | ||
1047 | ktime_t tick_nohz_get_next_hrtimer(void) | ||
1048 | { | ||
1049 | return __this_cpu_read(tick_cpu_device.evtdev)->next_event; | ||
1050 | } | ||
1051 | |||
1052 | /** | ||
1026 | * tick_nohz_get_sleep_length - return the expected length of the current sleep | 1053 | * tick_nohz_get_sleep_length - return the expected length of the current sleep |
1027 | * @delta_next: duration until the next event if the tick cannot be stopped | 1054 | * @delta_next: duration until the next event if the tick cannot be stopped |
1028 | * | 1055 | * |
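tick_nohz_get_next_hrtimer() exposes the per-CPU clockevent's next_event so idle-governor code can see the next wakeup even after the tick has been stopped. A hedged sketch of a consumer, assuming the declaration lands in linux/tick.h alongside the other tick_nohz helpers; the helper name and threshold are hypothetical, and it must run with interrupts disabled as the comment says:

#include <linux/ktime.h>
#include <linux/tick.h>
#include <linux/types.h>

/* Hypothetical helper: is the next wakeup at least @min_ns away? */
static bool demo_sleep_long_enough(s64 min_ns)
{
	ktime_t next = tick_nohz_get_next_hrtimer();

	return ktime_to_ns(ktime_sub(next, ktime_get())) >= min_ns;
}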
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h index 6de959a854b2..4fb06527cf64 100644 --- a/kernel/time/tick-sched.h +++ b/kernel/time/tick-sched.h | |||
@@ -24,12 +24,19 @@ enum tick_nohz_mode { | |||
24 | * struct tick_sched - sched tick emulation and no idle tick control/stats | 24 | * struct tick_sched - sched tick emulation and no idle tick control/stats |
25 | * @sched_timer: hrtimer to schedule the periodic tick in high | 25 | * @sched_timer: hrtimer to schedule the periodic tick in high |
26 | * resolution mode | 26 | * resolution mode |
27 | * @check_clocks: Notification mechanism about clocksource changes | ||
28 | * @nohz_mode: Mode - one state of tick_nohz_mode | ||
29 | * @inidle: Indicator that the CPU is in the tick idle mode | ||
30 | * @tick_stopped: Indicator that the idle tick has been stopped | ||
31 | * @idle_active: Indicator that the CPU is actively in the tick idle mode; | ||
32 | * it is reset during irq handling phases. | ||
33 | * @do_timer_lst: CPU was the last one doing do_timer before going idle | ||
34 | * @got_idle_tick: Tick timer function has run with @inidle set | ||
27 | * @last_tick: Store the last tick expiry time when the tick | 35 | * @last_tick: Store the last tick expiry time when the tick |
28 | * timer is modified for nohz sleeps. This is necessary | 36 | * timer is modified for nohz sleeps. This is necessary |
29 | * to resume the tick timer operation in the timeline | 37 | * to resume the tick timer operation in the timeline |
30 | * when the CPU returns from nohz sleep. | 38 | * when the CPU returns from nohz sleep. |
31 | * @next_tick: Next tick to be fired when in dynticks mode. | 39 | * @next_tick: Next tick to be fired when in dynticks mode. |
32 | * @tick_stopped: Indicator that the idle tick has been stopped | ||
33 | * @idle_jiffies: jiffies at the entry to idle for idle time accounting | 40 | * @idle_jiffies: jiffies at the entry to idle for idle time accounting |
34 | * @idle_calls: Total number of idle calls | 41 | * @idle_calls: Total number of idle calls |
35 | * @idle_sleeps: Number of idle calls, where the sched tick was stopped | 42 | * @idle_sleeps: Number of idle calls, where the sched tick was stopped |
@@ -40,8 +47,8 @@ enum tick_nohz_mode { | |||
40 | * @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding | 47 | * @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding |
41 | * @timer_expires: Anticipated timer expiration time (in case sched tick is stopped) | 48 | * @timer_expires: Anticipated timer expiration time (in case sched tick is stopped) |
42 | * @timer_expires_base: Base time clock monotonic for @timer_expires | 49 | * @timer_expires_base: Base time clock monotonic for @timer_expires |
43 | * @do_timer_lst: CPU was the last one doing do_timer before going idle | 50 | * @next_timer: Expiry time of next expiring timer for debugging purpose only |
44 | * @got_idle_tick: Tick timer function has run with @inidle set | 51 | * @tick_dep_mask: Tick dependency mask - is set, if someone needs the tick |
45 | */ | 52 | */ |
46 | struct tick_sched { | 53 | struct tick_sched { |
47 | struct hrtimer sched_timer; | 54 | struct hrtimer sched_timer; |
diff --git a/kernel/time/time.c b/kernel/time/time.c index c3f756f8534b..86656bbac232 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c | |||
@@ -171,7 +171,7 @@ int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz | |||
171 | static int firsttime = 1; | 171 | static int firsttime = 1; |
172 | int error = 0; | 172 | int error = 0; |
173 | 173 | ||
174 | if (tv && !timespec64_valid(tv)) | 174 | if (tv && !timespec64_valid_settod(tv)) |
175 | return -EINVAL; | 175 | return -EINVAL; |
176 | 176 | ||
177 | error = security_settime64(tv, tz); | 177 | error = security_settime64(tv, tz); |
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index f986e1918d12..5716e28bfa3c 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
@@ -720,7 +720,7 @@ static void timekeeping_forward_now(struct timekeeper *tk) | |||
720 | void ktime_get_real_ts64(struct timespec64 *ts) | 720 | void ktime_get_real_ts64(struct timespec64 *ts) |
721 | { | 721 | { |
722 | struct timekeeper *tk = &tk_core.timekeeper; | 722 | struct timekeeper *tk = &tk_core.timekeeper; |
723 | unsigned long seq; | 723 | unsigned int seq; |
724 | u64 nsecs; | 724 | u64 nsecs; |
725 | 725 | ||
726 | WARN_ON(timekeeping_suspended); | 726 | WARN_ON(timekeeping_suspended); |
@@ -829,7 +829,7 @@ EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset); | |||
829 | ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs) | 829 | ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs) |
830 | { | 830 | { |
831 | ktime_t *offset = offsets[offs]; | 831 | ktime_t *offset = offsets[offs]; |
832 | unsigned long seq; | 832 | unsigned int seq; |
833 | ktime_t tconv; | 833 | ktime_t tconv; |
834 | 834 | ||
835 | do { | 835 | do { |
@@ -960,7 +960,7 @@ time64_t __ktime_get_real_seconds(void) | |||
960 | void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) | 960 | void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) |
961 | { | 961 | { |
962 | struct timekeeper *tk = &tk_core.timekeeper; | 962 | struct timekeeper *tk = &tk_core.timekeeper; |
963 | unsigned long seq; | 963 | unsigned int seq; |
964 | ktime_t base_raw; | 964 | ktime_t base_raw; |
965 | ktime_t base_real; | 965 | ktime_t base_real; |
966 | u64 nsec_raw; | 966 | u64 nsec_raw; |
@@ -1122,7 +1122,7 @@ int get_device_system_crosststamp(int (*get_time_fn) | |||
1122 | ktime_t base_real, base_raw; | 1122 | ktime_t base_real, base_raw; |
1123 | u64 nsec_real, nsec_raw; | 1123 | u64 nsec_real, nsec_raw; |
1124 | u8 cs_was_changed_seq; | 1124 | u8 cs_was_changed_seq; |
1125 | unsigned long seq; | 1125 | unsigned int seq; |
1126 | bool do_interp; | 1126 | bool do_interp; |
1127 | int ret; | 1127 | int ret; |
1128 | 1128 | ||
@@ -1221,7 +1221,7 @@ int do_settimeofday64(const struct timespec64 *ts) | |||
1221 | unsigned long flags; | 1221 | unsigned long flags; |
1222 | int ret = 0; | 1222 | int ret = 0; |
1223 | 1223 | ||
1224 | if (!timespec64_valid_strict(ts)) | 1224 | if (!timespec64_valid_settod(ts)) |
1225 | return -EINVAL; | 1225 | return -EINVAL; |
1226 | 1226 | ||
1227 | raw_spin_lock_irqsave(&timekeeper_lock, flags); | 1227 | raw_spin_lock_irqsave(&timekeeper_lock, flags); |
@@ -1278,7 +1278,7 @@ static int timekeeping_inject_offset(const struct timespec64 *ts) | |||
1278 | /* Make sure the proposed value is valid */ | 1278 | /* Make sure the proposed value is valid */ |
1279 | tmp = timespec64_add(tk_xtime(tk), *ts); | 1279 | tmp = timespec64_add(tk_xtime(tk), *ts); |
1280 | if (timespec64_compare(&tk->wall_to_monotonic, ts) > 0 || | 1280 | if (timespec64_compare(&tk->wall_to_monotonic, ts) > 0 || |
1281 | !timespec64_valid_strict(&tmp)) { | 1281 | !timespec64_valid_settod(&tmp)) { |
1282 | ret = -EINVAL; | 1282 | ret = -EINVAL; |
1283 | goto error; | 1283 | goto error; |
1284 | } | 1284 | } |
@@ -1409,7 +1409,7 @@ int timekeeping_notify(struct clocksource *clock) | |||
1409 | void ktime_get_raw_ts64(struct timespec64 *ts) | 1409 | void ktime_get_raw_ts64(struct timespec64 *ts) |
1410 | { | 1410 | { |
1411 | struct timekeeper *tk = &tk_core.timekeeper; | 1411 | struct timekeeper *tk = &tk_core.timekeeper; |
1412 | unsigned long seq; | 1412 | unsigned int seq; |
1413 | u64 nsecs; | 1413 | u64 nsecs; |
1414 | 1414 | ||
1415 | do { | 1415 | do { |
@@ -1431,7 +1431,7 @@ EXPORT_SYMBOL(ktime_get_raw_ts64); | |||
1431 | int timekeeping_valid_for_hres(void) | 1431 | int timekeeping_valid_for_hres(void) |
1432 | { | 1432 | { |
1433 | struct timekeeper *tk = &tk_core.timekeeper; | 1433 | struct timekeeper *tk = &tk_core.timekeeper; |
1434 | unsigned long seq; | 1434 | unsigned int seq; |
1435 | int ret; | 1435 | int ret; |
1436 | 1436 | ||
1437 | do { | 1437 | do { |
@@ -1450,7 +1450,7 @@ int timekeeping_valid_for_hres(void) | |||
1450 | u64 timekeeping_max_deferment(void) | 1450 | u64 timekeeping_max_deferment(void) |
1451 | { | 1451 | { |
1452 | struct timekeeper *tk = &tk_core.timekeeper; | 1452 | struct timekeeper *tk = &tk_core.timekeeper; |
1453 | unsigned long seq; | 1453 | unsigned int seq; |
1454 | u64 ret; | 1454 | u64 ret; |
1455 | 1455 | ||
1456 | do { | 1456 | do { |
@@ -1527,7 +1527,7 @@ void __init timekeeping_init(void) | |||
1527 | unsigned long flags; | 1527 | unsigned long flags; |
1528 | 1528 | ||
1529 | read_persistent_wall_and_boot_offset(&wall_time, &boot_offset); | 1529 | read_persistent_wall_and_boot_offset(&wall_time, &boot_offset); |
1530 | if (timespec64_valid_strict(&wall_time) && | 1530 | if (timespec64_valid_settod(&wall_time) && |
1531 | timespec64_to_ns(&wall_time) > 0) { | 1531 | timespec64_to_ns(&wall_time) > 0) { |
1532 | persistent_clock_exists = true; | 1532 | persistent_clock_exists = true; |
1533 | } else if (timespec64_to_ns(&wall_time) != 0) { | 1533 | } else if (timespec64_to_ns(&wall_time) != 0) { |
@@ -2150,7 +2150,7 @@ EXPORT_SYMBOL_GPL(getboottime64); | |||
2150 | void ktime_get_coarse_real_ts64(struct timespec64 *ts) | 2150 | void ktime_get_coarse_real_ts64(struct timespec64 *ts) |
2151 | { | 2151 | { |
2152 | struct timekeeper *tk = &tk_core.timekeeper; | 2152 | struct timekeeper *tk = &tk_core.timekeeper; |
2153 | unsigned long seq; | 2153 | unsigned int seq; |
2154 | 2154 | ||
2155 | do { | 2155 | do { |
2156 | seq = read_seqcount_begin(&tk_core.seq); | 2156 | seq = read_seqcount_begin(&tk_core.seq); |
@@ -2164,7 +2164,7 @@ void ktime_get_coarse_ts64(struct timespec64 *ts) | |||
2164 | { | 2164 | { |
2165 | struct timekeeper *tk = &tk_core.timekeeper; | 2165 | struct timekeeper *tk = &tk_core.timekeeper; |
2166 | struct timespec64 now, mono; | 2166 | struct timespec64 now, mono; |
2167 | unsigned long seq; | 2167 | unsigned int seq; |
2168 | 2168 | ||
2169 | do { | 2169 | do { |
2170 | seq = read_seqcount_begin(&tk_core.seq); | 2170 | seq = read_seqcount_begin(&tk_core.seq); |
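The timekeeping readers use a raw seqcount_t rather than a seqlock_t, but the reasoning behind unsigned int seq is the same as in jiffies.c above. A hedged, generic sketch of that read side (demo names only):

#include <linux/seqlock.h>
#include <linux/types.h>

static seqcount_t demo_seq = SEQCNT_ZERO(demo_seq);
static u64 demo_ns;

static u64 demo_get_ns(void)
{
	unsigned int seq;
	u64 ns;

	do {
		seq = read_seqcount_begin(&demo_seq);
		ns = demo_ns;
	} while (read_seqcount_retry(&demo_seq, seq));

	return ns;
}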
diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h index 7a9b4eb7a1d5..141ab3ab0354 100644 --- a/kernel/time/timekeeping.h +++ b/kernel/time/timekeeping.h | |||
@@ -14,6 +14,13 @@ extern u64 timekeeping_max_deferment(void); | |||
14 | extern void timekeeping_warp_clock(void); | 14 | extern void timekeeping_warp_clock(void); |
15 | extern int timekeeping_suspend(void); | 15 | extern int timekeeping_suspend(void); |
16 | extern void timekeeping_resume(void); | 16 | extern void timekeeping_resume(void); |
17 | #ifdef CONFIG_GENERIC_SCHED_CLOCK | ||
18 | extern int sched_clock_suspend(void); | ||
19 | extern void sched_clock_resume(void); | ||
20 | #else | ||
21 | static inline int sched_clock_suspend(void) { return 0; } | ||
22 | static inline void sched_clock_resume(void) { } | ||
23 | #endif | ||
17 | 24 | ||
18 | extern void do_timer(unsigned long ticks); | 25 | extern void do_timer(unsigned long ticks); |
19 | extern void update_wall_time(void); | 26 | extern void update_wall_time(void); |
diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 2fce056f8a49..343c7ba33b1c 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c | |||
@@ -536,6 +536,8 @@ static void enqueue_timer(struct timer_base *base, struct timer_list *timer, | |||
536 | hlist_add_head(&timer->entry, base->vectors + idx); | 536 | hlist_add_head(&timer->entry, base->vectors + idx); |
537 | __set_bit(idx, base->pending_map); | 537 | __set_bit(idx, base->pending_map); |
538 | timer_set_idx(timer, idx); | 538 | timer_set_idx(timer, idx); |
539 | |||
540 | trace_timer_start(timer, timer->expires, timer->flags); | ||
539 | } | 541 | } |
540 | 542 | ||
541 | static void | 543 | static void |
@@ -757,13 +759,6 @@ static inline void debug_init(struct timer_list *timer) | |||
757 | trace_timer_init(timer); | 759 | trace_timer_init(timer); |
758 | } | 760 | } |
759 | 761 | ||
760 | static inline void | ||
761 | debug_activate(struct timer_list *timer, unsigned long expires) | ||
762 | { | ||
763 | debug_timer_activate(timer); | ||
764 | trace_timer_start(timer, expires, timer->flags); | ||
765 | } | ||
766 | |||
767 | static inline void debug_deactivate(struct timer_list *timer) | 762 | static inline void debug_deactivate(struct timer_list *timer) |
768 | { | 763 | { |
769 | debug_timer_deactivate(timer); | 764 | debug_timer_deactivate(timer); |
@@ -1037,7 +1032,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option | |||
1037 | } | 1032 | } |
1038 | } | 1033 | } |
1039 | 1034 | ||
1040 | debug_activate(timer, expires); | 1035 | debug_timer_activate(timer); |
1041 | 1036 | ||
1042 | timer->expires = expires; | 1037 | timer->expires = expires; |
1043 | /* | 1038 | /* |
@@ -1171,7 +1166,7 @@ void add_timer_on(struct timer_list *timer, int cpu) | |||
1171 | } | 1166 | } |
1172 | forward_timer_base(base); | 1167 | forward_timer_base(base); |
1173 | 1168 | ||
1174 | debug_activate(timer, timer->expires); | 1169 | debug_timer_activate(timer); |
1175 | internal_add_timer(base, timer); | 1170 | internal_add_timer(base, timer); |
1176 | raw_spin_unlock_irqrestore(&base->lock, flags); | 1171 | raw_spin_unlock_irqrestore(&base->lock, flags); |
1177 | } | 1172 | } |
@@ -1298,7 +1293,9 @@ int del_timer_sync(struct timer_list *timer) | |||
1298 | EXPORT_SYMBOL(del_timer_sync); | 1293 | EXPORT_SYMBOL(del_timer_sync); |
1299 | #endif | 1294 | #endif |
1300 | 1295 | ||
1301 | static void call_timer_fn(struct timer_list *timer, void (*fn)(struct timer_list *)) | 1296 | static void call_timer_fn(struct timer_list *timer, |
1297 | void (*fn)(struct timer_list *), | ||
1298 | unsigned long baseclk) | ||
1302 | { | 1299 | { |
1303 | int count = preempt_count(); | 1300 | int count = preempt_count(); |
1304 | 1301 | ||
@@ -1321,14 +1318,14 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(struct timer_list | |||
1321 | */ | 1318 | */ |
1322 | lock_map_acquire(&lockdep_map); | 1319 | lock_map_acquire(&lockdep_map); |
1323 | 1320 | ||
1324 | trace_timer_expire_entry(timer); | 1321 | trace_timer_expire_entry(timer, baseclk); |
1325 | fn(timer); | 1322 | fn(timer); |
1326 | trace_timer_expire_exit(timer); | 1323 | trace_timer_expire_exit(timer); |
1327 | 1324 | ||
1328 | lock_map_release(&lockdep_map); | 1325 | lock_map_release(&lockdep_map); |
1329 | 1326 | ||
1330 | if (count != preempt_count()) { | 1327 | if (count != preempt_count()) { |
1331 | WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n", | 1328 | WARN_ONCE(1, "timer: %pS preempt leak: %08x -> %08x\n", |
1332 | fn, count, preempt_count()); | 1329 | fn, count, preempt_count()); |
1333 | /* | 1330 | /* |
1334 | * Restore the preempt count. That gives us a decent | 1331 | * Restore the preempt count. That gives us a decent |
@@ -1342,6 +1339,13 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(struct timer_list | |||
1342 | 1339 | ||
1343 | static void expire_timers(struct timer_base *base, struct hlist_head *head) | 1340 | static void expire_timers(struct timer_base *base, struct hlist_head *head) |
1344 | { | 1341 | { |
1342 | /* | ||
1343 | * This value is required only for tracing. base->clk was | ||
1344 | * incremented directly before expire_timers was called. But expiry | ||
1345 | * is related to the old base->clk value. | ||
1346 | */ | ||
1347 | unsigned long baseclk = base->clk - 1; | ||
1348 | |||
1345 | while (!hlist_empty(head)) { | 1349 | while (!hlist_empty(head)) { |
1346 | struct timer_list *timer; | 1350 | struct timer_list *timer; |
1347 | void (*fn)(struct timer_list *); | 1351 | void (*fn)(struct timer_list *); |
@@ -1355,11 +1359,11 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head) | |||
1355 | 1359 | ||
1356 | if (timer->flags & TIMER_IRQSAFE) { | 1360 | if (timer->flags & TIMER_IRQSAFE) { |
1357 | raw_spin_unlock(&base->lock); | 1361 | raw_spin_unlock(&base->lock); |
1358 | call_timer_fn(timer, fn); | 1362 | call_timer_fn(timer, fn, baseclk); |
1359 | raw_spin_lock(&base->lock); | 1363 | raw_spin_lock(&base->lock); |
1360 | } else { | 1364 | } else { |
1361 | raw_spin_unlock_irq(&base->lock); | 1365 | raw_spin_unlock_irq(&base->lock); |
1362 | call_timer_fn(timer, fn); | 1366 | call_timer_fn(timer, fn, baseclk); |
1363 | raw_spin_lock_irq(&base->lock); | 1367 | raw_spin_lock_irq(&base->lock); |
1364 | } | 1368 | } |
1365 | } | 1369 | } |
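The timer.c change moves trace_timer_start() into enqueue_timer() and threads a baseclk value down into call_timer_fn(), so the expiry tracepoint can report the clock the expiry actually relates to; as the new comment notes, base->clk has already been advanced before expire_timers() runs. A minimal sketch of that "capture the pre-increment clock and pass it through to the callback site" shape, with illustrative names only:

#include <stdio.h>

struct simple_timer {
	unsigned long expires;
	void (*fn)(struct simple_timer *);
};

static void trace_expire_entry(struct simple_timer *t, unsigned long baseclk)
{
	printf("timer %p expires=%lu baseclk=%lu\n", (void *)t, t->expires, baseclk);
}

static void run_timer(struct simple_timer *t, unsigned long baseclk)
{
	trace_expire_entry(t, baseclk);		/* report against the old clock */
	t->fn(t);
}

static void expire_pending(struct simple_timer **head, int n, unsigned long clk)
{
	/* The caller advanced clk past the expired bucket already. */
	unsigned long baseclk = clk - 1;

	for (int i = 0; i < n; i++)
		run_timer(head[i], baseclk);
}

static void say_hi(struct simple_timer *t) { (void)t; puts("hi"); }

int main(void)
{
	struct simple_timer t = { .expires = 41, .fn = say_hi };
	struct simple_timer *pending[] = { &t };

	expire_pending(pending, 1, 42);		/* 42: clock just incremented */
	return 0;
}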
diff --git a/kernel/torture.c b/kernel/torture.c index 8faa1a9aaeb9..17b2be9bde12 100644 --- a/kernel/torture.c +++ b/kernel/torture.c | |||
@@ -88,6 +88,8 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes, | |||
88 | 88 | ||
89 | if (!cpu_online(cpu) || !cpu_is_hotpluggable(cpu)) | 89 | if (!cpu_online(cpu) || !cpu_is_hotpluggable(cpu)) |
90 | return false; | 90 | return false; |
91 | if (num_online_cpus() <= 1) | ||
92 | return false; /* Can't offline the last CPU. */ | ||
91 | 93 | ||
92 | if (verbose > 1) | 94 | if (verbose > 1) |
93 | pr_alert("%s" TORTURE_FLAG | 95 | pr_alert("%s" TORTURE_FLAG |
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index d64c00afceb5..94b0e37d90ef 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c | |||
@@ -14,6 +14,8 @@ | |||
14 | #include <linux/syscalls.h> | 14 | #include <linux/syscalls.h> |
15 | #include <linux/error-injection.h> | 15 | #include <linux/error-injection.h> |
16 | 16 | ||
17 | #include <asm/tlb.h> | ||
18 | |||
17 | #include "trace_probe.h" | 19 | #include "trace_probe.h" |
18 | #include "trace.h" | 20 | #include "trace.h" |
19 | 21 | ||
@@ -163,6 +165,10 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src, | |||
163 | * access_ok() should prevent writing to non-user memory, but in | 165 | * access_ok() should prevent writing to non-user memory, but in |
164 | * some situations (nommu, temporary switch, etc) access_ok() does | 166 | * some situations (nommu, temporary switch, etc) access_ok() does |
165 | * not provide enough validation, hence the check on KERNEL_DS. | 167 | * not provide enough validation, hence the check on KERNEL_DS. |
168 | * | ||
169 | * nmi_uaccess_okay() ensures the probe is not run in an interim | ||
170 | * state, when the task or mm are switched. This is specifically | ||
171 | * required to prevent the use of temporary mm. | ||
166 | */ | 172 | */ |
167 | 173 | ||
168 | if (unlikely(in_interrupt() || | 174 | if (unlikely(in_interrupt() || |
@@ -170,6 +176,8 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src, | |||
170 | return -EPERM; | 176 | return -EPERM; |
171 | if (unlikely(uaccess_kernel())) | 177 | if (unlikely(uaccess_kernel())) |
172 | return -EPERM; | 178 | return -EPERM; |
179 | if (unlikely(!nmi_uaccess_okay())) | ||
180 | return -EPERM; | ||
173 | if (!access_ok(unsafe_ptr, size)) | 181 | if (!access_ok(unsafe_ptr, size)) |
174 | return -EPERM; | 182 | return -EPERM; |
175 | 183 | ||
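The new nmi_uaccess_okay() test slots in as one more early-reject guard before the user-memory write is attempted, covering the window where the task or mm is being switched. A hedged userspace sketch of that layered guard structure; the three predicates are stand-ins for the kernel checks, not real APIs:

#include <errno.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>

static bool in_interrupt_ctx;		/* stand-in for in_interrupt() */
static bool kernel_addr_limit;		/* stand-in for uaccess_kernel() */
static bool mm_switch_in_progress;	/* stand-in for !nmi_uaccess_okay() */

static int probe_write_user(void *dst, const void *src, size_t size)
{
	if (in_interrupt_ctx)
		return -EPERM;
	if (kernel_addr_limit)
		return -EPERM;
	if (mm_switch_in_progress)	/* new check: a temporary mm is live */
		return -EPERM;
	memcpy(dst, src, size);		/* the real code uses a faulting copy */
	return 0;
}

int main(void)
{
	char buf[8] = { 0 };

	mm_switch_in_progress = true;
	printf("blocked: %d\n", probe_write_user(buf, "x", 1));

	mm_switch_in_progress = false;
	printf("ok: %d\n", probe_write_user(buf, "x", 1));
	return 0;
}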
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index fa79323331b2..b920358dd8f7 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
@@ -33,6 +33,7 @@ | |||
33 | #include <linux/list.h> | 33 | #include <linux/list.h> |
34 | #include <linux/hash.h> | 34 | #include <linux/hash.h> |
35 | #include <linux/rcupdate.h> | 35 | #include <linux/rcupdate.h> |
36 | #include <linux/kprobes.h> | ||
36 | 37 | ||
37 | #include <trace/events/sched.h> | 38 | #include <trace/events/sched.h> |
38 | 39 | ||
@@ -1992,7 +1993,7 @@ static void print_bug_type(void) | |||
1992 | * modifying the code. @failed should be one of either: | 1993 | * modifying the code. @failed should be one of either: |
1993 | * EFAULT - if the problem happens on reading the @ip address | 1994 | * EFAULT - if the problem happens on reading the @ip address |
1994 | * EINVAL - if what is read at @ip is not what was expected | 1995 | * EINVAL - if what is read at @ip is not what was expected |
1995 | * EPERM - if the problem happens on writting to the @ip address | 1996 | * EPERM - if the problem happens on writing to the @ip address |
1996 | */ | 1997 | */ |
1997 | void ftrace_bug(int failed, struct dyn_ftrace *rec) | 1998 | void ftrace_bug(int failed, struct dyn_ftrace *rec) |
1998 | { | 1999 | { |
@@ -2391,7 +2392,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) | |||
2391 | return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr); | 2392 | return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr); |
2392 | } | 2393 | } |
2393 | 2394 | ||
2394 | return -1; /* unknow ftrace bug */ | 2395 | return -1; /* unknown ftrace bug */ |
2395 | } | 2396 | } |
2396 | 2397 | ||
2397 | void __weak ftrace_replace_code(int mod_flags) | 2398 | void __weak ftrace_replace_code(int mod_flags) |
@@ -3004,7 +3005,7 @@ ftrace_allocate_pages(unsigned long num_to_init) | |||
3004 | int cnt; | 3005 | int cnt; |
3005 | 3006 | ||
3006 | if (!num_to_init) | 3007 | if (!num_to_init) |
3007 | return 0; | 3008 | return NULL; |
3008 | 3009 | ||
3009 | start_pg = pg = kzalloc(sizeof(*pg), GFP_KERNEL); | 3010 | start_pg = pg = kzalloc(sizeof(*pg), GFP_KERNEL); |
3010 | if (!pg) | 3011 | if (!pg) |
@@ -4755,7 +4756,7 @@ static int | |||
4755 | ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove, | 4756 | ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove, |
4756 | int reset, int enable) | 4757 | int reset, int enable) |
4757 | { | 4758 | { |
4758 | return ftrace_set_hash(ops, 0, 0, ip, remove, reset, enable); | 4759 | return ftrace_set_hash(ops, NULL, 0, ip, remove, reset, enable); |
4759 | } | 4760 | } |
4760 | 4761 | ||
4761 | /** | 4762 | /** |
@@ -5463,7 +5464,7 @@ void ftrace_create_filter_files(struct ftrace_ops *ops, | |||
5463 | 5464 | ||
5464 | /* | 5465 | /* |
5465 | * The name "destroy_filter_files" is really a misnomer. Although | 5466 | * The name "destroy_filter_files" is really a misnomer. Although |
5466 | * in the future, it may actualy delete the files, but this is | 5467 | * in the future, it may actually delete the files, but this is |
5467 | * really intended to make sure the ops passed in are disabled | 5468 | * really intended to make sure the ops passed in are disabled |
5468 | * and that when this function returns, the caller is free to | 5469 | * and that when this function returns, the caller is free to |
5469 | * free the ops. | 5470 | * free the ops. |
@@ -5786,7 +5787,7 @@ void ftrace_module_enable(struct module *mod) | |||
5786 | /* | 5787 | /* |
5787 | * If the tracing is enabled, go ahead and enable the record. | 5788 | * If the tracing is enabled, go ahead and enable the record. |
5788 | * | 5789 | * |
5789 | * The reason not to enable the record immediatelly is the | 5790 | * The reason not to enable the record immediately is the |
5790 | * inherent check of ftrace_make_nop/ftrace_make_call for | 5791 | * inherent check of ftrace_make_nop/ftrace_make_call for |
5791 | * correct previous instructions. Making first the NOP | 5792 | * correct previous instructions. Making first the NOP |
5792 | * conversion puts the module to the correct state, thus | 5793 | * conversion puts the module to the correct state, thus |
@@ -6246,7 +6247,7 @@ void ftrace_reset_array_ops(struct trace_array *tr) | |||
6246 | tr->ops->func = ftrace_stub; | 6247 | tr->ops->func = ftrace_stub; |
6247 | } | 6248 | } |
6248 | 6249 | ||
6249 | static inline void | 6250 | static nokprobe_inline void |
6250 | __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, | 6251 | __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, |
6251 | struct ftrace_ops *ignored, struct pt_regs *regs) | 6252 | struct ftrace_ops *ignored, struct pt_regs *regs) |
6252 | { | 6253 | { |
@@ -6306,11 +6307,13 @@ static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, | |||
6306 | { | 6307 | { |
6307 | __ftrace_ops_list_func(ip, parent_ip, NULL, regs); | 6308 | __ftrace_ops_list_func(ip, parent_ip, NULL, regs); |
6308 | } | 6309 | } |
6310 | NOKPROBE_SYMBOL(ftrace_ops_list_func); | ||
6309 | #else | 6311 | #else |
6310 | static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip) | 6312 | static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip) |
6311 | { | 6313 | { |
6312 | __ftrace_ops_list_func(ip, parent_ip, NULL, NULL); | 6314 | __ftrace_ops_list_func(ip, parent_ip, NULL, NULL); |
6313 | } | 6315 | } |
6316 | NOKPROBE_SYMBOL(ftrace_ops_no_ops); | ||
6314 | #endif | 6317 | #endif |
6315 | 6318 | ||
6316 | /* | 6319 | /* |
@@ -6337,6 +6340,7 @@ static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip, | |||
6337 | preempt_enable_notrace(); | 6340 | preempt_enable_notrace(); |
6338 | trace_clear_recursion(bit); | 6341 | trace_clear_recursion(bit); |
6339 | } | 6342 | } |
6343 | NOKPROBE_SYMBOL(ftrace_ops_assist_func); | ||
6340 | 6344 | ||
6341 | /** | 6345 | /** |
6342 | * ftrace_ops_get_func - get the function a trampoline should call | 6346 | * ftrace_ops_get_func - get the function a trampoline should call |
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 41b6f96e5366..4ee8d8aa3d0f 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
@@ -762,7 +762,7 @@ u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu) | |||
762 | 762 | ||
763 | preempt_disable_notrace(); | 763 | preempt_disable_notrace(); |
764 | time = rb_time_stamp(buffer); | 764 | time = rb_time_stamp(buffer); |
765 | preempt_enable_no_resched_notrace(); | 765 | preempt_enable_notrace(); |
766 | 766 | ||
767 | return time; | 767 | return time; |
768 | } | 768 | } |
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 21153e64bf1c..ec439999f387 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
@@ -159,6 +159,8 @@ static union trace_eval_map_item *trace_eval_maps; | |||
159 | #endif /* CONFIG_TRACE_EVAL_MAP_FILE */ | 159 | #endif /* CONFIG_TRACE_EVAL_MAP_FILE */ |
160 | 160 | ||
161 | static int tracing_set_tracer(struct trace_array *tr, const char *buf); | 161 | static int tracing_set_tracer(struct trace_array *tr, const char *buf); |
162 | static void ftrace_trace_userstack(struct ring_buffer *buffer, | ||
163 | unsigned long flags, int pc); | ||
162 | 164 | ||
163 | #define MAX_TRACER_SIZE 100 | 165 | #define MAX_TRACER_SIZE 100 |
164 | static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; | 166 | static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; |
@@ -496,8 +498,10 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, | |||
496 | * not modified. | 498 | * not modified. |
497 | */ | 499 | */ |
498 | pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL); | 500 | pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL); |
499 | if (!pid_list) | 501 | if (!pid_list) { |
502 | trace_parser_put(&parser); | ||
500 | return -ENOMEM; | 503 | return -ENOMEM; |
504 | } | ||
501 | 505 | ||
502 | pid_list->pid_max = READ_ONCE(pid_max); | 506 | pid_list->pid_max = READ_ONCE(pid_max); |
503 | 507 | ||
@@ -507,6 +511,7 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, | |||
507 | 511 | ||
508 | pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3); | 512 | pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3); |
509 | if (!pid_list->pids) { | 513 | if (!pid_list->pids) { |
514 | trace_parser_put(&parser); | ||
510 | kfree(pid_list); | 515 | kfree(pid_list); |
511 | return -ENOMEM; | 516 | return -ENOMEM; |
512 | } | 517 | } |
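The two hunks above fix a leak: if either allocation fails, the parser acquired earlier must be released before returning -ENOMEM. A small sketch of that "unwind what was already acquired at each failure point" pattern in plain C; the resource names are made up for illustration:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct parser { char *buf; };

static int parser_get(struct parser *p, size_t len)
{
	p->buf = malloc(len);
	return p->buf ? 0 : -ENOMEM;
}

static void parser_put(struct parser *p)
{
	free(p->buf);
	p->buf = NULL;
}

static int build_pid_list(size_t nr_pids)
{
	struct parser parser;
	unsigned long *pids;
	int ret;

	ret = parser_get(&parser, 64);
	if (ret)
		return ret;

	pids = calloc(nr_pids, sizeof(*pids));
	if (!pids) {
		parser_put(&parser);	/* release what was already acquired */
		return -ENOMEM;
	}

	/* ... fill and publish the list ... */
	free(pids);
	parser_put(&parser);
	return 0;
}

int main(void)
{
	printf("build_pid_list: %d\n", build_pid_list(16));
	return 0;
}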
@@ -2749,12 +2754,21 @@ trace_function(struct trace_array *tr, | |||
2749 | 2754 | ||
2750 | #ifdef CONFIG_STACKTRACE | 2755 | #ifdef CONFIG_STACKTRACE |
2751 | 2756 | ||
2752 | #define FTRACE_STACK_MAX_ENTRIES (PAGE_SIZE / sizeof(unsigned long)) | 2757 | /* Allow 4 levels of nesting: normal, softirq, irq, NMI */ |
2758 | #define FTRACE_KSTACK_NESTING 4 | ||
2759 | |||
2760 | #define FTRACE_KSTACK_ENTRIES (PAGE_SIZE / FTRACE_KSTACK_NESTING) | ||
2761 | |||
2753 | struct ftrace_stack { | 2762 | struct ftrace_stack { |
2754 | unsigned long calls[FTRACE_STACK_MAX_ENTRIES]; | 2763 | unsigned long calls[FTRACE_KSTACK_ENTRIES]; |
2755 | }; | 2764 | }; |
2756 | 2765 | ||
2757 | static DEFINE_PER_CPU(struct ftrace_stack, ftrace_stack); | 2766 | |
2767 | struct ftrace_stacks { | ||
2768 | struct ftrace_stack stacks[FTRACE_KSTACK_NESTING]; | ||
2769 | }; | ||
2770 | |||
2771 | static DEFINE_PER_CPU(struct ftrace_stacks, ftrace_stacks); | ||
2758 | static DEFINE_PER_CPU(int, ftrace_stack_reserve); | 2772 | static DEFINE_PER_CPU(int, ftrace_stack_reserve); |
2759 | 2773 | ||
2760 | static void __ftrace_trace_stack(struct ring_buffer *buffer, | 2774 | static void __ftrace_trace_stack(struct ring_buffer *buffer, |
@@ -2763,13 +2777,10 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, | |||
2763 | { | 2777 | { |
2764 | struct trace_event_call *call = &event_kernel_stack; | 2778 | struct trace_event_call *call = &event_kernel_stack; |
2765 | struct ring_buffer_event *event; | 2779 | struct ring_buffer_event *event; |
2780 | unsigned int size, nr_entries; | ||
2781 | struct ftrace_stack *fstack; | ||
2766 | struct stack_entry *entry; | 2782 | struct stack_entry *entry; |
2767 | struct stack_trace trace; | 2783 | int stackidx; |
2768 | int use_stack; | ||
2769 | int size = FTRACE_STACK_ENTRIES; | ||
2770 | |||
2771 | trace.nr_entries = 0; | ||
2772 | trace.skip = skip; | ||
2773 | 2784 | ||
2774 | /* | 2785 | /* |
2775 | * Add one, for this function and the call to save_stack_trace() | 2786 | * Add one, for this function and the call to save_stack_trace() |
@@ -2777,7 +2788,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, | |||
2777 | */ | 2788 | */ |
2778 | #ifndef CONFIG_UNWINDER_ORC | 2789 | #ifndef CONFIG_UNWINDER_ORC |
2779 | if (!regs) | 2790 | if (!regs) |
2780 | trace.skip++; | 2791 | skip++; |
2781 | #endif | 2792 | #endif |
2782 | 2793 | ||
2783 | /* | 2794 | /* |
@@ -2788,53 +2799,40 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, | |||
2788 | */ | 2799 | */ |
2789 | preempt_disable_notrace(); | 2800 | preempt_disable_notrace(); |
2790 | 2801 | ||
2791 | use_stack = __this_cpu_inc_return(ftrace_stack_reserve); | 2802 | stackidx = __this_cpu_inc_return(ftrace_stack_reserve) - 1; |
2803 | |||
2804 | /* This should never happen. If it does, yell once and skip */ | ||
2805 | if (WARN_ON_ONCE(stackidx > FTRACE_KSTACK_NESTING)) | ||
2806 | goto out; | ||
2807 | |||
2792 | /* | 2808 | /* |
2793 | * We don't need any atomic variables, just a barrier. | 2809 | * The above __this_cpu_inc_return() is 'atomic' cpu local. An |
2794 | * If an interrupt comes in, we don't care, because it would | 2810 | * interrupt will either see the value pre increment or post |
2795 | * have exited and put the counter back to what we want. | 2811 | * increment. If the interrupt happens pre increment it will have |
2796 | * We just need a barrier to keep gcc from moving things | 2812 | * restored the counter when it returns. We just need a barrier to |
2797 | * around. | 2813 | * keep gcc from moving things around. |
2798 | */ | 2814 | */ |
2799 | barrier(); | 2815 | barrier(); |
2800 | if (use_stack == 1) { | ||
2801 | trace.entries = this_cpu_ptr(ftrace_stack.calls); | ||
2802 | trace.max_entries = FTRACE_STACK_MAX_ENTRIES; | ||
2803 | 2816 | ||
2804 | if (regs) | 2817 | fstack = this_cpu_ptr(ftrace_stacks.stacks) + stackidx; |
2805 | save_stack_trace_regs(regs, &trace); | 2818 | size = ARRAY_SIZE(fstack->calls); |
2806 | else | ||
2807 | save_stack_trace(&trace); | ||
2808 | |||
2809 | if (trace.nr_entries > size) | ||
2810 | size = trace.nr_entries; | ||
2811 | } else | ||
2812 | /* From now on, use_stack is a boolean */ | ||
2813 | use_stack = 0; | ||
2814 | 2819 | ||
2815 | size *= sizeof(unsigned long); | 2820 | if (regs) { |
2821 | nr_entries = stack_trace_save_regs(regs, fstack->calls, | ||
2822 | size, skip); | ||
2823 | } else { | ||
2824 | nr_entries = stack_trace_save(fstack->calls, size, skip); | ||
2825 | } | ||
2816 | 2826 | ||
2827 | size = nr_entries * sizeof(unsigned long); | ||
2817 | event = __trace_buffer_lock_reserve(buffer, TRACE_STACK, | 2828 | event = __trace_buffer_lock_reserve(buffer, TRACE_STACK, |
2818 | sizeof(*entry) + size, flags, pc); | 2829 | sizeof(*entry) + size, flags, pc); |
2819 | if (!event) | 2830 | if (!event) |
2820 | goto out; | 2831 | goto out; |
2821 | entry = ring_buffer_event_data(event); | 2832 | entry = ring_buffer_event_data(event); |
2822 | 2833 | ||
2823 | memset(&entry->caller, 0, size); | 2834 | memcpy(&entry->caller, fstack->calls, size); |
2824 | 2835 | entry->size = nr_entries; | |
2825 | if (use_stack) | ||
2826 | memcpy(&entry->caller, trace.entries, | ||
2827 | trace.nr_entries * sizeof(unsigned long)); | ||
2828 | else { | ||
2829 | trace.max_entries = FTRACE_STACK_ENTRIES; | ||
2830 | trace.entries = entry->caller; | ||
2831 | if (regs) | ||
2832 | save_stack_trace_regs(regs, &trace); | ||
2833 | else | ||
2834 | save_stack_trace(&trace); | ||
2835 | } | ||
2836 | |||
2837 | entry->size = trace.nr_entries; | ||
2838 | 2836 | ||
2839 | if (!call_filter_check_discard(call, entry, buffer, event)) | 2837 | if (!call_filter_check_discard(call, entry, buffer, event)) |
2840 | __buffer_unlock_commit(buffer, event); | 2838 | __buffer_unlock_commit(buffer, event); |
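The reworked __ftrace_trace_stack() above replaces the single shared per-cpu buffer (with its on-stack fallback) by four per-context slots, one each for task, softirq, irq and NMI context, selected by a per-cpu reservation counter: the post-increment value minus one indexes the slot, so a context that interrupts mid-capture simply lands in the next slot. A minimal single-threaded sketch of that slot-reservation idea; names and sizes are illustrative:

#include <stdio.h>

#define NESTING		4
#define ENTRIES		16

struct stack_slot { unsigned long calls[ENTRIES]; };

static struct stack_slot slots[NESTING];
static int reserve;			/* a per-cpu counter in the real code */

static struct stack_slot *get_slot(void)
{
	int idx = ++reserve - 1;	/* post-increment value minus one */

	if (idx >= NESTING)		/* should never happen; bail out */
		return NULL;
	return &slots[idx];
}

static void put_slot(void)
{
	reserve--;			/* release the reservation */
}

int main(void)
{
	struct stack_slot *outer = get_slot();	/* e.g. task context */
	struct stack_slot *inner = get_slot();	/* e.g. an interrupt nesting in */

	printf("outer slot %td, inner slot %td\n", outer - slots, inner - slots);

	put_slot();
	put_slot();
	return 0;
}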
@@ -2904,15 +2902,15 @@ void trace_dump_stack(int skip) | |||
2904 | } | 2902 | } |
2905 | EXPORT_SYMBOL_GPL(trace_dump_stack); | 2903 | EXPORT_SYMBOL_GPL(trace_dump_stack); |
2906 | 2904 | ||
2905 | #ifdef CONFIG_USER_STACKTRACE_SUPPORT | ||
2907 | static DEFINE_PER_CPU(int, user_stack_count); | 2906 | static DEFINE_PER_CPU(int, user_stack_count); |
2908 | 2907 | ||
2909 | void | 2908 | static void |
2910 | ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) | 2909 | ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) |
2911 | { | 2910 | { |
2912 | struct trace_event_call *call = &event_user_stack; | 2911 | struct trace_event_call *call = &event_user_stack; |
2913 | struct ring_buffer_event *event; | 2912 | struct ring_buffer_event *event; |
2914 | struct userstack_entry *entry; | 2913 | struct userstack_entry *entry; |
2915 | struct stack_trace trace; | ||
2916 | 2914 | ||
2917 | if (!(global_trace.trace_flags & TRACE_ITER_USERSTACKTRACE)) | 2915 | if (!(global_trace.trace_flags & TRACE_ITER_USERSTACKTRACE)) |
2918 | return; | 2916 | return; |
@@ -2943,12 +2941,7 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) | |||
2943 | entry->tgid = current->tgid; | 2941 | entry->tgid = current->tgid; |
2944 | memset(&entry->caller, 0, sizeof(entry->caller)); | 2942 | memset(&entry->caller, 0, sizeof(entry->caller)); |
2945 | 2943 | ||
2946 | trace.nr_entries = 0; | 2944 | stack_trace_save_user(entry->caller, FTRACE_STACK_ENTRIES); |
2947 | trace.max_entries = FTRACE_STACK_ENTRIES; | ||
2948 | trace.skip = 0; | ||
2949 | trace.entries = entry->caller; | ||
2950 | |||
2951 | save_stack_trace_user(&trace); | ||
2952 | if (!call_filter_check_discard(call, entry, buffer, event)) | 2945 | if (!call_filter_check_discard(call, entry, buffer, event)) |
2953 | __buffer_unlock_commit(buffer, event); | 2946 | __buffer_unlock_commit(buffer, event); |
2954 | 2947 | ||
@@ -2957,13 +2950,12 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) | |||
2957 | out: | 2950 | out: |
2958 | preempt_enable(); | 2951 | preempt_enable(); |
2959 | } | 2952 | } |
2960 | 2953 | #else /* CONFIG_USER_STACKTRACE_SUPPORT */ | |
2961 | #ifdef UNUSED | 2954 | static void ftrace_trace_userstack(struct ring_buffer *buffer, |
2962 | static void __trace_userstack(struct trace_array *tr, unsigned long flags) | 2955 | unsigned long flags, int pc) |
2963 | { | 2956 | { |
2964 | ftrace_trace_userstack(tr, flags, preempt_count()); | ||
2965 | } | 2957 | } |
2966 | #endif /* UNUSED */ | 2958 | #endif /* !CONFIG_USER_STACKTRACE_SUPPORT */ |
2967 | 2959 | ||
2968 | #endif /* CONFIG_STACKTRACE */ | 2960 | #endif /* CONFIG_STACKTRACE */ |
2969 | 2961 | ||
@@ -7025,35 +7017,43 @@ struct buffer_ref { | |||
7025 | struct ring_buffer *buffer; | 7017 | struct ring_buffer *buffer; |
7026 | void *page; | 7018 | void *page; |
7027 | int cpu; | 7019 | int cpu; |
7028 | int ref; | 7020 | refcount_t refcount; |
7029 | }; | 7021 | }; |
7030 | 7022 | ||
7023 | static void buffer_ref_release(struct buffer_ref *ref) | ||
7024 | { | ||
7025 | if (!refcount_dec_and_test(&ref->refcount)) | ||
7026 | return; | ||
7027 | ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page); | ||
7028 | kfree(ref); | ||
7029 | } | ||
7030 | |||
7031 | static void buffer_pipe_buf_release(struct pipe_inode_info *pipe, | 7031 | static void buffer_pipe_buf_release(struct pipe_inode_info *pipe, |
7032 | struct pipe_buffer *buf) | 7032 | struct pipe_buffer *buf) |
7033 | { | 7033 | { |
7034 | struct buffer_ref *ref = (struct buffer_ref *)buf->private; | 7034 | struct buffer_ref *ref = (struct buffer_ref *)buf->private; |
7035 | 7035 | ||
7036 | if (--ref->ref) | 7036 | buffer_ref_release(ref); |
7037 | return; | ||
7038 | |||
7039 | ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page); | ||
7040 | kfree(ref); | ||
7041 | buf->private = 0; | 7037 | buf->private = 0; |
7042 | } | 7038 | } |
7043 | 7039 | ||
7044 | static void buffer_pipe_buf_get(struct pipe_inode_info *pipe, | 7040 | static bool buffer_pipe_buf_get(struct pipe_inode_info *pipe, |
7045 | struct pipe_buffer *buf) | 7041 | struct pipe_buffer *buf) |
7046 | { | 7042 | { |
7047 | struct buffer_ref *ref = (struct buffer_ref *)buf->private; | 7043 | struct buffer_ref *ref = (struct buffer_ref *)buf->private; |
7048 | 7044 | ||
7049 | ref->ref++; | 7045 | if (refcount_read(&ref->refcount) > INT_MAX/2) |
7046 | return false; | ||
7047 | |||
7048 | refcount_inc(&ref->refcount); | ||
7049 | return true; | ||
7050 | } | 7050 | } |
7051 | 7051 | ||
7052 | /* Pipe buffer operations for a buffer. */ | 7052 | /* Pipe buffer operations for a buffer. */ |
7053 | static const struct pipe_buf_operations buffer_pipe_buf_ops = { | 7053 | static const struct pipe_buf_operations buffer_pipe_buf_ops = { |
7054 | .confirm = generic_pipe_buf_confirm, | 7054 | .confirm = generic_pipe_buf_confirm, |
7055 | .release = buffer_pipe_buf_release, | 7055 | .release = buffer_pipe_buf_release, |
7056 | .steal = generic_pipe_buf_steal, | 7056 | .steal = generic_pipe_buf_nosteal, |
7057 | .get = buffer_pipe_buf_get, | 7057 | .get = buffer_pipe_buf_get, |
7058 | }; | 7058 | }; |
7059 | 7059 | ||
@@ -7066,11 +7066,7 @@ static void buffer_spd_release(struct splice_pipe_desc *spd, unsigned int i) | |||
7066 | struct buffer_ref *ref = | 7066 | struct buffer_ref *ref = |
7067 | (struct buffer_ref *)spd->partial[i].private; | 7067 | (struct buffer_ref *)spd->partial[i].private; |
7068 | 7068 | ||
7069 | if (--ref->ref) | 7069 | buffer_ref_release(ref); |
7070 | return; | ||
7071 | |||
7072 | ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page); | ||
7073 | kfree(ref); | ||
7074 | spd->partial[i].private = 0; | 7070 | spd->partial[i].private = 0; |
7075 | } | 7071 | } |
7076 | 7072 | ||
@@ -7125,7 +7121,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, | |||
7125 | break; | 7121 | break; |
7126 | } | 7122 | } |
7127 | 7123 | ||
7128 | ref->ref = 1; | 7124 | refcount_set(&ref->refcount, 1); |
7129 | ref->buffer = iter->trace_buffer->buffer; | 7125 | ref->buffer = iter->trace_buffer->buffer; |
7130 | ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file); | 7126 | ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file); |
7131 | if (IS_ERR(ref->page)) { | 7127 | if (IS_ERR(ref->page)) { |
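The buffer_ref conversion above swaps a bare int reference count for refcount_t, funnels the two release paths through one buffer_ref_release() helper, and makes the pipe .get callback fail instead of letting the count grow without bound. A userspace sketch of that saturating-get idea using C11 atomics; the sample_ref_* names are illustrative, not the kernel refcount API:

#include <limits.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct sample_buf {
	atomic_uint refcount;
	char *page;
};

static bool sample_ref_get(struct sample_buf *b)
{
	/* Refuse another reference once the count is implausibly large,
	 * instead of silently wrapping an integer. */
	if (atomic_load(&b->refcount) > INT_MAX / 2)
		return false;
	atomic_fetch_add(&b->refcount, 1);
	return true;
}

static void sample_ref_put(struct sample_buf *b)
{
	if (atomic_fetch_sub(&b->refcount, 1) != 1)
		return;			/* references remain */
	free(b->page);			/* last reference: release the page */
	free(b);
}

int main(void)
{
	struct sample_buf *b = malloc(sizeof(*b));

	if (!b)
		return 1;
	atomic_init(&b->refcount, 1);
	b->page = malloc(64);

	printf("extra ref taken: %d\n", sample_ref_get(b));
	sample_ref_put(b);		/* drop the extra reference */
	sample_ref_put(b);		/* drop the original, frees everything */
	return 0;
}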
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d80cee49e0eb..639047b259d7 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
@@ -782,17 +782,9 @@ void update_max_tr_single(struct trace_array *tr, | |||
782 | #endif /* CONFIG_TRACER_MAX_TRACE */ | 782 | #endif /* CONFIG_TRACER_MAX_TRACE */ |
783 | 783 | ||
784 | #ifdef CONFIG_STACKTRACE | 784 | #ifdef CONFIG_STACKTRACE |
785 | void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, | ||
786 | int pc); | ||
787 | |||
788 | void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, | 785 | void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, |
789 | int pc); | 786 | int pc); |
790 | #else | 787 | #else |
791 | static inline void ftrace_trace_userstack(struct ring_buffer *buffer, | ||
792 | unsigned long flags, int pc) | ||
793 | { | ||
794 | } | ||
795 | |||
796 | static inline void __trace_stack(struct trace_array *tr, unsigned long flags, | 788 | static inline void __trace_stack(struct trace_array *tr, unsigned long flags, |
797 | int skip, int pc) | 789 | int skip, int pc) |
798 | { | 790 | { |
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 4ad967453b6f..3ea65cdff30d 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c | |||
@@ -205,6 +205,8 @@ void trace_likely_condition(struct ftrace_likely_data *f, int val, int expect) | |||
205 | void ftrace_likely_update(struct ftrace_likely_data *f, int val, | 205 | void ftrace_likely_update(struct ftrace_likely_data *f, int val, |
206 | int expect, int is_constant) | 206 | int expect, int is_constant) |
207 | { | 207 | { |
208 | unsigned long flags = user_access_save(); | ||
209 | |||
208 | /* A constant is always correct */ | 210 | /* A constant is always correct */ |
209 | if (is_constant) { | 211 | if (is_constant) { |
210 | f->constant++; | 212 | f->constant++; |
@@ -223,6 +225,8 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, | |||
223 | f->data.correct++; | 225 | f->data.correct++; |
224 | else | 226 | else |
225 | f->data.incorrect++; | 227 | f->data.incorrect++; |
228 | |||
229 | user_access_restore(flags); | ||
226 | } | 230 | } |
227 | EXPORT_SYMBOL(ftrace_likely_update); | 231 | EXPORT_SYMBOL(ftrace_likely_update); |
228 | 232 | ||
diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index dd1f43588d70..fa100ed3b4de 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c | |||
@@ -74,7 +74,7 @@ int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type) | |||
74 | static int create_dyn_event(int argc, char **argv) | 74 | static int create_dyn_event(int argc, char **argv) |
75 | { | 75 | { |
76 | struct dyn_event_operations *ops; | 76 | struct dyn_event_operations *ops; |
77 | int ret; | 77 | int ret = -ENODEV; |
78 | 78 | ||
79 | if (argv[0][0] == '-' || argv[0][0] == '!') | 79 | if (argv[0][0] == '-' || argv[0][0] == '!') |
80 | return dyn_event_release(argc, argv, NULL); | 80 | return dyn_event_release(argc, argv, NULL); |
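The one-liner above pre-initialises ret to -ENODEV so the function has a real error to return when no registered operation claims the command, rather than an uninitialised value. A tiny sketch of that "default the error, let a matching handler overwrite it" shape; the handler names here are invented:

#include <errno.h>
#include <stdio.h>
#include <string.h>

struct op {
	const char *prefix;
	int (*create)(const char *cmd);
};

static int create_probe_like(const char *cmd)
{
	printf("claimed: %s\n", cmd);
	return 0;
}

static const struct op ops[] = {
	{ "p:", create_probe_like },
};

static int create_dyn_event(const char *cmd)
{
	int ret = -ENODEV;	/* returned as-is if nobody claims the command */

	for (size_t i = 0; i < sizeof(ops) / sizeof(ops[0]); i++) {
		if (strncmp(cmd, ops[i].prefix, strlen(ops[i].prefix)) == 0) {
			ret = ops[i].create(cmd);
			break;
		}
	}
	return ret;
}

int main(void)
{
	printf("%d\n", create_dyn_event("x:unknown"));	/* -ENODEV */
	printf("%d\n", create_dyn_event("p:probe"));	/* 0 */
	return 0;
}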
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index ca46339f3009..a1d20421f4b0 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c | |||
@@ -3713,7 +3713,6 @@ static void track_data_destroy(struct hist_trigger_data *hist_data, | |||
3713 | struct trace_event_file *file = hist_data->event_file; | 3713 | struct trace_event_file *file = hist_data->event_file; |
3714 | 3714 | ||
3715 | destroy_hist_field(data->track_data.track_var, 0); | 3715 | destroy_hist_field(data->track_data.track_var, 0); |
3716 | destroy_hist_field(data->track_data.var_ref, 0); | ||
3717 | 3716 | ||
3718 | if (data->action == ACTION_SNAPSHOT) { | 3717 | if (data->action == ACTION_SNAPSHOT) { |
3719 | struct track_data *track_data; | 3718 | struct track_data *track_data; |
@@ -5187,7 +5186,6 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec, | |||
5187 | u64 var_ref_vals[TRACING_MAP_VARS_MAX]; | 5186 | u64 var_ref_vals[TRACING_MAP_VARS_MAX]; |
5188 | char compound_key[HIST_KEY_SIZE_MAX]; | 5187 | char compound_key[HIST_KEY_SIZE_MAX]; |
5189 | struct tracing_map_elt *elt = NULL; | 5188 | struct tracing_map_elt *elt = NULL; |
5190 | struct stack_trace stacktrace; | ||
5191 | struct hist_field *key_field; | 5189 | struct hist_field *key_field; |
5192 | u64 field_contents; | 5190 | u64 field_contents; |
5193 | void *key = NULL; | 5191 | void *key = NULL; |
@@ -5199,14 +5197,9 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec, | |||
5199 | key_field = hist_data->fields[i]; | 5197 | key_field = hist_data->fields[i]; |
5200 | 5198 | ||
5201 | if (key_field->flags & HIST_FIELD_FL_STACKTRACE) { | 5199 | if (key_field->flags & HIST_FIELD_FL_STACKTRACE) { |
5202 | stacktrace.max_entries = HIST_STACKTRACE_DEPTH; | 5200 | memset(entries, 0, HIST_STACKTRACE_SIZE); |
5203 | stacktrace.entries = entries; | 5201 | stack_trace_save(entries, HIST_STACKTRACE_DEPTH, |
5204 | stacktrace.nr_entries = 0; | 5202 | HIST_STACKTRACE_SKIP); |
5205 | stacktrace.skip = HIST_STACKTRACE_SKIP; | ||
5206 | |||
5207 | memset(stacktrace.entries, 0, HIST_STACKTRACE_SIZE); | ||
5208 | save_stack_trace(&stacktrace); | ||
5209 | |||
5210 | key = entries; | 5203 | key = entries; |
5211 | } else { | 5204 | } else { |
5212 | field_contents = key_field->fn(key_field, elt, rbe, rec); | 5205 | field_contents = key_field->fn(key_field, elt, rbe, rec); |
@@ -5247,7 +5240,7 @@ static void hist_trigger_stacktrace_print(struct seq_file *m, | |||
5247 | unsigned int i; | 5240 | unsigned int i; |
5248 | 5241 | ||
5249 | for (i = 0; i < max_entries; i++) { | 5242 | for (i = 0; i < max_entries; i++) { |
5250 | if (stacktrace_entries[i] == ULONG_MAX) | 5243 | if (!stacktrace_entries[i]) |
5251 | return; | 5244 | return; |
5252 | 5245 | ||
5253 | seq_printf(m, "%*c", 1 + spaces, ' '); | 5246 | seq_printf(m, "%*c", 1 + spaces, ' '); |
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index eec648a0d673..5d16f73898db 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c | |||
@@ -18,44 +18,32 @@ | |||
18 | 18 | ||
19 | #include "trace.h" | 19 | #include "trace.h" |
20 | 20 | ||
21 | static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] = | 21 | #define STACK_TRACE_ENTRIES 500 |
22 | { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX }; | ||
23 | unsigned stack_trace_index[STACK_TRACE_ENTRIES]; | ||
24 | 22 | ||
25 | /* | 23 | static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES]; |
26 | * Reserve one entry for the passed in ip. This will allow | 24 | static unsigned stack_trace_index[STACK_TRACE_ENTRIES]; |
27 | * us to remove most or all of the stack size overhead | ||
28 | * added by the stack tracer itself. | ||
29 | */ | ||
30 | struct stack_trace stack_trace_max = { | ||
31 | .max_entries = STACK_TRACE_ENTRIES - 1, | ||
32 | .entries = &stack_dump_trace[0], | ||
33 | }; | ||
34 | 25 | ||
35 | unsigned long stack_trace_max_size; | 26 | static unsigned int stack_trace_nr_entries; |
36 | arch_spinlock_t stack_trace_max_lock = | 27 | static unsigned long stack_trace_max_size; |
28 | static arch_spinlock_t stack_trace_max_lock = | ||
37 | (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; | 29 | (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; |
38 | 30 | ||
39 | DEFINE_PER_CPU(int, disable_stack_tracer); | 31 | DEFINE_PER_CPU(int, disable_stack_tracer); |
40 | static DEFINE_MUTEX(stack_sysctl_mutex); | 32 | static DEFINE_MUTEX(stack_sysctl_mutex); |
41 | 33 | ||
42 | int stack_tracer_enabled; | 34 | int stack_tracer_enabled; |
43 | static int last_stack_tracer_enabled; | ||
44 | 35 | ||
45 | void stack_trace_print(void) | 36 | static void print_max_stack(void) |
46 | { | 37 | { |
47 | long i; | 38 | long i; |
48 | int size; | 39 | int size; |
49 | 40 | ||
50 | pr_emerg(" Depth Size Location (%d entries)\n" | 41 | pr_emerg(" Depth Size Location (%d entries)\n" |
51 | " ----- ---- --------\n", | 42 | " ----- ---- --------\n", |
52 | stack_trace_max.nr_entries); | 43 | stack_trace_nr_entries); |
53 | 44 | ||
54 | for (i = 0; i < stack_trace_max.nr_entries; i++) { | 45 | for (i = 0; i < stack_trace_nr_entries; i++) { |
55 | if (stack_dump_trace[i] == ULONG_MAX) | 46 | if (i + 1 == stack_trace_nr_entries) |
56 | break; | ||
57 | if (i+1 == stack_trace_max.nr_entries || | ||
58 | stack_dump_trace[i+1] == ULONG_MAX) | ||
59 | size = stack_trace_index[i]; | 47 | size = stack_trace_index[i]; |
60 | else | 48 | else |
61 | size = stack_trace_index[i] - stack_trace_index[i+1]; | 49 | size = stack_trace_index[i] - stack_trace_index[i+1]; |
@@ -65,16 +53,7 @@ void stack_trace_print(void) | |||
65 | } | 53 | } |
66 | } | 54 | } |
67 | 55 | ||
68 | /* | 56 | static void check_stack(unsigned long ip, unsigned long *stack) |
69 | * When arch-specific code overrides this function, the following | ||
70 | * data should be filled up, assuming stack_trace_max_lock is held to | ||
71 | * prevent concurrent updates. | ||
72 | * stack_trace_index[] | ||
73 | * stack_trace_max | ||
74 | * stack_trace_max_size | ||
75 | */ | ||
76 | void __weak | ||
77 | check_stack(unsigned long ip, unsigned long *stack) | ||
78 | { | 57 | { |
79 | unsigned long this_size, flags; unsigned long *p, *top, *start; | 58 | unsigned long this_size, flags; unsigned long *p, *top, *start; |
80 | static int tracer_frame; | 59 | static int tracer_frame; |
@@ -110,13 +89,12 @@ check_stack(unsigned long ip, unsigned long *stack) | |||
110 | 89 | ||
111 | stack_trace_max_size = this_size; | 90 | stack_trace_max_size = this_size; |
112 | 91 | ||
113 | stack_trace_max.nr_entries = 0; | 92 | stack_trace_nr_entries = stack_trace_save(stack_dump_trace, |
114 | stack_trace_max.skip = 0; | 93 | ARRAY_SIZE(stack_dump_trace) - 1, |
115 | 94 | 0); | |
116 | save_stack_trace(&stack_trace_max); | ||
117 | 95 | ||
118 | /* Skip over the overhead of the stack tracer itself */ | 96 | /* Skip over the overhead of the stack tracer itself */ |
119 | for (i = 0; i < stack_trace_max.nr_entries; i++) { | 97 | for (i = 0; i < stack_trace_nr_entries; i++) { |
120 | if (stack_dump_trace[i] == ip) | 98 | if (stack_dump_trace[i] == ip) |
121 | break; | 99 | break; |
122 | } | 100 | } |
@@ -125,7 +103,7 @@ check_stack(unsigned long ip, unsigned long *stack) | |||
125 | * Some archs may not have the passed in ip in the dump. | 103 | * Some archs may not have the passed in ip in the dump. |
126 | * If that happens, we need to show everything. | 104 | * If that happens, we need to show everything. |
127 | */ | 105 | */ |
128 | if (i == stack_trace_max.nr_entries) | 106 | if (i == stack_trace_nr_entries) |
129 | i = 0; | 107 | i = 0; |
130 | 108 | ||
131 | /* | 109 | /* |
@@ -143,15 +121,13 @@ check_stack(unsigned long ip, unsigned long *stack) | |||
143 | * loop will only happen once. This code only takes place | 121 | * loop will only happen once. This code only takes place |
144 | * on a new max, so it is far from a fast path. | 122 | * on a new max, so it is far from a fast path. |
145 | */ | 123 | */ |
146 | while (i < stack_trace_max.nr_entries) { | 124 | while (i < stack_trace_nr_entries) { |
147 | int found = 0; | 125 | int found = 0; |
148 | 126 | ||
149 | stack_trace_index[x] = this_size; | 127 | stack_trace_index[x] = this_size; |
150 | p = start; | 128 | p = start; |
151 | 129 | ||
152 | for (; p < top && i < stack_trace_max.nr_entries; p++) { | 130 | for (; p < top && i < stack_trace_nr_entries; p++) { |
153 | if (stack_dump_trace[i] == ULONG_MAX) | ||
154 | break; | ||
155 | /* | 131 | /* |
156 | * The READ_ONCE_NOCHECK is used to let KASAN know that | 132 | * The READ_ONCE_NOCHECK is used to let KASAN know that |
157 | * this is not a stack-out-of-bounds error. | 133 | * this is not a stack-out-of-bounds error. |
@@ -182,12 +158,10 @@ check_stack(unsigned long ip, unsigned long *stack) | |||
182 | i++; | 158 | i++; |
183 | } | 159 | } |
184 | 160 | ||
185 | stack_trace_max.nr_entries = x; | 161 | stack_trace_nr_entries = x; |
186 | for (; x < i; x++) | ||
187 | stack_dump_trace[x] = ULONG_MAX; | ||
188 | 162 | ||
189 | if (task_stack_end_corrupted(current)) { | 163 | if (task_stack_end_corrupted(current)) { |
190 | stack_trace_print(); | 164 | print_max_stack(); |
191 | BUG(); | 165 | BUG(); |
192 | } | 166 | } |
193 | 167 | ||
@@ -286,7 +260,7 @@ __next(struct seq_file *m, loff_t *pos) | |||
286 | { | 260 | { |
287 | long n = *pos - 1; | 261 | long n = *pos - 1; |
288 | 262 | ||
289 | if (n >= stack_trace_max.nr_entries || stack_dump_trace[n] == ULONG_MAX) | 263 | if (n >= stack_trace_nr_entries) |
290 | return NULL; | 264 | return NULL; |
291 | 265 | ||
292 | m->private = (void *)n; | 266 | m->private = (void *)n; |
@@ -350,7 +324,7 @@ static int t_show(struct seq_file *m, void *v) | |||
350 | seq_printf(m, " Depth Size Location" | 324 | seq_printf(m, " Depth Size Location" |
351 | " (%d entries)\n" | 325 | " (%d entries)\n" |
352 | " ----- ---- --------\n", | 326 | " ----- ---- --------\n", |
353 | stack_trace_max.nr_entries); | 327 | stack_trace_nr_entries); |
354 | 328 | ||
355 | if (!stack_tracer_enabled && !stack_trace_max_size) | 329 | if (!stack_tracer_enabled && !stack_trace_max_size) |
356 | print_disabled(m); | 330 | print_disabled(m); |
@@ -360,12 +334,10 @@ static int t_show(struct seq_file *m, void *v) | |||
360 | 334 | ||
361 | i = *(long *)v; | 335 | i = *(long *)v; |
362 | 336 | ||
363 | if (i >= stack_trace_max.nr_entries || | 337 | if (i >= stack_trace_nr_entries) |
364 | stack_dump_trace[i] == ULONG_MAX) | ||
365 | return 0; | 338 | return 0; |
366 | 339 | ||
367 | if (i+1 == stack_trace_max.nr_entries || | 340 | if (i + 1 == stack_trace_nr_entries) |
368 | stack_dump_trace[i+1] == ULONG_MAX) | ||
369 | size = stack_trace_index[i]; | 341 | size = stack_trace_index[i]; |
370 | else | 342 | else |
371 | size = stack_trace_index[i] - stack_trace_index[i+1]; | 343 | size = stack_trace_index[i] - stack_trace_index[i+1]; |
@@ -422,23 +394,21 @@ stack_trace_sysctl(struct ctl_table *table, int write, | |||
422 | void __user *buffer, size_t *lenp, | 394 | void __user *buffer, size_t *lenp, |
423 | loff_t *ppos) | 395 | loff_t *ppos) |
424 | { | 396 | { |
397 | int was_enabled; | ||
425 | int ret; | 398 | int ret; |
426 | 399 | ||
427 | mutex_lock(&stack_sysctl_mutex); | 400 | mutex_lock(&stack_sysctl_mutex); |
401 | was_enabled = !!stack_tracer_enabled; | ||
428 | 402 | ||
429 | ret = proc_dointvec(table, write, buffer, lenp, ppos); | 403 | ret = proc_dointvec(table, write, buffer, lenp, ppos); |
430 | 404 | ||
431 | if (ret || !write || | 405 | if (ret || !write || (was_enabled == !!stack_tracer_enabled)) |
432 | (last_stack_tracer_enabled == !!stack_tracer_enabled)) | ||
433 | goto out; | 406 | goto out; |
434 | 407 | ||
435 | last_stack_tracer_enabled = !!stack_tracer_enabled; | ||
436 | |||
437 | if (stack_tracer_enabled) | 408 | if (stack_tracer_enabled) |
438 | register_ftrace_function(&trace_ops); | 409 | register_ftrace_function(&trace_ops); |
439 | else | 410 | else |
440 | unregister_ftrace_function(&trace_ops); | 411 | unregister_ftrace_function(&trace_ops); |
441 | |||
442 | out: | 412 | out: |
443 | mutex_unlock(&stack_sysctl_mutex); | 413 | mutex_unlock(&stack_sysctl_mutex); |
444 | return ret; | 414 | return ret; |
@@ -454,7 +424,6 @@ static __init int enable_stacktrace(char *str) | |||
454 | strncpy(stack_trace_filter_buf, str + len, COMMAND_LINE_SIZE); | 424 | strncpy(stack_trace_filter_buf, str + len, COMMAND_LINE_SIZE); |
455 | 425 | ||
456 | stack_tracer_enabled = 1; | 426 | stack_tracer_enabled = 1; |
457 | last_stack_tracer_enabled = 1; | ||
458 | return 1; | 427 | return 1; |
459 | } | 428 | } |
460 | __setup("stacktrace", enable_stacktrace); | 429 | __setup("stacktrace", enable_stacktrace); |
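Both the trace_events_hist.c and trace_stack.c hunks above drop the old struct stack_trace plus save_stack_trace() combination, with its ULONG_MAX end markers, in favour of a helper that fills a plain array and returns how many entries it stored. A userspace sketch of that interface shape, with a stand-in capture function rather than the real unwinder:

#include <stdio.h>

#define MAX_ENTRIES	8

/* Stand-in for stack_trace_save(): fill 'store' with up to 'size' return
 * addresses, skipping 'skip' innermost frames, and return the count. */
static unsigned int sample_stack_save(unsigned long *store, unsigned int size,
				      unsigned int skip)
{
	static const unsigned long fake_frames[] = {
		0x1000, 0x1040, 0x1080, 0x10c0, 0x1100,
	};
	unsigned int total = sizeof(fake_frames) / sizeof(fake_frames[0]);
	unsigned int n = 0;

	for (unsigned int i = skip; i < total && n < size; i++)
		store[n++] = fake_frames[i];
	return n;
}

int main(void)
{
	unsigned long entries[MAX_ENTRIES];
	unsigned int nr = sample_stack_save(entries, MAX_ENTRIES, 1);

	/* No ULONG_MAX terminator to look for: the count bounds the loop. */
	for (unsigned int i = 0; i < nr; i++)
		printf("[%u] %#lx\n", i, entries[i]);
	return 0;
}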
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index f93a56d2db27..fa8fbff736d6 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c | |||
@@ -314,6 +314,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) | |||
314 | struct ring_buffer_event *event; | 314 | struct ring_buffer_event *event; |
315 | struct ring_buffer *buffer; | 315 | struct ring_buffer *buffer; |
316 | unsigned long irq_flags; | 316 | unsigned long irq_flags; |
317 | unsigned long args[6]; | ||
317 | int pc; | 318 | int pc; |
318 | int syscall_nr; | 319 | int syscall_nr; |
319 | int size; | 320 | int size; |
@@ -347,7 +348,8 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) | |||
347 | 348 | ||
348 | entry = ring_buffer_event_data(event); | 349 | entry = ring_buffer_event_data(event); |
349 | entry->nr = syscall_nr; | 350 | entry->nr = syscall_nr; |
350 | syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args); | 351 | syscall_get_arguments(current, regs, args); |
352 | memcpy(entry->args, args, sizeof(unsigned long) * sys_data->nb_args); | ||
351 | 353 | ||
352 | event_trigger_unlock_commit(trace_file, buffer, event, entry, | 354 | event_trigger_unlock_commit(trace_file, buffer, event, entry, |
353 | irq_flags, pc); | 355 | irq_flags, pc); |
@@ -583,6 +585,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) | |||
583 | struct syscall_metadata *sys_data; | 585 | struct syscall_metadata *sys_data; |
584 | struct syscall_trace_enter *rec; | 586 | struct syscall_trace_enter *rec; |
585 | struct hlist_head *head; | 587 | struct hlist_head *head; |
588 | unsigned long args[6]; | ||
586 | bool valid_prog_array; | 589 | bool valid_prog_array; |
587 | int syscall_nr; | 590 | int syscall_nr; |
588 | int rctx; | 591 | int rctx; |
@@ -613,8 +616,8 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) | |||
613 | return; | 616 | return; |
614 | 617 | ||
615 | rec->nr = syscall_nr; | 618 | rec->nr = syscall_nr; |
616 | syscall_get_arguments(current, regs, 0, sys_data->nb_args, | 619 | syscall_get_arguments(current, regs, args); |
617 | (unsigned long *)&rec->args); | 620 | memcpy(&rec->args, args, sizeof(unsigned long) * sys_data->nb_args); |
618 | 621 | ||
619 | if ((valid_prog_array && | 622 | if ((valid_prog_array && |
620 | !perf_call_bpf_enter(sys_data->enter_event, regs, sys_data, rec)) || | 623 | !perf_call_bpf_enter(sys_data->enter_event, regs, sys_data, rec)) || |
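The trace_syscalls.c hunks adapt to the changed syscall_get_arguments() calling convention: the callers now fetch the full argument set into a local six-entry array and copy only the first nb_args words into the ring-buffer record. A small sketch of that fetch-all-then-copy-a-prefix pattern; the fetch function is a stand-in:

#include <stdio.h>
#include <string.h>

#define SYSCALL_MAX_ARGS	6

/* Stand-in for the argument fetch: always produces all six slots. */
static void sample_get_args(unsigned long args[SYSCALL_MAX_ARGS])
{
	for (int i = 0; i < SYSCALL_MAX_ARGS; i++)
		args[i] = 100 + i;
}

struct enter_record {
	int nr;
	unsigned long args[SYSCALL_MAX_ARGS];
};

int main(void)
{
	unsigned long args[SYSCALL_MAX_ARGS];
	struct enter_record rec = { .nr = 42 };
	unsigned int nb_args = 3;	/* this syscall only takes three */

	sample_get_args(args);
	/* Copy just the prefix the event format declares. */
	memcpy(rec.args, args, sizeof(unsigned long) * nb_args);

	for (unsigned int i = 0; i < nb_args; i++)
		printf("arg%u = %lu\n", i, rec.args[i]);
	return 0;
}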
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 8fbfda94a67b..7f9e7b9306fe 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
@@ -42,9 +42,9 @@ int __read_mostly watchdog_user_enabled = 1; | |||
42 | int __read_mostly nmi_watchdog_user_enabled = NMI_WATCHDOG_DEFAULT; | 42 | int __read_mostly nmi_watchdog_user_enabled = NMI_WATCHDOG_DEFAULT; |
43 | int __read_mostly soft_watchdog_user_enabled = 1; | 43 | int __read_mostly soft_watchdog_user_enabled = 1; |
44 | int __read_mostly watchdog_thresh = 10; | 44 | int __read_mostly watchdog_thresh = 10; |
45 | int __read_mostly nmi_watchdog_available; | 45 | static int __read_mostly nmi_watchdog_available; |
46 | 46 | ||
47 | struct cpumask watchdog_allowed_mask __read_mostly; | 47 | static struct cpumask watchdog_allowed_mask __read_mostly; |
48 | 48 | ||
49 | struct cpumask watchdog_cpumask __read_mostly; | 49 | struct cpumask watchdog_cpumask __read_mostly; |
50 | unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); | 50 | unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); |
@@ -554,13 +554,15 @@ static void softlockup_start_all(void) | |||
554 | 554 | ||
555 | int lockup_detector_online_cpu(unsigned int cpu) | 555 | int lockup_detector_online_cpu(unsigned int cpu) |
556 | { | 556 | { |
557 | watchdog_enable(cpu); | 557 | if (cpumask_test_cpu(cpu, &watchdog_allowed_mask)) |
558 | watchdog_enable(cpu); | ||
558 | return 0; | 559 | return 0; |
559 | } | 560 | } |
560 | 561 | ||
561 | int lockup_detector_offline_cpu(unsigned int cpu) | 562 | int lockup_detector_offline_cpu(unsigned int cpu) |
562 | { | 563 | { |
563 | watchdog_disable(cpu); | 564 | if (cpumask_test_cpu(cpu, &watchdog_allowed_mask)) |
565 | watchdog_disable(cpu); | ||
564 | return 0; | 566 | return 0; |
565 | } | 567 | } |
566 | 568 | ||
@@ -588,7 +590,7 @@ static void lockup_detector_reconfigure(void) | |||
588 | * Create the watchdog thread infrastructure and configure the detector(s). | 590 | * Create the watchdog thread infrastructure and configure the detector(s). |
589 | * | 591 | * |
590 | * The threads are not unparked as watchdog_allowed_mask is empty. When | 592 | * The threads are not unparked as watchdog_allowed_mask is empty. When |
591 | * the threads are sucessfully initialized, take the proper locks and | 593 | * the threads are successfully initialized, take the proper locks and |
592 | * unpark the threads in the watchdog_cpumask if the watchdog is enabled. | 594 | * unpark the threads in the watchdog_cpumask if the watchdog is enabled. |
593 | */ | 595 | */ |
594 | static __init void lockup_detector_setup(void) | 596 | static __init void lockup_detector_setup(void) |
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 71381168dede..247bf0b1582c 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c | |||
@@ -135,7 +135,8 @@ static void watchdog_overflow_callback(struct perf_event *event, | |||
135 | if (__this_cpu_read(hard_watchdog_warn) == true) | 135 | if (__this_cpu_read(hard_watchdog_warn) == true) |
136 | return; | 136 | return; |
137 | 137 | ||
138 | pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu); | 138 | pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n", |
139 | this_cpu); | ||
139 | print_modules(); | 140 | print_modules(); |
140 | print_irqtrace_events(current); | 141 | print_irqtrace_events(current); |
141 | if (regs) | 142 | if (regs) |
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4026d1871407..faf7622246da 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -841,43 +841,32 @@ static void wake_up_worker(struct worker_pool *pool) | |||
841 | } | 841 | } |
842 | 842 | ||
843 | /** | 843 | /** |
844 | * wq_worker_waking_up - a worker is waking up | 844 | * wq_worker_running - a worker is running again |
845 | * @task: task waking up | 845 | * @task: task waking up |
846 | * @cpu: CPU @task is waking up to | ||
847 | * | 846 | * |
848 | * This function is called during try_to_wake_up() when a worker is | 847 | * This function is called when a worker returns from schedule() |
849 | * being awoken. | ||
850 | * | ||
851 | * CONTEXT: | ||
852 | * spin_lock_irq(rq->lock) | ||
853 | */ | 848 | */ |
854 | void wq_worker_waking_up(struct task_struct *task, int cpu) | 849 | void wq_worker_running(struct task_struct *task) |
855 | { | 850 | { |
856 | struct worker *worker = kthread_data(task); | 851 | struct worker *worker = kthread_data(task); |
857 | 852 | ||
858 | if (!(worker->flags & WORKER_NOT_RUNNING)) { | 853 | if (!worker->sleeping) |
859 | WARN_ON_ONCE(worker->pool->cpu != cpu); | 854 | return; |
855 | if (!(worker->flags & WORKER_NOT_RUNNING)) | ||
860 | atomic_inc(&worker->pool->nr_running); | 856 | atomic_inc(&worker->pool->nr_running); |
861 | } | 857 | worker->sleeping = 0; |
862 | } | 858 | } |
863 | 859 | ||
864 | /** | 860 | /** |
865 | * wq_worker_sleeping - a worker is going to sleep | 861 | * wq_worker_sleeping - a worker is going to sleep |
866 | * @task: task going to sleep | 862 | * @task: task going to sleep |
867 | * | 863 | * |
868 | * This function is called during schedule() when a busy worker is | 864 | * This function is called from schedule() when a busy worker is |
869 | * going to sleep. Worker on the same cpu can be woken up by | 865 | * going to sleep. |
870 | * returning pointer to its task. | ||
871 | * | ||
872 | * CONTEXT: | ||
873 | * spin_lock_irq(rq->lock) | ||
874 | * | ||
875 | * Return: | ||
876 | * Worker task on @cpu to wake up, %NULL if none. | ||
877 | */ | 866 | */ |
878 | struct task_struct *wq_worker_sleeping(struct task_struct *task) | 867 | void wq_worker_sleeping(struct task_struct *task) |
879 | { | 868 | { |
880 | struct worker *worker = kthread_data(task), *to_wakeup = NULL; | 869 | struct worker *next, *worker = kthread_data(task); |
881 | struct worker_pool *pool; | 870 | struct worker_pool *pool; |
882 | 871 | ||
883 | /* | 872 | /* |
@@ -886,13 +875,15 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task) | |||
886 | * checking NOT_RUNNING. | 875 | * checking NOT_RUNNING. |
887 | */ | 876 | */ |
888 | if (worker->flags & WORKER_NOT_RUNNING) | 877 | if (worker->flags & WORKER_NOT_RUNNING) |
889 | return NULL; | 878 | return; |
890 | 879 | ||
891 | pool = worker->pool; | 880 | pool = worker->pool; |
892 | 881 | ||
893 | /* this can only happen on the local cpu */ | 882 | if (WARN_ON_ONCE(worker->sleeping)) |
894 | if (WARN_ON_ONCE(pool->cpu != raw_smp_processor_id())) | 883 | return; |
895 | return NULL; | 884 | |
885 | worker->sleeping = 1; | ||
886 | spin_lock_irq(&pool->lock); | ||
896 | 887 | ||
897 | /* | 888 | /* |
898 | * The counterpart of the following dec_and_test, implied mb, | 889 | * The counterpart of the following dec_and_test, implied mb, |
@@ -906,9 +897,12 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task) | |||
906 | * lock is safe. | 897 | * lock is safe. |
907 | */ | 898 | */ |
908 | if (atomic_dec_and_test(&pool->nr_running) && | 899 | if (atomic_dec_and_test(&pool->nr_running) && |
909 | !list_empty(&pool->worklist)) | 900 | !list_empty(&pool->worklist)) { |
910 | to_wakeup = first_idle_worker(pool); | 901 | next = first_idle_worker(pool); |
911 | return to_wakeup ? to_wakeup->task : NULL; | 902 | if (next) |
903 | wake_up_process(next->task); | ||
904 | } | ||
905 | spin_unlock_irq(&pool->lock); | ||
912 | } | 906 | } |
913 | 907 | ||
914 | /** | 908 | /** |
@@ -2277,7 +2271,7 @@ __acquires(&pool->lock) | |||
2277 | 2271 | ||
2278 | if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { | 2272 | if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { |
2279 | pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n" | 2273 | pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n" |
2280 | " last function: %pf\n", | 2274 | " last function: %ps\n", |
2281 | current->comm, preempt_count(), task_pid_nr(current), | 2275 | current->comm, preempt_count(), task_pid_nr(current), |
2282 | worker->current_func); | 2276 | worker->current_func); |
2283 | debug_show_held_locks(current); | 2277 | debug_show_held_locks(current); |
@@ -2596,11 +2590,11 @@ static void check_flush_dependency(struct workqueue_struct *target_wq, | |||
2596 | worker = current_wq_worker(); | 2590 | worker = current_wq_worker(); |
2597 | 2591 | ||
2598 | WARN_ONCE(current->flags & PF_MEMALLOC, | 2592 | WARN_ONCE(current->flags & PF_MEMALLOC, |
2599 | "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%pf", | 2593 | "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%ps", |
2600 | current->pid, current->comm, target_wq->name, target_func); | 2594 | current->pid, current->comm, target_wq->name, target_func); |
2601 | WARN_ONCE(worker && ((worker->current_pwq->wq->flags & | 2595 | WARN_ONCE(worker && ((worker->current_pwq->wq->flags & |
2602 | (WQ_MEM_RECLAIM | __WQ_LEGACY)) == WQ_MEM_RECLAIM), | 2596 | (WQ_MEM_RECLAIM | __WQ_LEGACY)) == WQ_MEM_RECLAIM), |
2603 | "workqueue: WQ_MEM_RECLAIM %s:%pf is flushing !WQ_MEM_RECLAIM %s:%pf", | 2597 | "workqueue: WQ_MEM_RECLAIM %s:%ps is flushing !WQ_MEM_RECLAIM %s:%ps", |
2604 | worker->current_pwq->wq->name, worker->current_func, | 2598 | worker->current_pwq->wq->name, worker->current_func, |
2605 | target_wq->name, target_func); | 2599 | target_wq->name, target_func); |
2606 | } | 2600 | } |
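The remaining printk changes in this file convert %pf to %ps. Both resolve a function pointer to its symbol name; the f-variants additionally dereferenced function descriptors on ia64/ppc64/parisc, which the generic symbol lookup now does for %ps/%pS as well, so the conversion is purely mechanical. A minimal usage sketch (the format string is the only thing that changes):

	/* prints e.g. "current work fn: process_one_work" - symbol name only;
	 * %pS would append the offset within the symbol as well */
	pr_info("current work fn: %ps\n", worker->current_func);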
@@ -4266,7 +4260,7 @@ struct workqueue_struct *alloc_workqueue(const char *fmt, | |||
4266 | INIT_LIST_HEAD(&wq->list); | 4260 | INIT_LIST_HEAD(&wq->list); |
4267 | 4261 | ||
4268 | if (alloc_and_link_pwqs(wq) < 0) | 4262 | if (alloc_and_link_pwqs(wq) < 0) |
4269 | goto err_free_wq; | 4263 | goto err_unreg_lockdep; |
4270 | 4264 | ||
4271 | if (wq_online && init_rescuer(wq) < 0) | 4265 | if (wq_online && init_rescuer(wq) < 0) |
4272 | goto err_destroy; | 4266 | goto err_destroy; |
@@ -4292,9 +4286,10 @@ struct workqueue_struct *alloc_workqueue(const char *fmt, | |||
4292 | 4286 | ||
4293 | return wq; | 4287 | return wq; |
4294 | 4288 | ||
4295 | err_free_wq: | 4289 | err_unreg_lockdep: |
4296 | wq_unregister_lockdep(wq); | 4290 | wq_unregister_lockdep(wq); |
4297 | wq_free_lockdep(wq); | 4291 | wq_free_lockdep(wq); |
4292 | err_free_wq: | ||
4298 | free_workqueue_attrs(wq->unbound_attrs); | 4293 | free_workqueue_attrs(wq->unbound_attrs); |
4299 | kfree(wq); | 4294 | kfree(wq); |
4300 | return NULL; | 4295 | return NULL; |
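These two hunks split alloc_workqueue()'s unwind path: the alloc_and_link_pwqs() failure, which happens after the lockdep key has been registered, now enters at err_unreg_lockdep, while err_free_wq is left for earlier failures that must not unregister a key that was never registered. A condensed view of the resulting unwind order (the earlier goto sites are outside these hunks, so their behaviour is inferred rather than quoted):

	err_unreg_lockdep:
		/* only reached once the wq lockdep key has been registered */
		wq_unregister_lockdep(wq);
		wq_free_lockdep(wq);
	err_free_wq:
		/* common tail, also safe for failures before lockdep registration */
		free_workqueue_attrs(wq->unbound_attrs);
		kfree(wq);
		return NULL;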
@@ -4586,7 +4581,7 @@ void print_worker_info(const char *log_lvl, struct task_struct *task) | |||
4586 | probe_kernel_read(desc, worker->desc, sizeof(desc) - 1); | 4581 | probe_kernel_read(desc, worker->desc, sizeof(desc) - 1); |
4587 | 4582 | ||
4588 | if (fn || name[0] || desc[0]) { | 4583 | if (fn || name[0] || desc[0]) { |
4589 | printk("%sWorkqueue: %s %pf", log_lvl, name, fn); | 4584 | printk("%sWorkqueue: %s %ps", log_lvl, name, fn); |
4590 | if (strcmp(name, desc)) | 4585 | if (strcmp(name, desc)) |
4591 | pr_cont(" (%s)", desc); | 4586 | pr_cont(" (%s)", desc); |
4592 | pr_cont("\n"); | 4587 | pr_cont("\n"); |
@@ -4611,7 +4606,7 @@ static void pr_cont_work(bool comma, struct work_struct *work) | |||
4611 | pr_cont("%s BAR(%d)", comma ? "," : "", | 4606 | pr_cont("%s BAR(%d)", comma ? "," : "", |
4612 | task_pid_nr(barr->task)); | 4607 | task_pid_nr(barr->task)); |
4613 | } else { | 4608 | } else { |
4614 | pr_cont("%s %pf", comma ? "," : "", work->func); | 4609 | pr_cont("%s %ps", comma ? "," : "", work->func); |
4615 | } | 4610 | } |
4616 | } | 4611 | } |
4617 | 4612 | ||
@@ -4643,7 +4638,7 @@ static void show_pwq(struct pool_workqueue *pwq) | |||
4643 | if (worker->current_pwq != pwq) | 4638 | if (worker->current_pwq != pwq) |
4644 | continue; | 4639 | continue; |
4645 | 4640 | ||
4646 | pr_cont("%s %d%s:%pf", comma ? "," : "", | 4641 | pr_cont("%s %d%s:%ps", comma ? "," : "", |
4647 | task_pid_nr(worker->task), | 4642 | task_pid_nr(worker->task), |
4648 | worker == pwq->wq->rescuer ? "(RESCUER)" : "", | 4643 | worker == pwq->wq->rescuer ? "(RESCUER)" : "", |
4649 | worker->current_func); | 4644 | worker->current_func); |
@@ -4928,7 +4923,7 @@ static void rebind_workers(struct worker_pool *pool) | |||
4928 | * | 4923 | * |
4929 | * WRITE_ONCE() is necessary because @worker->flags may be | 4924 | * WRITE_ONCE() is necessary because @worker->flags may be |
4930 | * tested without holding any lock in | 4925 | * tested without holding any lock in |
4931 | * wq_worker_waking_up(). Without it, NOT_RUNNING test may | 4926 | * wq_worker_running(). Without it, NOT_RUNNING test may |
4932 | * fail incorrectly leading to premature concurrency | 4927 | * fail incorrectly leading to premature concurrency |
4933 | * management operations. | 4928 | * management operations. |
4934 | */ | 4929 | */ |
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index cb68b03ca89a..498de0e909a4 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h | |||
@@ -44,6 +44,7 @@ struct worker { | |||
44 | unsigned long last_active; /* L: last active timestamp */ | 44 | unsigned long last_active; /* L: last active timestamp */ |
45 | unsigned int flags; /* X: flags */ | 45 | unsigned int flags; /* X: flags */ |
46 | int id; /* I: worker id */ | 46 | int id; /* I: worker id */ |
47 | int sleeping; /* None */ | ||
47 | 48 | ||
48 | /* | 49 | /* |
49 | * Opaque string set with work_set_desc(). Printed out with task | 50 | * Opaque string set with work_set_desc(). Printed out with task |
@@ -72,8 +73,8 @@ static inline struct worker *current_wq_worker(void) | |||
72 | * Scheduler hooks for concurrency managed workqueue. Only to be used from | 73 | * Scheduler hooks for concurrency managed workqueue. Only to be used from |
73 | * sched/ and workqueue.c. | 74 | * sched/ and workqueue.c. |
74 | */ | 75 | */ |
75 | void wq_worker_waking_up(struct task_struct *task, int cpu); | 76 | void wq_worker_running(struct task_struct *task); |
76 | struct task_struct *wq_worker_sleeping(struct task_struct *task); | 77 | void wq_worker_sleeping(struct task_struct *task); |
77 | work_func_t wq_worker_last_func(struct task_struct *task); | 78 | work_func_t wq_worker_last_func(struct task_struct *task); |
78 | 79 | ||
79 | #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */ | 80 | #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */ |
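The header changes replace the old waking-up hook with a void wq_worker_running() and add the per-worker sleeping flag it consumes ("None" meaning no lock protects the field, presumably because it is only touched from the owning task's scheduler hooks). The body of the new hook is not part of these hunks; the following is a rough sketch of the counterpart, assumed from the accounting done in wq_worker_sleeping() above rather than quoted from the patch:

	/* Hypothetical sketch: re-account a worker once the scheduler wakes it. */
	void wq_worker_running(struct task_struct *task)
	{
		struct worker *worker = kthread_data(task);

		if (!worker->sleeping)
			return;		/* never went through wq_worker_sleeping() */
		if (!(worker->flags & WORKER_NOT_RUNNING))
			atomic_inc(&worker->pool->nr_running);	/* undo the earlier decrement */
		worker->sleeping = 0;
	}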