Diffstat (limited to 'kernel')
109 files changed, 6218 insertions, 2818 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 1408b3353a3c..0f8f8b0bc1bf 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -9,7 +9,9 @@ obj-y = fork.o exec_domain.o panic.o \ | |||
9 | extable.o params.o \ | 9 | extable.o params.o \ |
10 | kthread.o sys_ni.o nsproxy.o \ | 10 | kthread.o sys_ni.o nsproxy.o \ |
11 | notifier.o ksysfs.o cred.o reboot.o \ | 11 | notifier.o ksysfs.o cred.o reboot.o \ |
12 | async.o range.o groups.o smpboot.o | 12 | async.o range.o smpboot.o |
13 | |||
14 | obj-$(CONFIG_MULTIUSER) += groups.o | ||
13 | 15 | ||
14 | ifdef CONFIG_FUNCTION_TRACER | 16 | ifdef CONFIG_FUNCTION_TRACER |
15 | # Do not trace debug files and internal ftrace files | 17 | # Do not trace debug files and internal ftrace files |
diff --git a/kernel/acct.c b/kernel/acct.c
index e6c10d1a4058..74963d192c5d 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -213,7 +213,7 @@ static int acct_on(struct filename *pathname) | |||
213 | return -EACCES; | 213 | return -EACCES; |
214 | } | 214 | } |
215 | 215 | ||
216 | if (!file->f_op->write) { | 216 | if (!(file->f_mode & FMODE_CAN_WRITE)) { |
217 | kfree(acct); | 217 | kfree(acct); |
218 | filp_close(file, NULL); | 218 | filp_close(file, NULL); |
219 | return -EIO; | 219 | return -EIO; |
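The acct.c hunk above swaps a method-presence test for the FMODE_CAN_WRITE flag: a NULL ->write no longer means the file is unwritable, because many filesystems now implement writing only through ->write_iter, and the VFS sets FMODE_CAN_WRITE at open time when either path is available. A minimal kernel-style sketch of the same test; the helper name is hypothetical:

#include <linux/fs.h>

/* Hypothetical helper mirroring the new check in acct_on(): testing
 * file->f_op->write alone would wrongly reject files whose filesystem
 * only provides ->write_iter. FMODE_CAN_WRITE covers both cases. */
static bool acct_file_is_writable(struct file *file)
{
	return (file->f_mode & FMODE_CAN_WRITE) != 0;
}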
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index a5ae60f0b0a2..e6983be12bd3 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,5 +1,2 @@ | |||
1 | obj-y := core.o | 1 | obj-y := core.o |
2 | obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o arraymap.o helpers.o | 2 | obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o arraymap.o helpers.o |
3 | ifdef CONFIG_TEST_BPF | ||
4 | obj-$(CONFIG_BPF_SYSCALL) += test_stub.o | ||
5 | endif | ||
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 9eb4d8a7cd87..8a6616583f38 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -134,7 +134,7 @@ static void array_map_free(struct bpf_map *map) | |||
134 | kvfree(array); | 134 | kvfree(array); |
135 | } | 135 | } |
136 | 136 | ||
137 | static struct bpf_map_ops array_ops = { | 137 | static const struct bpf_map_ops array_ops = { |
138 | .map_alloc = array_map_alloc, | 138 | .map_alloc = array_map_alloc, |
139 | .map_free = array_map_free, | 139 | .map_free = array_map_free, |
140 | .map_get_next_key = array_map_get_next_key, | 140 | .map_get_next_key = array_map_get_next_key, |
@@ -143,14 +143,14 @@ static struct bpf_map_ops array_ops = { | |||
143 | .map_delete_elem = array_map_delete_elem, | 143 | .map_delete_elem = array_map_delete_elem, |
144 | }; | 144 | }; |
145 | 145 | ||
146 | static struct bpf_map_type_list tl = { | 146 | static struct bpf_map_type_list array_type __read_mostly = { |
147 | .ops = &array_ops, | 147 | .ops = &array_ops, |
148 | .type = BPF_MAP_TYPE_ARRAY, | 148 | .type = BPF_MAP_TYPE_ARRAY, |
149 | }; | 149 | }; |
150 | 150 | ||
151 | static int __init register_array_map(void) | 151 | static int __init register_array_map(void) |
152 | { | 152 | { |
153 | bpf_register_map_type(&tl); | 153 | bpf_register_map_type(&array_type); |
154 | return 0; | 154 | return 0; |
155 | } | 155 | } |
156 | late_initcall(register_array_map); | 156 | late_initcall(register_array_map); |
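For context, a map of the BPF_MAP_TYPE_ARRAY type registered above is created from user space through the bpf(2) syscall. A minimal sketch, assuming a 4.1-era uapi/linux/bpf.h and a libc that exposes __NR_bpf; no libbpf, error handling trimmed:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

/* Create a 256-slot array map with 4-byte keys (required for array maps)
 * and u64 values. Returns the new map fd, or -1 on error. */
static int create_array_map(void)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.map_type    = BPF_MAP_TYPE_ARRAY;
	attr.key_size    = sizeof(__u32);
	attr.value_size  = sizeof(__u64);
	attr.max_entries = 256;

	return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
}

int main(void)
{
	int fd = create_array_map();

	printf("map fd = %d\n", fd);
	return fd < 0;
}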
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index a64e7a207d2b..4139a0f8b558 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -656,6 +656,14 @@ void bpf_prog_free(struct bpf_prog *fp) | |||
656 | } | 656 | } |
657 | EXPORT_SYMBOL_GPL(bpf_prog_free); | 657 | EXPORT_SYMBOL_GPL(bpf_prog_free); |
658 | 658 | ||
659 | /* Weak definitions of helper functions in case we don't have bpf syscall. */ | ||
660 | const struct bpf_func_proto bpf_map_lookup_elem_proto __weak; | ||
661 | const struct bpf_func_proto bpf_map_update_elem_proto __weak; | ||
662 | const struct bpf_func_proto bpf_map_delete_elem_proto __weak; | ||
663 | |||
664 | const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; | ||
665 | const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; | ||
666 | |||
659 | /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call | 667 | /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call |
660 | * skb_copy_bits(), so provide a weak definition of it for NET-less config. | 668 | * skb_copy_bits(), so provide a weak definition of it for NET-less config. |
661 | */ | 669 | */ |
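The __weak objects added above are zero-filled fallbacks: code elsewhere that references these protos still links when helpers.c is not built (no CONFIG_BPF_SYSCALL), and then sees a proto whose .func is NULL. A standalone user-space illustration of the same linker behaviour; the names here are made up and not kernel API:

#include <stdio.h>

struct ops { int (*run)(void); };

/* Weak, zero-filled definition -- analogous to the __weak protos in core.c.
 * A strong definition of default_ops in another object file overrides it at
 * link time, which is what helpers.c provides when it is compiled in. */
const struct ops default_ops __attribute__((weak));

int main(void)
{
	if (default_ops.run)		/* NULL unless a strong definition was linked */
		printf("run() -> %d\n", default_ops.run());
	else
		printf("helper not available\n");
	return 0;
}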
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index b3ba43674310..83c209d9b17a 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -345,7 +345,7 @@ static void htab_map_free(struct bpf_map *map) | |||
345 | kfree(htab); | 345 | kfree(htab); |
346 | } | 346 | } |
347 | 347 | ||
348 | static struct bpf_map_ops htab_ops = { | 348 | static const struct bpf_map_ops htab_ops = { |
349 | .map_alloc = htab_map_alloc, | 349 | .map_alloc = htab_map_alloc, |
350 | .map_free = htab_map_free, | 350 | .map_free = htab_map_free, |
351 | .map_get_next_key = htab_map_get_next_key, | 351 | .map_get_next_key = htab_map_get_next_key, |
@@ -354,14 +354,14 @@ static struct bpf_map_ops htab_ops = { | |||
354 | .map_delete_elem = htab_map_delete_elem, | 354 | .map_delete_elem = htab_map_delete_elem, |
355 | }; | 355 | }; |
356 | 356 | ||
357 | static struct bpf_map_type_list tl = { | 357 | static struct bpf_map_type_list htab_type __read_mostly = { |
358 | .ops = &htab_ops, | 358 | .ops = &htab_ops, |
359 | .type = BPF_MAP_TYPE_HASH, | 359 | .type = BPF_MAP_TYPE_HASH, |
360 | }; | 360 | }; |
361 | 361 | ||
362 | static int __init register_htab_map(void) | 362 | static int __init register_htab_map(void) |
363 | { | 363 | { |
364 | bpf_register_map_type(&tl); | 364 | bpf_register_map_type(&htab_type); |
365 | return 0; | 365 | return 0; |
366 | } | 366 | } |
367 | late_initcall(register_htab_map); | 367 | late_initcall(register_htab_map); |
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 9e3414d85459..bd7f5988ed9c 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -11,6 +11,8 @@ | |||
11 | */ | 11 | */ |
12 | #include <linux/bpf.h> | 12 | #include <linux/bpf.h> |
13 | #include <linux/rcupdate.h> | 13 | #include <linux/rcupdate.h> |
14 | #include <linux/random.h> | ||
15 | #include <linux/smp.h> | ||
14 | 16 | ||
15 | /* If kernel subsystem is allowing eBPF programs to call this function, | 17 | /* If kernel subsystem is allowing eBPF programs to call this function, |
16 | * inside its own verifier_ops->get_func_proto() callback it should return | 18 | * inside its own verifier_ops->get_func_proto() callback it should return |
@@ -41,7 +43,7 @@ static u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | |||
41 | return (unsigned long) value; | 43 | return (unsigned long) value; |
42 | } | 44 | } |
43 | 45 | ||
44 | struct bpf_func_proto bpf_map_lookup_elem_proto = { | 46 | const struct bpf_func_proto bpf_map_lookup_elem_proto = { |
45 | .func = bpf_map_lookup_elem, | 47 | .func = bpf_map_lookup_elem, |
46 | .gpl_only = false, | 48 | .gpl_only = false, |
47 | .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, | 49 | .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, |
@@ -60,7 +62,7 @@ static u64 bpf_map_update_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | |||
60 | return map->ops->map_update_elem(map, key, value, r4); | 62 | return map->ops->map_update_elem(map, key, value, r4); |
61 | } | 63 | } |
62 | 64 | ||
63 | struct bpf_func_proto bpf_map_update_elem_proto = { | 65 | const struct bpf_func_proto bpf_map_update_elem_proto = { |
64 | .func = bpf_map_update_elem, | 66 | .func = bpf_map_update_elem, |
65 | .gpl_only = false, | 67 | .gpl_only = false, |
66 | .ret_type = RET_INTEGER, | 68 | .ret_type = RET_INTEGER, |
@@ -80,10 +82,32 @@ static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | |||
80 | return map->ops->map_delete_elem(map, key); | 82 | return map->ops->map_delete_elem(map, key); |
81 | } | 83 | } |
82 | 84 | ||
83 | struct bpf_func_proto bpf_map_delete_elem_proto = { | 85 | const struct bpf_func_proto bpf_map_delete_elem_proto = { |
84 | .func = bpf_map_delete_elem, | 86 | .func = bpf_map_delete_elem, |
85 | .gpl_only = false, | 87 | .gpl_only = false, |
86 | .ret_type = RET_INTEGER, | 88 | .ret_type = RET_INTEGER, |
87 | .arg1_type = ARG_CONST_MAP_PTR, | 89 | .arg1_type = ARG_CONST_MAP_PTR, |
88 | .arg2_type = ARG_PTR_TO_MAP_KEY, | 90 | .arg2_type = ARG_PTR_TO_MAP_KEY, |
89 | }; | 91 | }; |
92 | |||
93 | static u64 bpf_get_prandom_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | ||
94 | { | ||
95 | return prandom_u32(); | ||
96 | } | ||
97 | |||
98 | const struct bpf_func_proto bpf_get_prandom_u32_proto = { | ||
99 | .func = bpf_get_prandom_u32, | ||
100 | .gpl_only = false, | ||
101 | .ret_type = RET_INTEGER, | ||
102 | }; | ||
103 | |||
104 | static u64 bpf_get_smp_processor_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | ||
105 | { | ||
106 | return raw_smp_processor_id(); | ||
107 | } | ||
108 | |||
109 | const struct bpf_func_proto bpf_get_smp_processor_id_proto = { | ||
110 | .func = bpf_get_smp_processor_id, | ||
111 | .gpl_only = false, | ||
112 | .ret_type = RET_INTEGER, | ||
113 | }; | ||
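The two helpers added above become callable from eBPF programs through their BPF_FUNC_* ids. A rough sketch in restricted C, in the style of samples/bpf of this era; the function-pointer casts and the program body are illustrative assumptions, not part of this patch:

#include <linux/bpf.h>

/* samples/bpf-style wrappers binding helpers by id (assumed convention) */
static unsigned int (*bpf_get_prandom_u32)(void) =
	(void *) BPF_FUNC_get_prandom_u32;
static unsigned int (*bpf_get_smp_processor_id)(void) =
	(void *) BPF_FUNC_get_smp_processor_id;

/* Toy socket-filter body: keep roughly 1 packet in 16 and encode the CPU
 * that accepted it in the non-zero return value. */
int sample_prog(struct __sk_buff *skb)
{
	if ((bpf_get_prandom_u32() & 0xf) == 0)
		return bpf_get_smp_processor_id() + 1;
	return 0;
}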
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 536edc2be307..3bae6c591914 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -16,6 +16,7 @@ | |||
16 | #include <linux/file.h> | 16 | #include <linux/file.h> |
17 | #include <linux/license.h> | 17 | #include <linux/license.h> |
18 | #include <linux/filter.h> | 18 | #include <linux/filter.h> |
19 | #include <linux/version.h> | ||
19 | 20 | ||
20 | static LIST_HEAD(bpf_map_types); | 21 | static LIST_HEAD(bpf_map_types); |
21 | 22 | ||
@@ -354,10 +355,11 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) | |||
354 | list_for_each_entry(tl, &bpf_prog_types, list_node) { | 355 | list_for_each_entry(tl, &bpf_prog_types, list_node) { |
355 | if (tl->type == type) { | 356 | if (tl->type == type) { |
356 | prog->aux->ops = tl->ops; | 357 | prog->aux->ops = tl->ops; |
357 | prog->aux->prog_type = type; | 358 | prog->type = type; |
358 | return 0; | 359 | return 0; |
359 | } | 360 | } |
360 | } | 361 | } |
362 | |||
361 | return -EINVAL; | 363 | return -EINVAL; |
362 | } | 364 | } |
363 | 365 | ||
@@ -418,6 +420,7 @@ void bpf_prog_put(struct bpf_prog *prog) | |||
418 | bpf_prog_free(prog); | 420 | bpf_prog_free(prog); |
419 | } | 421 | } |
420 | } | 422 | } |
423 | EXPORT_SYMBOL_GPL(bpf_prog_put); | ||
421 | 424 | ||
422 | static int bpf_prog_release(struct inode *inode, struct file *filp) | 425 | static int bpf_prog_release(struct inode *inode, struct file *filp) |
423 | { | 426 | { |
@@ -465,9 +468,10 @@ struct bpf_prog *bpf_prog_get(u32 ufd) | |||
465 | fdput(f); | 468 | fdput(f); |
466 | return prog; | 469 | return prog; |
467 | } | 470 | } |
471 | EXPORT_SYMBOL_GPL(bpf_prog_get); | ||
468 | 472 | ||
469 | /* last field in 'union bpf_attr' used by this command */ | 473 | /* last field in 'union bpf_attr' used by this command */ |
470 | #define BPF_PROG_LOAD_LAST_FIELD log_buf | 474 | #define BPF_PROG_LOAD_LAST_FIELD kern_version |
471 | 475 | ||
472 | static int bpf_prog_load(union bpf_attr *attr) | 476 | static int bpf_prog_load(union bpf_attr *attr) |
473 | { | 477 | { |
@@ -492,6 +496,10 @@ static int bpf_prog_load(union bpf_attr *attr) | |||
492 | if (attr->insn_cnt >= BPF_MAXINSNS) | 496 | if (attr->insn_cnt >= BPF_MAXINSNS) |
493 | return -EINVAL; | 497 | return -EINVAL; |
494 | 498 | ||
499 | if (type == BPF_PROG_TYPE_KPROBE && | ||
500 | attr->kern_version != LINUX_VERSION_CODE) | ||
501 | return -EINVAL; | ||
502 | |||
495 | /* plain bpf_prog allocation */ | 503 | /* plain bpf_prog allocation */ |
496 | prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); | 504 | prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); |
497 | if (!prog) | 505 | if (!prog) |
@@ -508,7 +516,7 @@ static int bpf_prog_load(union bpf_attr *attr) | |||
508 | prog->jited = false; | 516 | prog->jited = false; |
509 | 517 | ||
510 | atomic_set(&prog->aux->refcnt, 1); | 518 | atomic_set(&prog->aux->refcnt, 1); |
511 | prog->aux->is_gpl_compatible = is_gpl; | 519 | prog->gpl_compatible = is_gpl; |
512 | 520 | ||
513 | /* find program type: socket_filter vs tracing_filter */ | 521 | /* find program type: socket_filter vs tracing_filter */ |
514 | err = find_prog_type(type, prog); | 522 | err = find_prog_type(type, prog); |
@@ -516,8 +524,7 @@ static int bpf_prog_load(union bpf_attr *attr) | |||
516 | goto free_prog; | 524 | goto free_prog; |
517 | 525 | ||
518 | /* run eBPF verifier */ | 526 | /* run eBPF verifier */ |
519 | err = bpf_check(prog, attr); | 527 | err = bpf_check(&prog, attr); |
520 | |||
521 | if (err < 0) | 528 | if (err < 0) |
522 | goto free_used_maps; | 529 | goto free_used_maps; |
523 | 530 | ||
@@ -528,7 +535,6 @@ static int bpf_prog_load(union bpf_attr *attr) | |||
528 | bpf_prog_select_runtime(prog); | 535 | bpf_prog_select_runtime(prog); |
529 | 536 | ||
530 | err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC); | 537 | err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC); |
531 | |||
532 | if (err < 0) | 538 | if (err < 0) |
533 | /* failed to allocate fd */ | 539 | /* failed to allocate fd */ |
534 | goto free_used_maps; | 540 | goto free_used_maps; |
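The kern_version check added above means user space must stamp kprobe program loads with the version code of the running kernel; since kprobe programs poke at internal data structures, a mismatch is rejected up front. A load-time sketch, assuming a 4.1-era uapi header and a libc exposing __NR_bpf; preparing the instructions and license string is left to the caller:

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>
#include <linux/version.h>

static int load_kprobe_prog(const struct bpf_insn *insns, unsigned int insn_cnt,
			    const char *license)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.prog_type    = BPF_PROG_TYPE_KPROBE;
	attr.insns        = (__u64) (unsigned long) insns;
	attr.insn_cnt     = insn_cnt;
	attr.license      = (__u64) (unsigned long) license;
	/* for BPF_PROG_TYPE_KPROBE the kernel now insists on an exact match */
	attr.kern_version = LINUX_VERSION_CODE;

	return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
}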
diff --git a/kernel/bpf/test_stub.c b/kernel/bpf/test_stub.c
deleted file mode 100644
index 0ceae1e6e8b5..000000000000
--- a/kernel/bpf/test_stub.c
+++ /dev/null
@@ -1,78 +0,0 @@ | |||
1 | /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com | ||
2 | * | ||
3 | * This program is free software; you can redistribute it and/or | ||
4 | * modify it under the terms of version 2 of the GNU General Public | ||
5 | * License as published by the Free Software Foundation. | ||
6 | */ | ||
7 | #include <linux/kernel.h> | ||
8 | #include <linux/types.h> | ||
9 | #include <linux/slab.h> | ||
10 | #include <linux/err.h> | ||
11 | #include <linux/bpf.h> | ||
12 | |||
13 | /* test stubs for BPF_MAP_TYPE_UNSPEC and for BPF_PROG_TYPE_UNSPEC | ||
14 | * to be used by user space verifier testsuite | ||
15 | */ | ||
16 | struct bpf_context { | ||
17 | u64 arg1; | ||
18 | u64 arg2; | ||
19 | }; | ||
20 | |||
21 | static const struct bpf_func_proto *test_func_proto(enum bpf_func_id func_id) | ||
22 | { | ||
23 | switch (func_id) { | ||
24 | case BPF_FUNC_map_lookup_elem: | ||
25 | return &bpf_map_lookup_elem_proto; | ||
26 | case BPF_FUNC_map_update_elem: | ||
27 | return &bpf_map_update_elem_proto; | ||
28 | case BPF_FUNC_map_delete_elem: | ||
29 | return &bpf_map_delete_elem_proto; | ||
30 | default: | ||
31 | return NULL; | ||
32 | } | ||
33 | } | ||
34 | |||
35 | static const struct bpf_context_access { | ||
36 | int size; | ||
37 | enum bpf_access_type type; | ||
38 | } test_ctx_access[] = { | ||
39 | [offsetof(struct bpf_context, arg1)] = { | ||
40 | FIELD_SIZEOF(struct bpf_context, arg1), | ||
41 | BPF_READ | ||
42 | }, | ||
43 | [offsetof(struct bpf_context, arg2)] = { | ||
44 | FIELD_SIZEOF(struct bpf_context, arg2), | ||
45 | BPF_READ | ||
46 | }, | ||
47 | }; | ||
48 | |||
49 | static bool test_is_valid_access(int off, int size, enum bpf_access_type type) | ||
50 | { | ||
51 | const struct bpf_context_access *access; | ||
52 | |||
53 | if (off < 0 || off >= ARRAY_SIZE(test_ctx_access)) | ||
54 | return false; | ||
55 | |||
56 | access = &test_ctx_access[off]; | ||
57 | if (access->size == size && (access->type & type)) | ||
58 | return true; | ||
59 | |||
60 | return false; | ||
61 | } | ||
62 | |||
63 | static struct bpf_verifier_ops test_ops = { | ||
64 | .get_func_proto = test_func_proto, | ||
65 | .is_valid_access = test_is_valid_access, | ||
66 | }; | ||
67 | |||
68 | static struct bpf_prog_type_list tl_prog = { | ||
69 | .ops = &test_ops, | ||
70 | .type = BPF_PROG_TYPE_UNSPEC, | ||
71 | }; | ||
72 | |||
73 | static int __init register_test_ops(void) | ||
74 | { | ||
75 | bpf_register_prog_type(&tl_prog); | ||
76 | return 0; | ||
77 | } | ||
78 | late_initcall(register_test_ops); | ||
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a28e09c7825d..47dcd3aa6e23 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -755,7 +755,7 @@ static int check_func_arg(struct verifier_env *env, u32 regno, | |||
755 | enum bpf_reg_type expected_type; | 755 | enum bpf_reg_type expected_type; |
756 | int err = 0; | 756 | int err = 0; |
757 | 757 | ||
758 | if (arg_type == ARG_ANYTHING) | 758 | if (arg_type == ARG_DONTCARE) |
759 | return 0; | 759 | return 0; |
760 | 760 | ||
761 | if (reg->type == NOT_INIT) { | 761 | if (reg->type == NOT_INIT) { |
@@ -763,6 +763,9 @@ static int check_func_arg(struct verifier_env *env, u32 regno, | |||
763 | return -EACCES; | 763 | return -EACCES; |
764 | } | 764 | } |
765 | 765 | ||
766 | if (arg_type == ARG_ANYTHING) | ||
767 | return 0; | ||
768 | |||
766 | if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY || | 769 | if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY || |
767 | arg_type == ARG_PTR_TO_MAP_VALUE) { | 770 | arg_type == ARG_PTR_TO_MAP_VALUE) { |
768 | expected_type = PTR_TO_STACK; | 771 | expected_type = PTR_TO_STACK; |
@@ -770,6 +773,8 @@ static int check_func_arg(struct verifier_env *env, u32 regno, | |||
770 | expected_type = CONST_IMM; | 773 | expected_type = CONST_IMM; |
771 | } else if (arg_type == ARG_CONST_MAP_PTR) { | 774 | } else if (arg_type == ARG_CONST_MAP_PTR) { |
772 | expected_type = CONST_PTR_TO_MAP; | 775 | expected_type = CONST_PTR_TO_MAP; |
776 | } else if (arg_type == ARG_PTR_TO_CTX) { | ||
777 | expected_type = PTR_TO_CTX; | ||
773 | } else { | 778 | } else { |
774 | verbose("unsupported arg_type %d\n", arg_type); | 779 | verbose("unsupported arg_type %d\n", arg_type); |
775 | return -EFAULT; | 780 | return -EFAULT; |
@@ -852,7 +857,7 @@ static int check_call(struct verifier_env *env, int func_id) | |||
852 | } | 857 | } |
853 | 858 | ||
854 | /* eBPF programs must be GPL compatible to use GPL-ed functions */ | 859 | /* eBPF programs must be GPL compatible to use GPL-ed functions */ |
855 | if (!env->prog->aux->is_gpl_compatible && fn->gpl_only) { | 860 | if (!env->prog->gpl_compatible && fn->gpl_only) { |
856 | verbose("cannot call GPL only function from proprietary program\n"); | 861 | verbose("cannot call GPL only function from proprietary program\n"); |
857 | return -EINVAL; | 862 | return -EINVAL; |
858 | } | 863 | } |
@@ -1172,6 +1177,18 @@ static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn) | |||
1172 | return 0; | 1177 | return 0; |
1173 | } | 1178 | } |
1174 | 1179 | ||
1180 | static bool may_access_skb(enum bpf_prog_type type) | ||
1181 | { | ||
1182 | switch (type) { | ||
1183 | case BPF_PROG_TYPE_SOCKET_FILTER: | ||
1184 | case BPF_PROG_TYPE_SCHED_CLS: | ||
1185 | case BPF_PROG_TYPE_SCHED_ACT: | ||
1186 | return true; | ||
1187 | default: | ||
1188 | return false; | ||
1189 | } | ||
1190 | } | ||
1191 | |||
1175 | /* verify safety of LD_ABS|LD_IND instructions: | 1192 | /* verify safety of LD_ABS|LD_IND instructions: |
1176 | * - they can only appear in the programs where ctx == skb | 1193 | * - they can only appear in the programs where ctx == skb |
1177 | * - since they are wrappers of function calls, they scratch R1-R5 registers, | 1194 | * - since they are wrappers of function calls, they scratch R1-R5 registers, |
@@ -1194,8 +1211,8 @@ static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn) | |||
1194 | struct reg_state *reg; | 1211 | struct reg_state *reg; |
1195 | int i, err; | 1212 | int i, err; |
1196 | 1213 | ||
1197 | if (env->prog->aux->prog_type != BPF_PROG_TYPE_SOCKET_FILTER) { | 1214 | if (!may_access_skb(env->prog->type)) { |
1198 | verbose("BPF_LD_ABS|IND instructions are only allowed in socket filters\n"); | 1215 | verbose("BPF_LD_ABS|IND instructions not allowed for this program type\n"); |
1199 | return -EINVAL; | 1216 | return -EINVAL; |
1200 | } | 1217 | } |
1201 | 1218 | ||
@@ -1380,7 +1397,8 @@ peek_stack: | |||
1380 | /* tell verifier to check for equivalent states | 1397 | /* tell verifier to check for equivalent states |
1381 | * after every call and jump | 1398 | * after every call and jump |
1382 | */ | 1399 | */ |
1383 | env->explored_states[t + 1] = STATE_LIST_MARK; | 1400 | if (t + 1 < insn_cnt) |
1401 | env->explored_states[t + 1] = STATE_LIST_MARK; | ||
1384 | } else { | 1402 | } else { |
1385 | /* conditional jump with two edges */ | 1403 | /* conditional jump with two edges */ |
1386 | ret = push_insn(t, t + 1, FALLTHROUGH, env); | 1404 | ret = push_insn(t, t + 1, FALLTHROUGH, env); |
@@ -1606,11 +1624,10 @@ static int do_check(struct verifier_env *env) | |||
1606 | return err; | 1624 | return err; |
1607 | 1625 | ||
1608 | } else if (class == BPF_LDX) { | 1626 | } else if (class == BPF_LDX) { |
1609 | if (BPF_MODE(insn->code) != BPF_MEM || | 1627 | enum bpf_reg_type src_reg_type; |
1610 | insn->imm != 0) { | 1628 | |
1611 | verbose("BPF_LDX uses reserved fields\n"); | 1629 | /* check for reserved fields is already done */ |
1612 | return -EINVAL; | 1630 | |
1613 | } | ||
1614 | /* check src operand */ | 1631 | /* check src operand */ |
1615 | err = check_reg_arg(regs, insn->src_reg, SRC_OP); | 1632 | err = check_reg_arg(regs, insn->src_reg, SRC_OP); |
1616 | if (err) | 1633 | if (err) |
@@ -1620,6 +1637,8 @@ static int do_check(struct verifier_env *env) | |||
1620 | if (err) | 1637 | if (err) |
1621 | return err; | 1638 | return err; |
1622 | 1639 | ||
1640 | src_reg_type = regs[insn->src_reg].type; | ||
1641 | |||
1623 | /* check that memory (src_reg + off) is readable, | 1642 | /* check that memory (src_reg + off) is readable, |
1624 | * the state of dst_reg will be updated by this func | 1643 | * the state of dst_reg will be updated by this func |
1625 | */ | 1644 | */ |
@@ -1629,6 +1648,32 @@ static int do_check(struct verifier_env *env) | |||
1629 | if (err) | 1648 | if (err) |
1630 | return err; | 1649 | return err; |
1631 | 1650 | ||
1651 | if (BPF_SIZE(insn->code) != BPF_W) { | ||
1652 | insn_idx++; | ||
1653 | continue; | ||
1654 | } | ||
1655 | |||
1656 | if (insn->imm == 0) { | ||
1657 | /* saw a valid insn | ||
1658 | * dst_reg = *(u32 *)(src_reg + off) | ||
1659 | * use reserved 'imm' field to mark this insn | ||
1660 | */ | ||
1661 | insn->imm = src_reg_type; | ||
1662 | |||
1663 | } else if (src_reg_type != insn->imm && | ||
1664 | (src_reg_type == PTR_TO_CTX || | ||
1665 | insn->imm == PTR_TO_CTX)) { | ||
1666 | /* ABuser program is trying to use the same insn | ||
1667 | * dst_reg = *(u32*) (src_reg + off) | ||
1668 | * with different pointer types: | ||
1669 | * src_reg == ctx in one branch and | ||
1670 | * src_reg == stack|map in some other branch. | ||
1671 | * Reject it. | ||
1672 | */ | ||
1673 | verbose("same insn cannot be used with different pointers\n"); | ||
1674 | return -EINVAL; | ||
1675 | } | ||
1676 | |||
1632 | } else if (class == BPF_STX) { | 1677 | } else if (class == BPF_STX) { |
1633 | if (BPF_MODE(insn->code) == BPF_XADD) { | 1678 | if (BPF_MODE(insn->code) == BPF_XADD) { |
1634 | err = check_xadd(env, insn); | 1679 | err = check_xadd(env, insn); |
@@ -1776,6 +1821,13 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env) | |||
1776 | int i, j; | 1821 | int i, j; |
1777 | 1822 | ||
1778 | for (i = 0; i < insn_cnt; i++, insn++) { | 1823 | for (i = 0; i < insn_cnt; i++, insn++) { |
1824 | if (BPF_CLASS(insn->code) == BPF_LDX && | ||
1825 | (BPF_MODE(insn->code) != BPF_MEM || | ||
1826 | insn->imm != 0)) { | ||
1827 | verbose("BPF_LDX uses reserved fields\n"); | ||
1828 | return -EINVAL; | ||
1829 | } | ||
1830 | |||
1779 | if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) { | 1831 | if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) { |
1780 | struct bpf_map *map; | 1832 | struct bpf_map *map; |
1781 | struct fd f; | 1833 | struct fd f; |
@@ -1867,6 +1919,92 @@ static void convert_pseudo_ld_imm64(struct verifier_env *env) | |||
1867 | insn->src_reg = 0; | 1919 | insn->src_reg = 0; |
1868 | } | 1920 | } |
1869 | 1921 | ||
1922 | static void adjust_branches(struct bpf_prog *prog, int pos, int delta) | ||
1923 | { | ||
1924 | struct bpf_insn *insn = prog->insnsi; | ||
1925 | int insn_cnt = prog->len; | ||
1926 | int i; | ||
1927 | |||
1928 | for (i = 0; i < insn_cnt; i++, insn++) { | ||
1929 | if (BPF_CLASS(insn->code) != BPF_JMP || | ||
1930 | BPF_OP(insn->code) == BPF_CALL || | ||
1931 | BPF_OP(insn->code) == BPF_EXIT) | ||
1932 | continue; | ||
1933 | |||
1934 | /* adjust offset of jmps if necessary */ | ||
1935 | if (i < pos && i + insn->off + 1 > pos) | ||
1936 | insn->off += delta; | ||
1937 | else if (i > pos && i + insn->off + 1 < pos) | ||
1938 | insn->off -= delta; | ||
1939 | } | ||
1940 | } | ||
1941 | |||
1942 | /* convert load instructions that access fields of 'struct __sk_buff' | ||
1943 | * into sequence of instructions that access fields of 'struct sk_buff' | ||
1944 | */ | ||
1945 | static int convert_ctx_accesses(struct verifier_env *env) | ||
1946 | { | ||
1947 | struct bpf_insn *insn = env->prog->insnsi; | ||
1948 | int insn_cnt = env->prog->len; | ||
1949 | struct bpf_insn insn_buf[16]; | ||
1950 | struct bpf_prog *new_prog; | ||
1951 | u32 cnt; | ||
1952 | int i; | ||
1953 | |||
1954 | if (!env->prog->aux->ops->convert_ctx_access) | ||
1955 | return 0; | ||
1956 | |||
1957 | for (i = 0; i < insn_cnt; i++, insn++) { | ||
1958 | if (insn->code != (BPF_LDX | BPF_MEM | BPF_W)) | ||
1959 | continue; | ||
1960 | |||
1961 | if (insn->imm != PTR_TO_CTX) { | ||
1962 | /* clear internal mark */ | ||
1963 | insn->imm = 0; | ||
1964 | continue; | ||
1965 | } | ||
1966 | |||
1967 | cnt = env->prog->aux->ops-> | ||
1968 | convert_ctx_access(insn->dst_reg, insn->src_reg, | ||
1969 | insn->off, insn_buf); | ||
1970 | if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { | ||
1971 | verbose("bpf verifier is misconfigured\n"); | ||
1972 | return -EINVAL; | ||
1973 | } | ||
1974 | |||
1975 | if (cnt == 1) { | ||
1976 | memcpy(insn, insn_buf, sizeof(*insn)); | ||
1977 | continue; | ||
1978 | } | ||
1979 | |||
1980 | /* several new insns need to be inserted. Make room for them */ | ||
1981 | insn_cnt += cnt - 1; | ||
1982 | new_prog = bpf_prog_realloc(env->prog, | ||
1983 | bpf_prog_size(insn_cnt), | ||
1984 | GFP_USER); | ||
1985 | if (!new_prog) | ||
1986 | return -ENOMEM; | ||
1987 | |||
1988 | new_prog->len = insn_cnt; | ||
1989 | |||
1990 | memmove(new_prog->insnsi + i + cnt, new_prog->insns + i + 1, | ||
1991 | sizeof(*insn) * (insn_cnt - i - cnt)); | ||
1992 | |||
1993 | /* copy substitute insns in place of load instruction */ | ||
1994 | memcpy(new_prog->insnsi + i, insn_buf, sizeof(*insn) * cnt); | ||
1995 | |||
1996 | /* adjust branches in the whole program */ | ||
1997 | adjust_branches(new_prog, i, cnt - 1); | ||
1998 | |||
1999 | /* keep walking new program and skip insns we just inserted */ | ||
2000 | env->prog = new_prog; | ||
2001 | insn = new_prog->insnsi + i + cnt - 1; | ||
2002 | i += cnt - 1; | ||
2003 | } | ||
2004 | |||
2005 | return 0; | ||
2006 | } | ||
2007 | |||
1870 | static void free_states(struct verifier_env *env) | 2008 | static void free_states(struct verifier_env *env) |
1871 | { | 2009 | { |
1872 | struct verifier_state_list *sl, *sln; | 2010 | struct verifier_state_list *sl, *sln; |
@@ -1889,13 +2027,13 @@ static void free_states(struct verifier_env *env) | |||
1889 | kfree(env->explored_states); | 2027 | kfree(env->explored_states); |
1890 | } | 2028 | } |
1891 | 2029 | ||
1892 | int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) | 2030 | int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) |
1893 | { | 2031 | { |
1894 | char __user *log_ubuf = NULL; | 2032 | char __user *log_ubuf = NULL; |
1895 | struct verifier_env *env; | 2033 | struct verifier_env *env; |
1896 | int ret = -EINVAL; | 2034 | int ret = -EINVAL; |
1897 | 2035 | ||
1898 | if (prog->len <= 0 || prog->len > BPF_MAXINSNS) | 2036 | if ((*prog)->len <= 0 || (*prog)->len > BPF_MAXINSNS) |
1899 | return -E2BIG; | 2037 | return -E2BIG; |
1900 | 2038 | ||
1901 | /* 'struct verifier_env' can be global, but since it's not small, | 2039 | /* 'struct verifier_env' can be global, but since it's not small, |
@@ -1905,7 +2043,7 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) | |||
1905 | if (!env) | 2043 | if (!env) |
1906 | return -ENOMEM; | 2044 | return -ENOMEM; |
1907 | 2045 | ||
1908 | env->prog = prog; | 2046 | env->prog = *prog; |
1909 | 2047 | ||
1910 | /* grab the mutex to protect few globals used by verifier */ | 2048 | /* grab the mutex to protect few globals used by verifier */ |
1911 | mutex_lock(&bpf_verifier_lock); | 2049 | mutex_lock(&bpf_verifier_lock); |
@@ -1937,7 +2075,7 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) | |||
1937 | if (ret < 0) | 2075 | if (ret < 0) |
1938 | goto skip_full_check; | 2076 | goto skip_full_check; |
1939 | 2077 | ||
1940 | env->explored_states = kcalloc(prog->len, | 2078 | env->explored_states = kcalloc(env->prog->len, |
1941 | sizeof(struct verifier_state_list *), | 2079 | sizeof(struct verifier_state_list *), |
1942 | GFP_USER); | 2080 | GFP_USER); |
1943 | ret = -ENOMEM; | 2081 | ret = -ENOMEM; |
@@ -1954,6 +2092,10 @@ skip_full_check: | |||
1954 | while (pop_stack(env, NULL) >= 0); | 2092 | while (pop_stack(env, NULL) >= 0); |
1955 | free_states(env); | 2093 | free_states(env); |
1956 | 2094 | ||
2095 | if (ret == 0) | ||
2096 | /* program is valid, convert *(u32*)(ctx + off) accesses */ | ||
2097 | ret = convert_ctx_accesses(env); | ||
2098 | |||
1957 | if (log_level && log_len >= log_size - 1) { | 2099 | if (log_level && log_len >= log_size - 1) { |
1958 | BUG_ON(log_len >= log_size); | 2100 | BUG_ON(log_len >= log_size); |
1959 | /* verifier log exceeded user supplied buffer */ | 2101 | /* verifier log exceeded user supplied buffer */ |
@@ -1969,18 +2111,18 @@ skip_full_check: | |||
1969 | 2111 | ||
1970 | if (ret == 0 && env->used_map_cnt) { | 2112 | if (ret == 0 && env->used_map_cnt) { |
1971 | /* if program passed verifier, update used_maps in bpf_prog_info */ | 2113 | /* if program passed verifier, update used_maps in bpf_prog_info */ |
1972 | prog->aux->used_maps = kmalloc_array(env->used_map_cnt, | 2114 | env->prog->aux->used_maps = kmalloc_array(env->used_map_cnt, |
1973 | sizeof(env->used_maps[0]), | 2115 | sizeof(env->used_maps[0]), |
1974 | GFP_KERNEL); | 2116 | GFP_KERNEL); |
1975 | 2117 | ||
1976 | if (!prog->aux->used_maps) { | 2118 | if (!env->prog->aux->used_maps) { |
1977 | ret = -ENOMEM; | 2119 | ret = -ENOMEM; |
1978 | goto free_log_buf; | 2120 | goto free_log_buf; |
1979 | } | 2121 | } |
1980 | 2122 | ||
1981 | memcpy(prog->aux->used_maps, env->used_maps, | 2123 | memcpy(env->prog->aux->used_maps, env->used_maps, |
1982 | sizeof(env->used_maps[0]) * env->used_map_cnt); | 2124 | sizeof(env->used_maps[0]) * env->used_map_cnt); |
1983 | prog->aux->used_map_cnt = env->used_map_cnt; | 2125 | env->prog->aux->used_map_cnt = env->used_map_cnt; |
1984 | 2126 | ||
1985 | /* program is valid. Convert pseudo bpf_ld_imm64 into generic | 2127 | /* program is valid. Convert pseudo bpf_ld_imm64 into generic |
1986 | * bpf_ld_imm64 instructions | 2128 | * bpf_ld_imm64 instructions |
@@ -1992,11 +2134,12 @@ free_log_buf: | |||
1992 | if (log_level) | 2134 | if (log_level) |
1993 | vfree(log_buf); | 2135 | vfree(log_buf); |
1994 | free_env: | 2136 | free_env: |
1995 | if (!prog->aux->used_maps) | 2137 | if (!env->prog->aux->used_maps) |
1996 | /* if we didn't copy map pointers into bpf_prog_info, release | 2138 | /* if we didn't copy map pointers into bpf_prog_info, release |
1997 | * them now. Otherwise free_bpf_prog_info() will release them. | 2139 | * them now. Otherwise free_bpf_prog_info() will release them. |
1998 | */ | 2140 | */ |
1999 | release_maps(env); | 2141 | release_maps(env); |
2142 | *prog = env->prog; | ||
2000 | kfree(env); | 2143 | kfree(env); |
2001 | mutex_unlock(&bpf_verifier_lock); | 2144 | mutex_unlock(&bpf_verifier_lock); |
2002 | return ret; | 2145 | return ret; |
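convert_ctx_accesses() above rewrites validated 32-bit context loads using the per-program-type convert_ctx_access() callback. A sketch of what such a callback can look like, with one illustrative field; the real socket-filter implementation lives in net/core/filter.c, so this is an assumption-laden example, not that code:

#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/skbuff.h>

/* Rewrite "dst_reg = *(u32 *)(ctx + ctx_off)" into loads from the real
 * struct sk_buff. Returns the number of instructions written, which the
 * verifier uses to make room and adjust branch offsets. */
static u32 sample_convert_ctx_access(int dst_reg, int src_reg, int ctx_off,
				     struct bpf_insn *insn_buf)
{
	struct bpf_insn *insn = insn_buf;

	switch (ctx_off) {
	case offsetof(struct __sk_buff, len):
		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
				      offsetof(struct sk_buff, len));
		break;
	default:
		/* unknown field: load a constant 0 instead */
		*insn++ = BPF_MOV64_IMM(dst_reg, 0);
		break;
	}

	return insn - insn_buf;
}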
diff --git a/kernel/capability.c b/kernel/capability.c
index 989f5bfc57dc..45432b54d5c6 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -35,6 +35,7 @@ static int __init file_caps_disable(char *str) | |||
35 | } | 35 | } |
36 | __setup("no_file_caps", file_caps_disable); | 36 | __setup("no_file_caps", file_caps_disable); |
37 | 37 | ||
38 | #ifdef CONFIG_MULTIUSER | ||
38 | /* | 39 | /* |
39 | * More recent versions of libcap are available from: | 40 | * More recent versions of libcap are available from: |
40 | * | 41 | * |
@@ -386,6 +387,24 @@ bool ns_capable(struct user_namespace *ns, int cap) | |||
386 | } | 387 | } |
387 | EXPORT_SYMBOL(ns_capable); | 388 | EXPORT_SYMBOL(ns_capable); |
388 | 389 | ||
390 | |||
391 | /** | ||
392 | * capable - Determine if the current task has a superior capability in effect | ||
393 | * @cap: The capability to be tested for | ||
394 | * | ||
395 | * Return true if the current task has the given superior capability currently | ||
396 | * available for use, false if not. | ||
397 | * | ||
398 | * This sets PF_SUPERPRIV on the task if the capability is available on the | ||
399 | * assumption that it's about to be used. | ||
400 | */ | ||
401 | bool capable(int cap) | ||
402 | { | ||
403 | return ns_capable(&init_user_ns, cap); | ||
404 | } | ||
405 | EXPORT_SYMBOL(capable); | ||
406 | #endif /* CONFIG_MULTIUSER */ | ||
407 | |||
389 | /** | 408 | /** |
390 | * file_ns_capable - Determine if the file's opener had a capability in effect | 409 | * file_ns_capable - Determine if the file's opener had a capability in effect |
391 | * @file: The file we want to check | 410 | * @file: The file we want to check |
@@ -412,22 +431,6 @@ bool file_ns_capable(const struct file *file, struct user_namespace *ns, | |||
412 | EXPORT_SYMBOL(file_ns_capable); | 431 | EXPORT_SYMBOL(file_ns_capable); |
413 | 432 | ||
414 | /** | 433 | /** |
415 | * capable - Determine if the current task has a superior capability in effect | ||
416 | * @cap: The capability to be tested for | ||
417 | * | ||
418 | * Return true if the current task has the given superior capability currently | ||
419 | * available for use, false if not. | ||
420 | * | ||
421 | * This sets PF_SUPERPRIV on the task if the capability is available on the | ||
422 | * assumption that it's about to be used. | ||
423 | */ | ||
424 | bool capable(int cap) | ||
425 | { | ||
426 | return ns_capable(&init_user_ns, cap); | ||
427 | } | ||
428 | EXPORT_SYMBOL(capable); | ||
429 | |||
430 | /** | ||
431 | * capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped | 434 | * capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped |
432 | * @inode: The inode in question | 435 | * @inode: The inode in question |
433 | * @cap: The capability in question | 436 | * @cap: The capability in question |
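With the capability.c hunk above, capable() and ns_capable() are only built when CONFIG_MULTIUSER=y. For the single-user configuration the checks are expected to collapse to header stubs roughly like the sketch below; the real stubs belong in include/linux/capability.h, which is outside this kernel/ diffstat:

struct user_namespace;

#ifndef CONFIG_MULTIUSER
/* Single-user kernels run everything as global root, so every capability
 * check trivially succeeds. */
static inline bool capable(int cap)
{
	return true;
}
static inline bool ns_capable(struct user_namespace *ns, int cap)
{
	return true;
}
#endif /* !CONFIG_MULTIUSER */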
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 29a7b2cc593e..469dd547770c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3806,10 +3806,7 @@ static void *pidlist_allocate(int count) | |||
3806 | 3806 | ||
3807 | static void pidlist_free(void *p) | 3807 | static void pidlist_free(void *p) |
3808 | { | 3808 | { |
3809 | if (is_vmalloc_addr(p)) | 3809 | kvfree(p); |
3810 | vfree(p); | ||
3811 | else | ||
3812 | kfree(p); | ||
3813 | } | 3810 | } |
3814 | 3811 | ||
3815 | /* | 3812 | /* |
@@ -4199,7 +4196,9 @@ static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) | |||
4199 | 4196 | ||
4200 | static int cgroup_pidlist_show(struct seq_file *s, void *v) | 4197 | static int cgroup_pidlist_show(struct seq_file *s, void *v) |
4201 | { | 4198 | { |
4202 | return seq_printf(s, "%d\n", *(int *)v); | 4199 | seq_printf(s, "%d\n", *(int *)v); |
4200 | |||
4201 | return 0; | ||
4203 | } | 4202 | } |
4204 | 4203 | ||
4205 | static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, | 4204 | static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, |
@@ -5040,6 +5039,9 @@ int __init cgroup_init(void) | |||
5040 | WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes)); | 5039 | WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes)); |
5041 | WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes)); | 5040 | WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes)); |
5042 | } | 5041 | } |
5042 | |||
5043 | if (ss->bind) | ||
5044 | ss->bind(init_css_set.subsys[ssid]); | ||
5043 | } | 5045 | } |
5044 | 5046 | ||
5045 | cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); | 5047 | cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); |
@@ -5451,7 +5453,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, | |||
5451 | struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) | 5453 | struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) |
5452 | { | 5454 | { |
5453 | WARN_ON_ONCE(!rcu_read_lock_held()); | 5455 | WARN_ON_ONCE(!rcu_read_lock_held()); |
5454 | return idr_find(&ss->css_idr, id); | 5456 | return id > 0 ? idr_find(&ss->css_idr, id) : NULL; |
5455 | } | 5457 | } |
5456 | 5458 | ||
5457 | #ifdef CONFIG_CGROUP_DEBUG | 5459 | #ifdef CONFIG_CGROUP_DEBUG |
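pidlist_free() above is reduced to kvfree(), which picks kfree() or vfree() by inspecting the pointer. The allocation side it pairs with follows the usual small-from-slab, large-from-vmalloc pattern, sketched below; pidlist_allocate() in this file does the same thing, keyed on the pidlist size:

#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

/* Illustrative allocator matching the kvfree() call above: small buffers
 * come from the slab, large ones from vmalloc, and kvfree() later frees
 * either kind by checking is_vmalloc_addr() internally. */
static void *pidlist_alloc_sketch(size_t size)
{
	if (size <= PAGE_SIZE)
		return kmalloc(size, GFP_KERNEL);
	return vmalloc(size);
}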
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 937ecdfdf258..72d59a1a6eb6 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -39,15 +39,15 @@ void context_tracking_cpu_set(int cpu) | |||
39 | } | 39 | } |
40 | 40 | ||
41 | /** | 41 | /** |
42 | * context_tracking_user_enter - Inform the context tracking that the CPU is going to | 42 | * context_tracking_enter - Inform the context tracking that the CPU is going |
43 | * enter userspace mode. | 43 | * enter user or guest space mode. |
44 | * | 44 | * |
45 | * This function must be called right before we switch from the kernel | 45 | * This function must be called right before we switch from the kernel |
46 | * to userspace, when it's guaranteed the remaining kernel instructions | 46 | * to user or guest space, when it's guaranteed the remaining kernel |
47 | * to execute won't use any RCU read side critical section because this | 47 | * instructions to execute won't use any RCU read side critical section |
48 | * function sets RCU in extended quiescent state. | 48 | * because this function sets RCU in extended quiescent state. |
49 | */ | 49 | */ |
50 | void context_tracking_user_enter(void) | 50 | void context_tracking_enter(enum ctx_state state) |
51 | { | 51 | { |
52 | unsigned long flags; | 52 | unsigned long flags; |
53 | 53 | ||
@@ -75,9 +75,8 @@ void context_tracking_user_enter(void) | |||
75 | WARN_ON_ONCE(!current->mm); | 75 | WARN_ON_ONCE(!current->mm); |
76 | 76 | ||
77 | local_irq_save(flags); | 77 | local_irq_save(flags); |
78 | if ( __this_cpu_read(context_tracking.state) != IN_USER) { | 78 | if ( __this_cpu_read(context_tracking.state) != state) { |
79 | if (__this_cpu_read(context_tracking.active)) { | 79 | if (__this_cpu_read(context_tracking.active)) { |
80 | trace_user_enter(0); | ||
81 | /* | 80 | /* |
82 | * At this stage, only low level arch entry code remains and | 81 | * At this stage, only low level arch entry code remains and |
83 | * then we'll run in userspace. We can assume there won't be | 82 | * then we'll run in userspace. We can assume there won't be |
@@ -85,7 +84,10 @@ void context_tracking_user_enter(void) | |||
85 | * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency | 84 | * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency |
86 | * on the tick. | 85 | * on the tick. |
87 | */ | 86 | */ |
88 | vtime_user_enter(current); | 87 | if (state == CONTEXT_USER) { |
88 | trace_user_enter(0); | ||
89 | vtime_user_enter(current); | ||
90 | } | ||
89 | rcu_user_enter(); | 91 | rcu_user_enter(); |
90 | } | 92 | } |
91 | /* | 93 | /* |
@@ -101,24 +103,32 @@ void context_tracking_user_enter(void) | |||
101 | * OTOH we can spare the calls to vtime and RCU when context_tracking.active | 103 | * OTOH we can spare the calls to vtime and RCU when context_tracking.active |
102 | * is false because we know that CPU is not tickless. | 104 | * is false because we know that CPU is not tickless. |
103 | */ | 105 | */ |
104 | __this_cpu_write(context_tracking.state, IN_USER); | 106 | __this_cpu_write(context_tracking.state, state); |
105 | } | 107 | } |
106 | local_irq_restore(flags); | 108 | local_irq_restore(flags); |
107 | } | 109 | } |
110 | NOKPROBE_SYMBOL(context_tracking_enter); | ||
111 | EXPORT_SYMBOL_GPL(context_tracking_enter); | ||
112 | |||
113 | void context_tracking_user_enter(void) | ||
114 | { | ||
115 | context_tracking_enter(CONTEXT_USER); | ||
116 | } | ||
108 | NOKPROBE_SYMBOL(context_tracking_user_enter); | 117 | NOKPROBE_SYMBOL(context_tracking_user_enter); |
109 | 118 | ||
110 | /** | 119 | /** |
111 | * context_tracking_user_exit - Inform the context tracking that the CPU is | 120 | * context_tracking_exit - Inform the context tracking that the CPU is |
112 | * exiting userspace mode and entering the kernel. | 121 | * exiting user or guest mode and entering the kernel. |
113 | * | 122 | * |
114 | * This function must be called after we entered the kernel from userspace | 123 | * This function must be called after we entered the kernel from user or |
115 | * before any use of RCU read side critical section. This potentially include | 124 | * guest space before any use of RCU read side critical section. This |
116 | * any high level kernel code like syscalls, exceptions, signal handling, etc... | 125 | * potentially include any high level kernel code like syscalls, exceptions, |
126 | * signal handling, etc... | ||
117 | * | 127 | * |
118 | * This call supports re-entrancy. This way it can be called from any exception | 128 | * This call supports re-entrancy. This way it can be called from any exception |
119 | * handler without needing to know if we came from userspace or not. | 129 | * handler without needing to know if we came from userspace or not. |
120 | */ | 130 | */ |
121 | void context_tracking_user_exit(void) | 131 | void context_tracking_exit(enum ctx_state state) |
122 | { | 132 | { |
123 | unsigned long flags; | 133 | unsigned long flags; |
124 | 134 | ||
@@ -129,20 +139,29 @@ void context_tracking_user_exit(void) | |||
129 | return; | 139 | return; |
130 | 140 | ||
131 | local_irq_save(flags); | 141 | local_irq_save(flags); |
132 | if (__this_cpu_read(context_tracking.state) == IN_USER) { | 142 | if (__this_cpu_read(context_tracking.state) == state) { |
133 | if (__this_cpu_read(context_tracking.active)) { | 143 | if (__this_cpu_read(context_tracking.active)) { |
134 | /* | 144 | /* |
135 | * We are going to run code that may use RCU. Inform | 145 | * We are going to run code that may use RCU. Inform |
136 | * RCU core about that (ie: we may need the tick again). | 146 | * RCU core about that (ie: we may need the tick again). |
137 | */ | 147 | */ |
138 | rcu_user_exit(); | 148 | rcu_user_exit(); |
139 | vtime_user_exit(current); | 149 | if (state == CONTEXT_USER) { |
140 | trace_user_exit(0); | 150 | vtime_user_exit(current); |
151 | trace_user_exit(0); | ||
152 | } | ||
141 | } | 153 | } |
142 | __this_cpu_write(context_tracking.state, IN_KERNEL); | 154 | __this_cpu_write(context_tracking.state, CONTEXT_KERNEL); |
143 | } | 155 | } |
144 | local_irq_restore(flags); | 156 | local_irq_restore(flags); |
145 | } | 157 | } |
158 | NOKPROBE_SYMBOL(context_tracking_exit); | ||
159 | EXPORT_SYMBOL_GPL(context_tracking_exit); | ||
160 | |||
161 | void context_tracking_user_exit(void) | ||
162 | { | ||
163 | context_tracking_exit(CONTEXT_USER); | ||
164 | } | ||
146 | NOKPROBE_SYMBOL(context_tracking_user_exit); | 165 | NOKPROBE_SYMBOL(context_tracking_user_exit); |
147 | 166 | ||
148 | /** | 167 | /** |
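The rework above generalizes the user-space entry/exit hooks into context_tracking_enter()/context_tracking_exit() taking an explicit state, so the same RCU extended-quiescent-state machinery can serve guest transitions without the user-only vtime accounting and tracepoints. A sketch of how a guest path can use it; the real wrappers are expected in include/linux/context_tracking.h, and CONTEXT_GUEST is assumed to be defined next to CONTEXT_USER/CONTEXT_KERNEL:

#include <linux/context_tracking.h>

static inline void sketch_guest_enter(void)
{
	if (context_tracking_is_enabled())
		context_tracking_enter(CONTEXT_GUEST);	/* RCU EQS, no user vtime/trace */
}

static inline void sketch_guest_exit(void)
{
	if (context_tracking_is_enabled())
		context_tracking_exit(CONTEXT_GUEST);
}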
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 1972b161c61e..94bbe4695232 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -20,6 +20,7 @@ | |||
20 | #include <linux/gfp.h> | 20 | #include <linux/gfp.h> |
21 | #include <linux/suspend.h> | 21 | #include <linux/suspend.h> |
22 | #include <linux/lockdep.h> | 22 | #include <linux/lockdep.h> |
23 | #include <linux/tick.h> | ||
23 | #include <trace/events/power.h> | 24 | #include <trace/events/power.h> |
24 | 25 | ||
25 | #include "smpboot.h" | 26 | #include "smpboot.h" |
@@ -338,6 +339,8 @@ static int __ref take_cpu_down(void *_param) | |||
338 | return err; | 339 | return err; |
339 | 340 | ||
340 | cpu_notify(CPU_DYING | param->mod, param->hcpu); | 341 | cpu_notify(CPU_DYING | param->mod, param->hcpu); |
342 | /* Give up timekeeping duties */ | ||
343 | tick_handover_do_timer(); | ||
341 | /* Park the stopper thread */ | 344 | /* Park the stopper thread */ |
342 | kthread_park(current); | 345 | kthread_park(current); |
343 | return 0; | 346 | return 0; |
@@ -408,13 +411,17 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) | |||
408 | * | 411 | * |
409 | * Wait for the stop thread to go away. | 412 | * Wait for the stop thread to go away. |
410 | */ | 413 | */ |
411 | while (!idle_cpu(cpu)) | 414 | while (!per_cpu(cpu_dead_idle, cpu)) |
412 | cpu_relax(); | 415 | cpu_relax(); |
416 | smp_mb(); /* Read from cpu_dead_idle before __cpu_die(). */ | ||
417 | per_cpu(cpu_dead_idle, cpu) = false; | ||
413 | 418 | ||
419 | hotplug_cpu__broadcast_tick_pull(cpu); | ||
414 | /* This actually kills the CPU. */ | 420 | /* This actually kills the CPU. */ |
415 | __cpu_die(cpu); | 421 | __cpu_die(cpu); |
416 | 422 | ||
417 | /* CPU is completely dead: tell everyone. Too late to complain. */ | 423 | /* CPU is completely dead: tell everyone. Too late to complain. */ |
424 | tick_cleanup_dead_cpu(cpu); | ||
418 | cpu_notify_nofail(CPU_DEAD | mod, hcpu); | 425 | cpu_notify_nofail(CPU_DEAD | mod, hcpu); |
419 | 426 | ||
420 | check_for_tasks(cpu); | 427 | check_for_tasks(cpu); |
@@ -446,6 +453,37 @@ out: | |||
446 | EXPORT_SYMBOL(cpu_down); | 453 | EXPORT_SYMBOL(cpu_down); |
447 | #endif /*CONFIG_HOTPLUG_CPU*/ | 454 | #endif /*CONFIG_HOTPLUG_CPU*/ |
448 | 455 | ||
456 | /* | ||
457 | * Unpark per-CPU smpboot kthreads at CPU-online time. | ||
458 | */ | ||
459 | static int smpboot_thread_call(struct notifier_block *nfb, | ||
460 | unsigned long action, void *hcpu) | ||
461 | { | ||
462 | int cpu = (long)hcpu; | ||
463 | |||
464 | switch (action & ~CPU_TASKS_FROZEN) { | ||
465 | |||
466 | case CPU_ONLINE: | ||
467 | smpboot_unpark_threads(cpu); | ||
468 | break; | ||
469 | |||
470 | default: | ||
471 | break; | ||
472 | } | ||
473 | |||
474 | return NOTIFY_OK; | ||
475 | } | ||
476 | |||
477 | static struct notifier_block smpboot_thread_notifier = { | ||
478 | .notifier_call = smpboot_thread_call, | ||
479 | .priority = CPU_PRI_SMPBOOT, | ||
480 | }; | ||
481 | |||
482 | void __cpuinit smpboot_thread_init(void) | ||
483 | { | ||
484 | register_cpu_notifier(&smpboot_thread_notifier); | ||
485 | } | ||
486 | |||
449 | /* Requires cpu_add_remove_lock to be held */ | 487 | /* Requires cpu_add_remove_lock to be held */ |
450 | static int _cpu_up(unsigned int cpu, int tasks_frozen) | 488 | static int _cpu_up(unsigned int cpu, int tasks_frozen) |
451 | { | 489 | { |
@@ -485,9 +523,6 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen) | |||
485 | goto out_notify; | 523 | goto out_notify; |
486 | BUG_ON(!cpu_online(cpu)); | 524 | BUG_ON(!cpu_online(cpu)); |
487 | 525 | ||
488 | /* Wake the per cpu threads */ | ||
489 | smpboot_unpark_threads(cpu); | ||
490 | |||
491 | /* Now call notifier in preparation. */ | 526 | /* Now call notifier in preparation. */ |
492 | cpu_notify(CPU_ONLINE | mod, hcpu); | 527 | cpu_notify(CPU_ONLINE | mod, hcpu); |
493 | 528 | ||
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index fc7f4748d34a..ee14e3a35a29 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -622,6 +622,7 @@ static int generate_sched_domains(cpumask_var_t **domains, | |||
622 | int csn; /* how many cpuset ptrs in csa so far */ | 622 | int csn; /* how many cpuset ptrs in csa so far */ |
623 | int i, j, k; /* indices for partition finding loops */ | 623 | int i, j, k; /* indices for partition finding loops */ |
624 | cpumask_var_t *doms; /* resulting partition; i.e. sched domains */ | 624 | cpumask_var_t *doms; /* resulting partition; i.e. sched domains */ |
625 | cpumask_var_t non_isolated_cpus; /* load balanced CPUs */ | ||
625 | struct sched_domain_attr *dattr; /* attributes for custom domains */ | 626 | struct sched_domain_attr *dattr; /* attributes for custom domains */ |
626 | int ndoms = 0; /* number of sched domains in result */ | 627 | int ndoms = 0; /* number of sched domains in result */ |
627 | int nslot; /* next empty doms[] struct cpumask slot */ | 628 | int nslot; /* next empty doms[] struct cpumask slot */ |
@@ -631,6 +632,10 @@ static int generate_sched_domains(cpumask_var_t **domains, | |||
631 | dattr = NULL; | 632 | dattr = NULL; |
632 | csa = NULL; | 633 | csa = NULL; |
633 | 634 | ||
635 | if (!alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL)) | ||
636 | goto done; | ||
637 | cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); | ||
638 | |||
634 | /* Special case for the 99% of systems with one, full, sched domain */ | 639 | /* Special case for the 99% of systems with one, full, sched domain */ |
635 | if (is_sched_load_balance(&top_cpuset)) { | 640 | if (is_sched_load_balance(&top_cpuset)) { |
636 | ndoms = 1; | 641 | ndoms = 1; |
@@ -643,7 +648,8 @@ static int generate_sched_domains(cpumask_var_t **domains, | |||
643 | *dattr = SD_ATTR_INIT; | 648 | *dattr = SD_ATTR_INIT; |
644 | update_domain_attr_tree(dattr, &top_cpuset); | 649 | update_domain_attr_tree(dattr, &top_cpuset); |
645 | } | 650 | } |
646 | cpumask_copy(doms[0], top_cpuset.effective_cpus); | 651 | cpumask_and(doms[0], top_cpuset.effective_cpus, |
652 | non_isolated_cpus); | ||
647 | 653 | ||
648 | goto done; | 654 | goto done; |
649 | } | 655 | } |
@@ -666,7 +672,8 @@ static int generate_sched_domains(cpumask_var_t **domains, | |||
666 | * the corresponding sched domain. | 672 | * the corresponding sched domain. |
667 | */ | 673 | */ |
668 | if (!cpumask_empty(cp->cpus_allowed) && | 674 | if (!cpumask_empty(cp->cpus_allowed) && |
669 | !is_sched_load_balance(cp)) | 675 | !(is_sched_load_balance(cp) && |
676 | cpumask_intersects(cp->cpus_allowed, non_isolated_cpus))) | ||
670 | continue; | 677 | continue; |
671 | 678 | ||
672 | if (is_sched_load_balance(cp)) | 679 | if (is_sched_load_balance(cp)) |
@@ -748,6 +755,7 @@ restart: | |||
748 | 755 | ||
749 | if (apn == b->pn) { | 756 | if (apn == b->pn) { |
750 | cpumask_or(dp, dp, b->effective_cpus); | 757 | cpumask_or(dp, dp, b->effective_cpus); |
758 | cpumask_and(dp, dp, non_isolated_cpus); | ||
751 | if (dattr) | 759 | if (dattr) |
752 | update_domain_attr_tree(dattr + nslot, b); | 760 | update_domain_attr_tree(dattr + nslot, b); |
753 | 761 | ||
@@ -760,6 +768,7 @@ restart: | |||
760 | BUG_ON(nslot != ndoms); | 768 | BUG_ON(nslot != ndoms); |
761 | 769 | ||
762 | done: | 770 | done: |
771 | free_cpumask_var(non_isolated_cpus); | ||
763 | kfree(csa); | 772 | kfree(csa); |
764 | 773 | ||
765 | /* | 774 | /* |
@@ -2444,20 +2453,12 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) | |||
2444 | * @node: is this an allowed node? | 2453 | * @node: is this an allowed node? |
2445 | * @gfp_mask: memory allocation flags | 2454 | * @gfp_mask: memory allocation flags |
2446 | * | 2455 | * |
2447 | * If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is | 2456 | * If we're in interrupt, yes, we can always allocate. If @node is set in |
2448 | * set, yes, we can always allocate. If node is in our task's mems_allowed, | 2457 | * current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this |
2449 | * yes. If it's not a __GFP_HARDWALL request and this node is in the nearest | 2458 | * node is set in the nearest hardwalled cpuset ancestor to current's cpuset, |
2450 | * hardwalled cpuset ancestor to this task's cpuset, yes. If the task has been | 2459 | * yes. If current has access to memory reserves due to TIF_MEMDIE, yes. |
2451 | * OOM killed and has access to memory reserves as specified by the TIF_MEMDIE | ||
2452 | * flag, yes. | ||
2453 | * Otherwise, no. | 2460 | * Otherwise, no. |
2454 | * | 2461 | * |
2455 | * The __GFP_THISNODE placement logic is really handled elsewhere, | ||
2456 | * by forcibly using a zonelist starting at a specified node, and by | ||
2457 | * (in get_page_from_freelist()) refusing to consider the zones for | ||
2458 | * any node on the zonelist except the first. By the time any such | ||
2459 | * calls get to this routine, we should just shut up and say 'yes'. | ||
2460 | * | ||
2461 | * GFP_USER allocations are marked with the __GFP_HARDWALL bit, | 2462 | * GFP_USER allocations are marked with the __GFP_HARDWALL bit, |
2462 | * and do not allow allocations outside the current tasks cpuset | 2463 | * and do not allow allocations outside the current tasks cpuset |
2463 | * unless the task has been OOM killed as is marked TIF_MEMDIE. | 2464 | * unless the task has been OOM killed as is marked TIF_MEMDIE. |
@@ -2493,7 +2494,7 @@ int __cpuset_node_allowed(int node, gfp_t gfp_mask) | |||
2493 | int allowed; /* is allocation in zone z allowed? */ | 2494 | int allowed; /* is allocation in zone z allowed? */ |
2494 | unsigned long flags; | 2495 | unsigned long flags; |
2495 | 2496 | ||
2496 | if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) | 2497 | if (in_interrupt()) |
2497 | return 1; | 2498 | return 1; |
2498 | if (node_isset(node, current->mems_allowed)) | 2499 | if (node_isset(node, current->mems_allowed)) |
2499 | return 1; | 2500 | return 1; |
diff --git a/kernel/cred.c b/kernel/cred.c
index e0573a43c7df..ec1c07667ec1 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -29,6 +29,9 @@ | |||
29 | 29 | ||
30 | static struct kmem_cache *cred_jar; | 30 | static struct kmem_cache *cred_jar; |
31 | 31 | ||
32 | /* init to 2 - one for init_task, one to ensure it is never freed */ | ||
33 | struct group_info init_groups = { .usage = ATOMIC_INIT(2) }; | ||
34 | |||
32 | /* | 35 | /* |
33 | * The initial credentials for the initial task | 36 | * The initial credentials for the initial task |
34 | */ | 37 | */ |
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 453ef61311d4..81aa3a4ece9f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -34,14 +34,16 @@ | |||
34 | #include <linux/syscalls.h> | 34 | #include <linux/syscalls.h> |
35 | #include <linux/anon_inodes.h> | 35 | #include <linux/anon_inodes.h> |
36 | #include <linux/kernel_stat.h> | 36 | #include <linux/kernel_stat.h> |
37 | #include <linux/cgroup.h> | ||
37 | #include <linux/perf_event.h> | 38 | #include <linux/perf_event.h> |
38 | #include <linux/ftrace_event.h> | 39 | #include <linux/ftrace_event.h> |
39 | #include <linux/hw_breakpoint.h> | 40 | #include <linux/hw_breakpoint.h> |
40 | #include <linux/mm_types.h> | 41 | #include <linux/mm_types.h> |
41 | #include <linux/cgroup.h> | ||
42 | #include <linux/module.h> | 42 | #include <linux/module.h> |
43 | #include <linux/mman.h> | 43 | #include <linux/mman.h> |
44 | #include <linux/compat.h> | 44 | #include <linux/compat.h> |
45 | #include <linux/bpf.h> | ||
46 | #include <linux/filter.h> | ||
45 | 47 | ||
46 | #include "internal.h" | 48 | #include "internal.h" |
47 | 49 | ||
@@ -153,7 +155,7 @@ enum event_type_t { | |||
153 | */ | 155 | */ |
154 | struct static_key_deferred perf_sched_events __read_mostly; | 156 | struct static_key_deferred perf_sched_events __read_mostly; |
155 | static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); | 157 | static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); |
156 | static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events); | 158 | static DEFINE_PER_CPU(int, perf_sched_cb_usages); |
157 | 159 | ||
158 | static atomic_t nr_mmap_events __read_mostly; | 160 | static atomic_t nr_mmap_events __read_mostly; |
159 | static atomic_t nr_comm_events __read_mostly; | 161 | static atomic_t nr_comm_events __read_mostly; |
@@ -327,6 +329,11 @@ static inline u64 perf_clock(void) | |||
327 | return local_clock(); | 329 | return local_clock(); |
328 | } | 330 | } |
329 | 331 | ||
332 | static inline u64 perf_event_clock(struct perf_event *event) | ||
333 | { | ||
334 | return event->clock(); | ||
335 | } | ||
336 | |||
330 | static inline struct perf_cpu_context * | 337 | static inline struct perf_cpu_context * |
331 | __get_cpu_context(struct perf_event_context *ctx) | 338 | __get_cpu_context(struct perf_event_context *ctx) |
332 | { | 339 | { |
@@ -351,32 +358,6 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, | |||
351 | 358 | ||
352 | #ifdef CONFIG_CGROUP_PERF | 359 | #ifdef CONFIG_CGROUP_PERF |
353 | 360 | ||
354 | /* | ||
355 | * perf_cgroup_info keeps track of time_enabled for a cgroup. | ||
356 | * This is a per-cpu dynamically allocated data structure. | ||
357 | */ | ||
358 | struct perf_cgroup_info { | ||
359 | u64 time; | ||
360 | u64 timestamp; | ||
361 | }; | ||
362 | |||
363 | struct perf_cgroup { | ||
364 | struct cgroup_subsys_state css; | ||
365 | struct perf_cgroup_info __percpu *info; | ||
366 | }; | ||
367 | |||
368 | /* | ||
369 | * Must ensure cgroup is pinned (css_get) before calling | ||
370 | * this function. In other words, we cannot call this function | ||
371 | * if there is no cgroup event for the current CPU context. | ||
372 | */ | ||
373 | static inline struct perf_cgroup * | ||
374 | perf_cgroup_from_task(struct task_struct *task) | ||
375 | { | ||
376 | return container_of(task_css(task, perf_event_cgrp_id), | ||
377 | struct perf_cgroup, css); | ||
378 | } | ||
379 | |||
380 | static inline bool | 361 | static inline bool |
381 | perf_cgroup_match(struct perf_event *event) | 362 | perf_cgroup_match(struct perf_event *event) |
382 | { | 363 | { |
@@ -905,6 +886,15 @@ static void get_ctx(struct perf_event_context *ctx) | |||
905 | WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); | 886 | WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); |
906 | } | 887 | } |
907 | 888 | ||
889 | static void free_ctx(struct rcu_head *head) | ||
890 | { | ||
891 | struct perf_event_context *ctx; | ||
892 | |||
893 | ctx = container_of(head, struct perf_event_context, rcu_head); | ||
894 | kfree(ctx->task_ctx_data); | ||
895 | kfree(ctx); | ||
896 | } | ||
897 | |||
908 | static void put_ctx(struct perf_event_context *ctx) | 898 | static void put_ctx(struct perf_event_context *ctx) |
909 | { | 899 | { |
910 | if (atomic_dec_and_test(&ctx->refcount)) { | 900 | if (atomic_dec_and_test(&ctx->refcount)) { |
@@ -912,7 +902,7 @@ static void put_ctx(struct perf_event_context *ctx) | |||
912 | put_ctx(ctx->parent_ctx); | 902 | put_ctx(ctx->parent_ctx); |
913 | if (ctx->task) | 903 | if (ctx->task) |
914 | put_task_struct(ctx->task); | 904 | put_task_struct(ctx->task); |
915 | kfree_rcu(ctx, rcu_head); | 905 | call_rcu(&ctx->rcu_head, free_ctx); |
916 | } | 906 | } |
917 | } | 907 | } |
918 | 908 | ||
@@ -1239,9 +1229,6 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) | |||
1239 | if (is_cgroup_event(event)) | 1229 | if (is_cgroup_event(event)) |
1240 | ctx->nr_cgroups++; | 1230 | ctx->nr_cgroups++; |
1241 | 1231 | ||
1242 | if (has_branch_stack(event)) | ||
1243 | ctx->nr_branch_stack++; | ||
1244 | |||
1245 | list_add_rcu(&event->event_entry, &ctx->event_list); | 1232 | list_add_rcu(&event->event_entry, &ctx->event_list); |
1246 | ctx->nr_events++; | 1233 | ctx->nr_events++; |
1247 | if (event->attr.inherit_stat) | 1234 | if (event->attr.inherit_stat) |
@@ -1408,9 +1395,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) | |||
1408 | cpuctx->cgrp = NULL; | 1395 | cpuctx->cgrp = NULL; |
1409 | } | 1396 | } |
1410 | 1397 | ||
1411 | if (has_branch_stack(event)) | ||
1412 | ctx->nr_branch_stack--; | ||
1413 | |||
1414 | ctx->nr_events--; | 1398 | ctx->nr_events--; |
1415 | if (event->attr.inherit_stat) | 1399 | if (event->attr.inherit_stat) |
1416 | ctx->nr_stat--; | 1400 | ctx->nr_stat--; |
@@ -1847,6 +1831,7 @@ static void perf_set_shadow_time(struct perf_event *event, | |||
1847 | #define MAX_INTERRUPTS (~0ULL) | 1831 | #define MAX_INTERRUPTS (~0ULL) |
1848 | 1832 | ||
1849 | static void perf_log_throttle(struct perf_event *event, int enable); | 1833 | static void perf_log_throttle(struct perf_event *event, int enable); |
1834 | static void perf_log_itrace_start(struct perf_event *event); | ||
1850 | 1835 | ||
1851 | static int | 1836 | static int |
1852 | event_sched_in(struct perf_event *event, | 1837 | event_sched_in(struct perf_event *event, |
@@ -1881,6 +1866,12 @@ event_sched_in(struct perf_event *event, | |||
1881 | 1866 | ||
1882 | perf_pmu_disable(event->pmu); | 1867 | perf_pmu_disable(event->pmu); |
1883 | 1868 | ||
1869 | event->tstamp_running += tstamp - event->tstamp_stopped; | ||
1870 | |||
1871 | perf_set_shadow_time(event, ctx, tstamp); | ||
1872 | |||
1873 | perf_log_itrace_start(event); | ||
1874 | |||
1884 | if (event->pmu->add(event, PERF_EF_START)) { | 1875 | if (event->pmu->add(event, PERF_EF_START)) { |
1885 | event->state = PERF_EVENT_STATE_INACTIVE; | 1876 | event->state = PERF_EVENT_STATE_INACTIVE; |
1886 | event->oncpu = -1; | 1877 | event->oncpu = -1; |
@@ -1888,10 +1879,6 @@ event_sched_in(struct perf_event *event, | |||
1888 | goto out; | 1879 | goto out; |
1889 | } | 1880 | } |
1890 | 1881 | ||
1891 | event->tstamp_running += tstamp - event->tstamp_stopped; | ||
1892 | |||
1893 | perf_set_shadow_time(event, ctx, tstamp); | ||
1894 | |||
1895 | if (!is_software_event(event)) | 1882 | if (!is_software_event(event)) |
1896 | cpuctx->active_oncpu++; | 1883 | cpuctx->active_oncpu++; |
1897 | if (!ctx->nr_active++) | 1884 | if (!ctx->nr_active++) |
@@ -2559,6 +2546,9 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, | |||
2559 | next->perf_event_ctxp[ctxn] = ctx; | 2546 | next->perf_event_ctxp[ctxn] = ctx; |
2560 | ctx->task = next; | 2547 | ctx->task = next; |
2561 | next_ctx->task = task; | 2548 | next_ctx->task = task; |
2549 | |||
2550 | swap(ctx->task_ctx_data, next_ctx->task_ctx_data); | ||
2551 | |||
2562 | do_switch = 0; | 2552 | do_switch = 0; |
2563 | 2553 | ||
2564 | perf_event_sync_stat(ctx, next_ctx); | 2554 | perf_event_sync_stat(ctx, next_ctx); |
@@ -2577,6 +2567,56 @@ unlock: | |||
2577 | } | 2567 | } |
2578 | } | 2568 | } |
2579 | 2569 | ||
2570 | void perf_sched_cb_dec(struct pmu *pmu) | ||
2571 | { | ||
2572 | this_cpu_dec(perf_sched_cb_usages); | ||
2573 | } | ||
2574 | |||
2575 | void perf_sched_cb_inc(struct pmu *pmu) | ||
2576 | { | ||
2577 | this_cpu_inc(perf_sched_cb_usages); | ||
2578 | } | ||
2579 | |||
2580 | /* | ||
2581 | * This function provides the context switch callback to the lower code | ||
2582 | * layer. It is invoked ONLY when the context switch callback is enabled. | ||
2583 | */ | ||
2584 | static void perf_pmu_sched_task(struct task_struct *prev, | ||
2585 | struct task_struct *next, | ||
2586 | bool sched_in) | ||
2587 | { | ||
2588 | struct perf_cpu_context *cpuctx; | ||
2589 | struct pmu *pmu; | ||
2590 | unsigned long flags; | ||
2591 | |||
2592 | if (prev == next) | ||
2593 | return; | ||
2594 | |||
2595 | local_irq_save(flags); | ||
2596 | |||
2597 | rcu_read_lock(); | ||
2598 | |||
2599 | list_for_each_entry_rcu(pmu, &pmus, entry) { | ||
2600 | if (pmu->sched_task) { | ||
2601 | cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); | ||
2602 | |||
2603 | perf_ctx_lock(cpuctx, cpuctx->task_ctx); | ||
2604 | |||
2605 | perf_pmu_disable(pmu); | ||
2606 | |||
2607 | pmu->sched_task(cpuctx->task_ctx, sched_in); | ||
2608 | |||
2609 | perf_pmu_enable(pmu); | ||
2610 | |||
2611 | perf_ctx_unlock(cpuctx, cpuctx->task_ctx); | ||
2612 | } | ||
2613 | } | ||
2614 | |||
2615 | rcu_read_unlock(); | ||
2616 | |||
2617 | local_irq_restore(flags); | ||
2618 | } | ||
2619 | |||
2580 | #define for_each_task_context_nr(ctxn) \ | 2620 | #define for_each_task_context_nr(ctxn) \ |
2581 | for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++) | 2621 | for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++) |
2582 | 2622 | ||
@@ -2596,6 +2636,9 @@ void __perf_event_task_sched_out(struct task_struct *task, | |||
2596 | { | 2636 | { |
2597 | int ctxn; | 2637 | int ctxn; |
2598 | 2638 | ||
2639 | if (__this_cpu_read(perf_sched_cb_usages)) | ||
2640 | perf_pmu_sched_task(task, next, false); | ||
2641 | |||
2599 | for_each_task_context_nr(ctxn) | 2642 | for_each_task_context_nr(ctxn) |
2600 | perf_event_context_sched_out(task, ctxn, next); | 2643 | perf_event_context_sched_out(task, ctxn, next); |
2601 | 2644 | ||
@@ -2755,64 +2798,6 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, | |||
2755 | } | 2798 | } |
2756 | 2799 | ||
2757 | /* | 2800 | /* |
2758 | * When sampling the branch stack in system-wide, it may be necessary | ||
2759 | * to flush the stack on context switch. This happens when the branch | ||
2760 | * stack does not tag its entries with the pid of the current task. | ||
2761 | * Otherwise it becomes impossible to associate a branch entry with a | ||
2762 | * task. This ambiguity is more likely to appear when the branch stack | ||
2763 | * supports priv level filtering and the user sets it to monitor only | ||
2764 | * at the user level (which could be a useful measurement in system-wide | ||
2765 | * mode). In that case, the risk is high of having a branch stack with | ||
2766 | * branch from multiple tasks. Flushing may mean dropping the existing | ||
2767 | * entries or stashing them somewhere in the PMU specific code layer. | ||
2768 | * | ||
2769 | * This function provides the context switch callback to the lower code | ||
2770 | * layer. It is invoked ONLY when there is at least one system-wide context | ||
2771 | * with at least one active event using taken branch sampling. | ||
2772 | */ | ||
2773 | static void perf_branch_stack_sched_in(struct task_struct *prev, | ||
2774 | struct task_struct *task) | ||
2775 | { | ||
2776 | struct perf_cpu_context *cpuctx; | ||
2777 | struct pmu *pmu; | ||
2778 | unsigned long flags; | ||
2779 | |||
2780 | /* no need to flush branch stack if not changing task */ | ||
2781 | if (prev == task) | ||
2782 | return; | ||
2783 | |||
2784 | local_irq_save(flags); | ||
2785 | |||
2786 | rcu_read_lock(); | ||
2787 | |||
2788 | list_for_each_entry_rcu(pmu, &pmus, entry) { | ||
2789 | cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); | ||
2790 | |||
2791 | /* | ||
2792 | * check if the context has at least one | ||
2793 | * event using PERF_SAMPLE_BRANCH_STACK | ||
2794 | */ | ||
2795 | if (cpuctx->ctx.nr_branch_stack > 0 | ||
2796 | && pmu->flush_branch_stack) { | ||
2797 | |||
2798 | perf_ctx_lock(cpuctx, cpuctx->task_ctx); | ||
2799 | |||
2800 | perf_pmu_disable(pmu); | ||
2801 | |||
2802 | pmu->flush_branch_stack(); | ||
2803 | |||
2804 | perf_pmu_enable(pmu); | ||
2805 | |||
2806 | perf_ctx_unlock(cpuctx, cpuctx->task_ctx); | ||
2807 | } | ||
2808 | } | ||
2809 | |||
2810 | rcu_read_unlock(); | ||
2811 | |||
2812 | local_irq_restore(flags); | ||
2813 | } | ||
2814 | |||
2815 | /* | ||
2816 | * Called from scheduler to add the events of the current task | 2801 | * Called from scheduler to add the events of the current task |
2817 | * with interrupts disabled. | 2802 | * with interrupts disabled. |
2818 | * | 2803 | * |
@@ -2844,9 +2829,8 @@ void __perf_event_task_sched_in(struct task_struct *prev, | |||
2844 | if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) | 2829 | if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) |
2845 | perf_cgroup_sched_in(prev, task); | 2830 | perf_cgroup_sched_in(prev, task); |
2846 | 2831 | ||
2847 | /* check for system-wide branch_stack events */ | 2832 | if (__this_cpu_read(perf_sched_cb_usages)) |
2848 | if (atomic_read(this_cpu_ptr(&perf_branch_stack_events))) | 2833 | perf_pmu_sched_task(prev, task, true); |
2849 | perf_branch_stack_sched_in(prev, task); | ||
2850 | } | 2834 | } |
2851 | 2835 | ||
2852 | static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) | 2836 | static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) |
@@ -3220,7 +3204,10 @@ static void __perf_event_read(void *info) | |||
3220 | 3204 | ||
3221 | static inline u64 perf_event_count(struct perf_event *event) | 3205 | static inline u64 perf_event_count(struct perf_event *event) |
3222 | { | 3206 | { |
3223 | return local64_read(&event->count) + atomic64_read(&event->child_count); | 3207 | if (event->pmu->count) |
3208 | return event->pmu->count(event); | ||
3209 | |||
3210 | return __perf_event_count(event); | ||
3224 | } | 3211 | } |
3225 | 3212 | ||
3226 | static u64 perf_event_read(struct perf_event *event) | 3213 | static u64 perf_event_read(struct perf_event *event) |
@@ -3321,12 +3308,15 @@ errout: | |||
3321 | * Returns a matching context with refcount and pincount. | 3308 | * Returns a matching context with refcount and pincount. |
3322 | */ | 3309 | */ |
3323 | static struct perf_event_context * | 3310 | static struct perf_event_context * |
3324 | find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) | 3311 | find_get_context(struct pmu *pmu, struct task_struct *task, |
3312 | struct perf_event *event) | ||
3325 | { | 3313 | { |
3326 | struct perf_event_context *ctx, *clone_ctx = NULL; | 3314 | struct perf_event_context *ctx, *clone_ctx = NULL; |
3327 | struct perf_cpu_context *cpuctx; | 3315 | struct perf_cpu_context *cpuctx; |
3316 | void *task_ctx_data = NULL; | ||
3328 | unsigned long flags; | 3317 | unsigned long flags; |
3329 | int ctxn, err; | 3318 | int ctxn, err; |
3319 | int cpu = event->cpu; | ||
3330 | 3320 | ||
3331 | if (!task) { | 3321 | if (!task) { |
3332 | /* Must be root to operate on a CPU event: */ | 3322 | /* Must be root to operate on a CPU event: */ |
@@ -3354,11 +3344,24 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) | |||
3354 | if (ctxn < 0) | 3344 | if (ctxn < 0) |
3355 | goto errout; | 3345 | goto errout; |
3356 | 3346 | ||
3347 | if (event->attach_state & PERF_ATTACH_TASK_DATA) { | ||
3348 | task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL); | ||
3349 | if (!task_ctx_data) { | ||
3350 | err = -ENOMEM; | ||
3351 | goto errout; | ||
3352 | } | ||
3353 | } | ||
3354 | |||
3357 | retry: | 3355 | retry: |
3358 | ctx = perf_lock_task_context(task, ctxn, &flags); | 3356 | ctx = perf_lock_task_context(task, ctxn, &flags); |
3359 | if (ctx) { | 3357 | if (ctx) { |
3360 | clone_ctx = unclone_ctx(ctx); | 3358 | clone_ctx = unclone_ctx(ctx); |
3361 | ++ctx->pin_count; | 3359 | ++ctx->pin_count; |
3360 | |||
3361 | if (task_ctx_data && !ctx->task_ctx_data) { | ||
3362 | ctx->task_ctx_data = task_ctx_data; | ||
3363 | task_ctx_data = NULL; | ||
3364 | } | ||
3362 | raw_spin_unlock_irqrestore(&ctx->lock, flags); | 3365 | raw_spin_unlock_irqrestore(&ctx->lock, flags); |
3363 | 3366 | ||
3364 | if (clone_ctx) | 3367 | if (clone_ctx) |
@@ -3369,6 +3372,11 @@ retry: | |||
3369 | if (!ctx) | 3372 | if (!ctx) |
3370 | goto errout; | 3373 | goto errout; |
3371 | 3374 | ||
3375 | if (task_ctx_data) { | ||
3376 | ctx->task_ctx_data = task_ctx_data; | ||
3377 | task_ctx_data = NULL; | ||
3378 | } | ||
3379 | |||
3372 | err = 0; | 3380 | err = 0; |
3373 | mutex_lock(&task->perf_event_mutex); | 3381 | mutex_lock(&task->perf_event_mutex); |
3374 | /* | 3382 | /* |
@@ -3395,13 +3403,16 @@ retry: | |||
3395 | } | 3403 | } |
3396 | } | 3404 | } |
3397 | 3405 | ||
3406 | kfree(task_ctx_data); | ||
3398 | return ctx; | 3407 | return ctx; |
3399 | 3408 | ||
3400 | errout: | 3409 | errout: |
3410 | kfree(task_ctx_data); | ||
3401 | return ERR_PTR(err); | 3411 | return ERR_PTR(err); |
3402 | } | 3412 | } |
3403 | 3413 | ||
3404 | static void perf_event_free_filter(struct perf_event *event); | 3414 | static void perf_event_free_filter(struct perf_event *event); |
3415 | static void perf_event_free_bpf_prog(struct perf_event *event); | ||
3405 | 3416 | ||
3406 | static void free_event_rcu(struct rcu_head *head) | 3417 | static void free_event_rcu(struct rcu_head *head) |
3407 | { | 3418 | { |
@@ -3411,10 +3422,10 @@ static void free_event_rcu(struct rcu_head *head) | |||
3411 | if (event->ns) | 3422 | if (event->ns) |
3412 | put_pid_ns(event->ns); | 3423 | put_pid_ns(event->ns); |
3413 | perf_event_free_filter(event); | 3424 | perf_event_free_filter(event); |
3425 | perf_event_free_bpf_prog(event); | ||
3414 | kfree(event); | 3426 | kfree(event); |
3415 | } | 3427 | } |
3416 | 3428 | ||
3417 | static void ring_buffer_put(struct ring_buffer *rb); | ||
3418 | static void ring_buffer_attach(struct perf_event *event, | 3429 | static void ring_buffer_attach(struct perf_event *event, |
3419 | struct ring_buffer *rb); | 3430 | struct ring_buffer *rb); |
3420 | 3431 | ||
@@ -3423,10 +3434,6 @@ static void unaccount_event_cpu(struct perf_event *event, int cpu) | |||
3423 | if (event->parent) | 3434 | if (event->parent) |
3424 | return; | 3435 | return; |
3425 | 3436 | ||
3426 | if (has_branch_stack(event)) { | ||
3427 | if (!(event->attach_state & PERF_ATTACH_TASK)) | ||
3428 | atomic_dec(&per_cpu(perf_branch_stack_events, cpu)); | ||
3429 | } | ||
3430 | if (is_cgroup_event(event)) | 3437 | if (is_cgroup_event(event)) |
3431 | atomic_dec(&per_cpu(perf_cgroup_events, cpu)); | 3438 | atomic_dec(&per_cpu(perf_cgroup_events, cpu)); |
3432 | } | 3439 | } |
@@ -3454,6 +3461,91 @@ static void unaccount_event(struct perf_event *event) | |||
3454 | unaccount_event_cpu(event, event->cpu); | 3461 | unaccount_event_cpu(event, event->cpu); |
3455 | } | 3462 | } |
3456 | 3463 | ||
3464 | /* | ||
3465 | * The following implement mutual exclusion of events on "exclusive" pmus | ||
3466 | * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled | ||
3467 | * at a time, so we disallow creating events that might conflict, namely: | ||
3468 | * | ||
3469 | * 1) cpu-wide events in the presence of per-task events, | ||
3470 | * 2) per-task events in the presence of cpu-wide events, | ||
3471 | * 3) two matching events on the same context. | ||
3472 | * | ||
3473 | * The former two cases are handled in the allocation path (perf_event_alloc(), | ||
3474 | * __free_event()), the latter -- before the first perf_install_in_context(). | ||
3475 | */ | ||
3476 | static int exclusive_event_init(struct perf_event *event) | ||
3477 | { | ||
3478 | struct pmu *pmu = event->pmu; | ||
3479 | |||
3480 | if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE)) | ||
3481 | return 0; | ||
3482 | |||
3483 | /* | ||
3484 | * Prevent co-existence of per-task and cpu-wide events on the | ||
3485 | * same exclusive pmu. | ||
3486 | * | ||
3487 | * Negative pmu::exclusive_cnt means there are cpu-wide | ||
3488 | * events on this "exclusive" pmu, positive means there are | ||
3489 | * per-task events. | ||
3490 | * | ||
3491 | * Since this is called in perf_event_alloc() path, event::ctx | ||
3492 | * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK | ||
3493 | * to mean "per-task event", because unlike other attach states it | ||
3494 | * never gets cleared. | ||
3495 | */ | ||
3496 | if (event->attach_state & PERF_ATTACH_TASK) { | ||
3497 | if (!atomic_inc_unless_negative(&pmu->exclusive_cnt)) | ||
3498 | return -EBUSY; | ||
3499 | } else { | ||
3500 | if (!atomic_dec_unless_positive(&pmu->exclusive_cnt)) | ||
3501 | return -EBUSY; | ||
3502 | } | ||
3503 | |||
3504 | return 0; | ||
3505 | } | ||
3506 | |||
3507 | static void exclusive_event_destroy(struct perf_event *event) | ||
3508 | { | ||
3509 | struct pmu *pmu = event->pmu; | ||
3510 | |||
3511 | if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE)) | ||
3512 | return; | ||
3513 | |||
3514 | /* see comment in exclusive_event_init() */ | ||
3515 | if (event->attach_state & PERF_ATTACH_TASK) | ||
3516 | atomic_dec(&pmu->exclusive_cnt); | ||
3517 | else | ||
3518 | atomic_inc(&pmu->exclusive_cnt); | ||
3519 | } | ||
3520 | |||
3521 | static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2) | ||
3522 | { | ||
3523 | if ((e1->pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && | ||
3524 | (e1->cpu == e2->cpu || | ||
3525 | e1->cpu == -1 || | ||
3526 | e2->cpu == -1)) | ||
3527 | return true; | ||
3528 | return false; | ||
3529 | } | ||
3530 | |||
3531 | /* Called under the same ctx::mutex as perf_install_in_context() */ | ||
3532 | static bool exclusive_event_installable(struct perf_event *event, | ||
3533 | struct perf_event_context *ctx) | ||
3534 | { | ||
3535 | struct perf_event *iter_event; | ||
3536 | struct pmu *pmu = event->pmu; | ||
3537 | |||
3538 | if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE)) | ||
3539 | return true; | ||
3540 | |||
3541 | list_for_each_entry(iter_event, &ctx->event_list, event_entry) { | ||
3542 | if (exclusive_event_match(iter_event, event)) | ||
3543 | return false; | ||
3544 | } | ||
3545 | |||
3546 | return true; | ||
3547 | } | ||
3548 | |||
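The comment block above encodes the per-task versus cpu-wide exclusion in the sign of pmu->exclusive_cnt: positive means per-task users, negative means cpu-wide users, and the two may never coexist. A small user-space toy model of that protocol, with plain C11 atomics standing in for the kernel's atomic_inc_unless_negative()/atomic_dec_unless_positive() (not kernel code, illustration only):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int exclusive_cnt;	/* 0 = unused, >0 = per-task, <0 = cpu-wide */

static bool inc_unless_negative(atomic_int *v)
{
	int old = atomic_load(v);
	do {
		if (old < 0)
			return false;	/* cpu-wide users present: refuse */
	} while (!atomic_compare_exchange_weak(v, &old, old + 1));
	return true;
}

static bool dec_unless_positive(atomic_int *v)
{
	int old = atomic_load(v);
	do {
		if (old > 0)
			return false;	/* per-task users present: refuse */
	} while (!atomic_compare_exchange_weak(v, &old, old - 1));
	return true;
}

int main(void)
{
	printf("per-task user admitted: %d\n", inc_unless_negative(&exclusive_cnt)); /* 1 */
	printf("cpu-wide user admitted: %d\n", dec_unless_positive(&exclusive_cnt)); /* 0 */
	return 0;
}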
3457 | static void __free_event(struct perf_event *event) | 3549 | static void __free_event(struct perf_event *event) |
3458 | { | 3550 | { |
3459 | if (!event->parent) { | 3551 | if (!event->parent) { |
@@ -3467,8 +3559,10 @@ static void __free_event(struct perf_event *event) | |||
3467 | if (event->ctx) | 3559 | if (event->ctx) |
3468 | put_ctx(event->ctx); | 3560 | put_ctx(event->ctx); |
3469 | 3561 | ||
3470 | if (event->pmu) | 3562 | if (event->pmu) { |
3563 | exclusive_event_destroy(event); | ||
3471 | module_put(event->pmu->module); | 3564 | module_put(event->pmu->module); |
3565 | } | ||
3472 | 3566 | ||
3473 | call_rcu(&event->rcu_head, free_event_rcu); | 3567 | call_rcu(&event->rcu_head, free_event_rcu); |
3474 | } | 3568 | } |
@@ -3927,6 +4021,7 @@ static inline int perf_fget_light(int fd, struct fd *p) | |||
3927 | static int perf_event_set_output(struct perf_event *event, | 4021 | static int perf_event_set_output(struct perf_event *event, |
3928 | struct perf_event *output_event); | 4022 | struct perf_event *output_event); |
3929 | static int perf_event_set_filter(struct perf_event *event, void __user *arg); | 4023 | static int perf_event_set_filter(struct perf_event *event, void __user *arg); |
4024 | static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd); | ||
3930 | 4025 | ||
3931 | static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg) | 4026 | static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg) |
3932 | { | 4027 | { |
@@ -3980,6 +4075,9 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon | |||
3980 | case PERF_EVENT_IOC_SET_FILTER: | 4075 | case PERF_EVENT_IOC_SET_FILTER: |
3981 | return perf_event_set_filter(event, (void __user *)arg); | 4076 | return perf_event_set_filter(event, (void __user *)arg); |
3982 | 4077 | ||
4078 | case PERF_EVENT_IOC_SET_BPF: | ||
4079 | return perf_event_set_bpf_prog(event, arg); | ||
4080 | |||
3983 | default: | 4081 | default: |
3984 | return -ENOTTY; | 4082 | return -ENOTTY; |
3985 | } | 4083 | } |
@@ -4096,6 +4194,8 @@ static void perf_event_init_userpage(struct perf_event *event) | |||
4096 | /* Allow new userspace to detect that bit 0 is deprecated */ | 4194 | /* Allow new userspace to detect that bit 0 is deprecated */ |
4097 | userpg->cap_bit0_is_deprecated = 1; | 4195 | userpg->cap_bit0_is_deprecated = 1; |
4098 | userpg->size = offsetof(struct perf_event_mmap_page, __reserved); | 4196 | userpg->size = offsetof(struct perf_event_mmap_page, __reserved); |
4197 | userpg->data_offset = PAGE_SIZE; | ||
4198 | userpg->data_size = perf_data_size(rb); | ||
4099 | 4199 | ||
4100 | unlock: | 4200 | unlock: |
4101 | rcu_read_unlock(); | 4201 | rcu_read_unlock(); |
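The two new fields let a reader discover the data-area geometry from the mmap page itself rather than hard-coding "one page in". A minimal sketch of the user-space side (the data_offset/data_size names are taken from the matching uapi change, which is not shown in this hunk):

#include <linux/perf_event.h>
#include <stdint.h>

/* Locate the sample data area from the metadata page. */
static void *perf_data_begin(struct perf_event_mmap_page *up, uint64_t *size)
{
	*size = up->data_size;
	return (uint8_t *)up + up->data_offset;
}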
@@ -4263,7 +4363,7 @@ static void rb_free_rcu(struct rcu_head *rcu_head) | |||
4263 | rb_free(rb); | 4363 | rb_free(rb); |
4264 | } | 4364 | } |
4265 | 4365 | ||
4266 | static struct ring_buffer *ring_buffer_get(struct perf_event *event) | 4366 | struct ring_buffer *ring_buffer_get(struct perf_event *event) |
4267 | { | 4367 | { |
4268 | struct ring_buffer *rb; | 4368 | struct ring_buffer *rb; |
4269 | 4369 | ||
@@ -4278,7 +4378,7 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event) | |||
4278 | return rb; | 4378 | return rb; |
4279 | } | 4379 | } |
4280 | 4380 | ||
4281 | static void ring_buffer_put(struct ring_buffer *rb) | 4381 | void ring_buffer_put(struct ring_buffer *rb) |
4282 | { | 4382 | { |
4283 | if (!atomic_dec_and_test(&rb->refcount)) | 4383 | if (!atomic_dec_and_test(&rb->refcount)) |
4284 | return; | 4384 | return; |
@@ -4295,6 +4395,9 @@ static void perf_mmap_open(struct vm_area_struct *vma) | |||
4295 | atomic_inc(&event->mmap_count); | 4395 | atomic_inc(&event->mmap_count); |
4296 | atomic_inc(&event->rb->mmap_count); | 4396 | atomic_inc(&event->rb->mmap_count); |
4297 | 4397 | ||
4398 | if (vma->vm_pgoff) | ||
4399 | atomic_inc(&event->rb->aux_mmap_count); | ||
4400 | |||
4298 | if (event->pmu->event_mapped) | 4401 | if (event->pmu->event_mapped) |
4299 | event->pmu->event_mapped(event); | 4402 | event->pmu->event_mapped(event); |
4300 | } | 4403 | } |
@@ -4319,6 +4422,20 @@ static void perf_mmap_close(struct vm_area_struct *vma) | |||
4319 | if (event->pmu->event_unmapped) | 4422 | if (event->pmu->event_unmapped) |
4320 | event->pmu->event_unmapped(event); | 4423 | event->pmu->event_unmapped(event); |
4321 | 4424 | ||
4425 | /* | ||
4426 | * rb->aux_mmap_count will always drop before rb->mmap_count and | ||
4427 | * event->mmap_count, so it is ok to use event->mmap_mutex to | ||
4428 | * serialize with perf_mmap here. | ||
4429 | */ | ||
4430 | if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff && | ||
4431 | atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) { | ||
4432 | atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm); | ||
4433 | vma->vm_mm->pinned_vm -= rb->aux_mmap_locked; | ||
4434 | |||
4435 | rb_free_aux(rb); | ||
4436 | mutex_unlock(&event->mmap_mutex); | ||
4437 | } | ||
4438 | |||
4322 | atomic_dec(&rb->mmap_count); | 4439 | atomic_dec(&rb->mmap_count); |
4323 | 4440 | ||
4324 | if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) | 4441 | if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) |
@@ -4392,7 +4509,7 @@ out_put: | |||
4392 | 4509 | ||
4393 | static const struct vm_operations_struct perf_mmap_vmops = { | 4510 | static const struct vm_operations_struct perf_mmap_vmops = { |
4394 | .open = perf_mmap_open, | 4511 | .open = perf_mmap_open, |
4395 | .close = perf_mmap_close, | 4512 | .close = perf_mmap_close, /* non mergeable */ |
4396 | .fault = perf_mmap_fault, | 4513 | .fault = perf_mmap_fault, |
4397 | .page_mkwrite = perf_mmap_fault, | 4514 | .page_mkwrite = perf_mmap_fault, |
4398 | }; | 4515 | }; |
@@ -4403,10 +4520,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) | |||
4403 | unsigned long user_locked, user_lock_limit; | 4520 | unsigned long user_locked, user_lock_limit; |
4404 | struct user_struct *user = current_user(); | 4521 | struct user_struct *user = current_user(); |
4405 | unsigned long locked, lock_limit; | 4522 | unsigned long locked, lock_limit; |
4406 | struct ring_buffer *rb; | 4523 | struct ring_buffer *rb = NULL; |
4407 | unsigned long vma_size; | 4524 | unsigned long vma_size; |
4408 | unsigned long nr_pages; | 4525 | unsigned long nr_pages; |
4409 | long user_extra, extra; | 4526 | long user_extra = 0, extra = 0; |
4410 | int ret = 0, flags = 0; | 4527 | int ret = 0, flags = 0; |
4411 | 4528 | ||
4412 | /* | 4529 | /* |
@@ -4421,7 +4538,66 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) | |||
4421 | return -EINVAL; | 4538 | return -EINVAL; |
4422 | 4539 | ||
4423 | vma_size = vma->vm_end - vma->vm_start; | 4540 | vma_size = vma->vm_end - vma->vm_start; |
4424 | nr_pages = (vma_size / PAGE_SIZE) - 1; | 4541 | |
4542 | if (vma->vm_pgoff == 0) { | ||
4543 | nr_pages = (vma_size / PAGE_SIZE) - 1; | ||
4544 | } else { | ||
4545 | /* | ||
4546 | * AUX area mapping: if rb->aux_nr_pages != 0, it's already | ||
4547 | * mapped, all subsequent mappings should have the same size | ||
4548 | * and offset. Must be above the normal perf buffer. | ||
4549 | */ | ||
4550 | u64 aux_offset, aux_size; | ||
4551 | |||
4552 | if (!event->rb) | ||
4553 | return -EINVAL; | ||
4554 | |||
4555 | nr_pages = vma_size / PAGE_SIZE; | ||
4556 | |||
4557 | mutex_lock(&event->mmap_mutex); | ||
4558 | ret = -EINVAL; | ||
4559 | |||
4560 | rb = event->rb; | ||
4561 | if (!rb) | ||
4562 | goto aux_unlock; | ||
4563 | |||
4564 | aux_offset = ACCESS_ONCE(rb->user_page->aux_offset); | ||
4565 | aux_size = ACCESS_ONCE(rb->user_page->aux_size); | ||
4566 | |||
4567 | if (aux_offset < perf_data_size(rb) + PAGE_SIZE) | ||
4568 | goto aux_unlock; | ||
4569 | |||
4570 | if (aux_offset != vma->vm_pgoff << PAGE_SHIFT) | ||
4571 | goto aux_unlock; | ||
4572 | |||
4573 | /* already mapped with a different offset */ | ||
4574 | if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff) | ||
4575 | goto aux_unlock; | ||
4576 | |||
4577 | if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE) | ||
4578 | goto aux_unlock; | ||
4579 | |||
4580 | /* already mapped with a different size */ | ||
4581 | if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages) | ||
4582 | goto aux_unlock; | ||
4583 | |||
4584 | if (!is_power_of_2(nr_pages)) | ||
4585 | goto aux_unlock; | ||
4586 | |||
4587 | if (!atomic_inc_not_zero(&rb->mmap_count)) | ||
4588 | goto aux_unlock; | ||
4589 | |||
4590 | if (rb_has_aux(rb)) { | ||
4591 | atomic_inc(&rb->aux_mmap_count); | ||
4592 | ret = 0; | ||
4593 | goto unlock; | ||
4594 | } | ||
4595 | |||
4596 | atomic_set(&rb->aux_mmap_count, 1); | ||
4597 | user_extra = nr_pages; | ||
4598 | |||
4599 | goto accounting; | ||
4600 | } | ||
4425 | 4601 | ||
4426 | /* | 4602 | /* |
4427 | * If we have rb pages ensure they're a power-of-two number, so we | 4603 | * If we have rb pages ensure they're a power-of-two number, so we |
@@ -4433,9 +4609,6 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) | |||
4433 | if (vma_size != PAGE_SIZE * (1 + nr_pages)) | 4609 | if (vma_size != PAGE_SIZE * (1 + nr_pages)) |
4434 | return -EINVAL; | 4610 | return -EINVAL; |
4435 | 4611 | ||
4436 | if (vma->vm_pgoff != 0) | ||
4437 | return -EINVAL; | ||
4438 | |||
4439 | WARN_ON_ONCE(event->ctx->parent_ctx); | 4612 | WARN_ON_ONCE(event->ctx->parent_ctx); |
4440 | again: | 4613 | again: |
4441 | mutex_lock(&event->mmap_mutex); | 4614 | mutex_lock(&event->mmap_mutex); |
@@ -4459,6 +4632,8 @@ again: | |||
4459 | } | 4632 | } |
4460 | 4633 | ||
4461 | user_extra = nr_pages + 1; | 4634 | user_extra = nr_pages + 1; |
4635 | |||
4636 | accounting: | ||
4462 | user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); | 4637 | user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); |
4463 | 4638 | ||
4464 | /* | 4639 | /* |
@@ -4468,7 +4643,6 @@ again: | |||
4468 | 4643 | ||
4469 | user_locked = atomic_long_read(&user->locked_vm) + user_extra; | 4644 | user_locked = atomic_long_read(&user->locked_vm) + user_extra; |
4470 | 4645 | ||
4471 | extra = 0; | ||
4472 | if (user_locked > user_lock_limit) | 4646 | if (user_locked > user_lock_limit) |
4473 | extra = user_locked - user_lock_limit; | 4647 | extra = user_locked - user_lock_limit; |
4474 | 4648 | ||
@@ -4482,35 +4656,46 @@ again: | |||
4482 | goto unlock; | 4656 | goto unlock; |
4483 | } | 4657 | } |
4484 | 4658 | ||
4485 | WARN_ON(event->rb); | 4659 | WARN_ON(!rb && event->rb); |
4486 | 4660 | ||
4487 | if (vma->vm_flags & VM_WRITE) | 4661 | if (vma->vm_flags & VM_WRITE) |
4488 | flags |= RING_BUFFER_WRITABLE; | 4662 | flags |= RING_BUFFER_WRITABLE; |
4489 | 4663 | ||
4490 | rb = rb_alloc(nr_pages, | ||
4491 | event->attr.watermark ? event->attr.wakeup_watermark : 0, | ||
4492 | event->cpu, flags); | ||
4493 | |||
4494 | if (!rb) { | 4664 | if (!rb) { |
4495 | ret = -ENOMEM; | 4665 | rb = rb_alloc(nr_pages, |
4496 | goto unlock; | 4666 | event->attr.watermark ? event->attr.wakeup_watermark : 0, |
4497 | } | 4667 | event->cpu, flags); |
4498 | 4668 | ||
4499 | atomic_set(&rb->mmap_count, 1); | 4669 | if (!rb) { |
4500 | rb->mmap_locked = extra; | 4670 | ret = -ENOMEM; |
4501 | rb->mmap_user = get_current_user(); | 4671 | goto unlock; |
4672 | } | ||
4502 | 4673 | ||
4503 | atomic_long_add(user_extra, &user->locked_vm); | 4674 | atomic_set(&rb->mmap_count, 1); |
4504 | vma->vm_mm->pinned_vm += extra; | 4675 | rb->mmap_user = get_current_user(); |
4676 | rb->mmap_locked = extra; | ||
4505 | 4677 | ||
4506 | ring_buffer_attach(event, rb); | 4678 | ring_buffer_attach(event, rb); |
4507 | 4679 | ||
4508 | perf_event_init_userpage(event); | 4680 | perf_event_init_userpage(event); |
4509 | perf_event_update_userpage(event); | 4681 | perf_event_update_userpage(event); |
4682 | } else { | ||
4683 | ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages, | ||
4684 | event->attr.aux_watermark, flags); | ||
4685 | if (!ret) | ||
4686 | rb->aux_mmap_locked = extra; | ||
4687 | } | ||
4510 | 4688 | ||
4511 | unlock: | 4689 | unlock: |
4512 | if (!ret) | 4690 | if (!ret) { |
4691 | atomic_long_add(user_extra, &user->locked_vm); | ||
4692 | vma->vm_mm->pinned_vm += extra; | ||
4693 | |||
4513 | atomic_inc(&event->mmap_count); | 4694 | atomic_inc(&event->mmap_count); |
4695 | } else if (rb) { | ||
4696 | atomic_dec(&rb->mmap_count); | ||
4697 | } | ||
4698 | aux_unlock: | ||
4514 | mutex_unlock(&event->mmap_mutex); | 4699 | mutex_unlock(&event->mmap_mutex); |
4515 | 4700 | ||
4516 | /* | 4701 | /* |
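Taken together, the perf_mmap() changes above define the user-space contract for the new AUX area: map the regular buffer at offset 0 as before, publish aux_offset/aux_size in the metadata page, then mmap the file again at that offset with a power-of-two number of pages, which lands in the rb_alloc_aux() path. A hedged sketch of that sequence (error handling trimmed; aux_offset/aux_size are metadata-page fields from the companion uapi change):

#include <linux/perf_event.h>
#include <stddef.h>
#include <sys/mman.h>
#include <unistd.h>

/* data_pages and aux_pages must both be powers of two. */
static void *map_aux_area(int fd, size_t data_pages, size_t aux_pages)
{
	size_t page = sysconf(_SC_PAGESIZE);
	struct perf_event_mmap_page *up;
	void *base, *aux;

	/* Metadata page + data buffer at offset 0, as before. */
	base = mmap(NULL, (data_pages + 1) * page,
		    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (base == MAP_FAILED)
		return NULL;
	up = base;

	/* Publish where the AUX area will live and how big it is. */
	up->aux_offset = (data_pages + 1) * page;
	up->aux_size   = aux_pages * page;

	/* Second mmap at aux_offset creates the AUX mapping. */
	aux = mmap(NULL, aux_pages * page, PROT_READ | PROT_WRITE,
		   MAP_SHARED, fd, (off_t)up->aux_offset);
	return aux == MAP_FAILED ? NULL : aux;
}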
@@ -4574,6 +4759,13 @@ static void perf_pending_event(struct irq_work *entry) | |||
4574 | { | 4759 | { |
4575 | struct perf_event *event = container_of(entry, | 4760 | struct perf_event *event = container_of(entry, |
4576 | struct perf_event, pending); | 4761 | struct perf_event, pending); |
4762 | int rctx; | ||
4763 | |||
4764 | rctx = perf_swevent_get_recursion_context(); | ||
4765 | /* | ||
4766 | * If we 'fail' here, that's OK, it means recursion is already disabled | ||
4767 | * and we won't recurse 'further'. | ||
4768 | */ | ||
4577 | 4769 | ||
4578 | if (event->pending_disable) { | 4770 | if (event->pending_disable) { |
4579 | event->pending_disable = 0; | 4771 | event->pending_disable = 0; |
@@ -4584,6 +4776,9 @@ static void perf_pending_event(struct irq_work *entry) | |||
4584 | event->pending_wakeup = 0; | 4776 | event->pending_wakeup = 0; |
4585 | perf_event_wakeup(event); | 4777 | perf_event_wakeup(event); |
4586 | } | 4778 | } |
4779 | |||
4780 | if (rctx >= 0) | ||
4781 | perf_swevent_put_recursion_context(rctx); | ||
4587 | } | 4782 | } |
4588 | 4783 | ||
4589 | /* | 4784 | /* |
@@ -4756,7 +4951,7 @@ static void __perf_event_header__init_id(struct perf_event_header *header, | |||
4756 | } | 4951 | } |
4757 | 4952 | ||
4758 | if (sample_type & PERF_SAMPLE_TIME) | 4953 | if (sample_type & PERF_SAMPLE_TIME) |
4759 | data->time = perf_clock(); | 4954 | data->time = perf_event_clock(event); |
4760 | 4955 | ||
4761 | if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER)) | 4956 | if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER)) |
4762 | data->id = primary_event_id(event); | 4957 | data->id = primary_event_id(event); |
@@ -5334,6 +5529,8 @@ static void perf_event_task_output(struct perf_event *event, | |||
5334 | task_event->event_id.tid = perf_event_tid(event, task); | 5529 | task_event->event_id.tid = perf_event_tid(event, task); |
5335 | task_event->event_id.ptid = perf_event_tid(event, current); | 5530 | task_event->event_id.ptid = perf_event_tid(event, current); |
5336 | 5531 | ||
5532 | task_event->event_id.time = perf_event_clock(event); | ||
5533 | |||
5337 | perf_output_put(&handle, task_event->event_id); | 5534 | perf_output_put(&handle, task_event->event_id); |
5338 | 5535 | ||
5339 | perf_event__output_id_sample(event, &handle, &sample); | 5536 | perf_event__output_id_sample(event, &handle, &sample); |
@@ -5367,7 +5564,7 @@ static void perf_event_task(struct task_struct *task, | |||
5367 | /* .ppid */ | 5564 | /* .ppid */ |
5368 | /* .tid */ | 5565 | /* .tid */ |
5369 | /* .ptid */ | 5566 | /* .ptid */ |
5370 | .time = perf_clock(), | 5567 | /* .time */ |
5371 | }, | 5568 | }, |
5372 | }; | 5569 | }; |
5373 | 5570 | ||
@@ -5722,6 +5919,40 @@ void perf_event_mmap(struct vm_area_struct *vma) | |||
5722 | perf_event_mmap_event(&mmap_event); | 5919 | perf_event_mmap_event(&mmap_event); |
5723 | } | 5920 | } |
5724 | 5921 | ||
5922 | void perf_event_aux_event(struct perf_event *event, unsigned long head, | ||
5923 | unsigned long size, u64 flags) | ||
5924 | { | ||
5925 | struct perf_output_handle handle; | ||
5926 | struct perf_sample_data sample; | ||
5927 | struct perf_aux_event { | ||
5928 | struct perf_event_header header; | ||
5929 | u64 offset; | ||
5930 | u64 size; | ||
5931 | u64 flags; | ||
5932 | } rec = { | ||
5933 | .header = { | ||
5934 | .type = PERF_RECORD_AUX, | ||
5935 | .misc = 0, | ||
5936 | .size = sizeof(rec), | ||
5937 | }, | ||
5938 | .offset = head, | ||
5939 | .size = size, | ||
5940 | .flags = flags, | ||
5941 | }; | ||
5942 | int ret; | ||
5943 | |||
5944 | perf_event_header__init_id(&rec.header, &sample, event); | ||
5945 | ret = perf_output_begin(&handle, event, rec.header.size); | ||
5946 | |||
5947 | if (ret) | ||
5948 | return; | ||
5949 | |||
5950 | perf_output_put(&handle, rec); | ||
5951 | perf_event__output_id_sample(event, &handle, &sample); | ||
5952 | |||
5953 | perf_output_end(&handle); | ||
5954 | } | ||
5955 | |||
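For consumers, the record emitted above is just the header plus three u64 fields, with the usual sample_id trailer appended by perf_event__output_id_sample(). A sketch of the layout (the struct name is illustrative; the PERF_RECORD_AUX constant comes from the matching uapi change):

#include <linux/perf_event.h>
#include <linux/types.h>

struct perf_record_aux {
	struct perf_event_header header;	/* .type == PERF_RECORD_AUX */
	__u64 offset;				/* where the new data starts in the AUX area */
	__u64 size;				/* how much new data was written */
	__u64 flags;				/* e.g. a truncation/overwrite indicator */
	/* followed by the sample_id trailer when sample_id_all is set */
};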
5725 | /* | 5956 | /* |
5726 | * IRQ throttle logging | 5957 | * IRQ throttle logging |
5727 | */ | 5958 | */ |
@@ -5743,7 +5974,7 @@ static void perf_log_throttle(struct perf_event *event, int enable) | |||
5743 | .misc = 0, | 5974 | .misc = 0, |
5744 | .size = sizeof(throttle_event), | 5975 | .size = sizeof(throttle_event), |
5745 | }, | 5976 | }, |
5746 | .time = perf_clock(), | 5977 | .time = perf_event_clock(event), |
5747 | .id = primary_event_id(event), | 5978 | .id = primary_event_id(event), |
5748 | .stream_id = event->id, | 5979 | .stream_id = event->id, |
5749 | }; | 5980 | }; |
@@ -5763,6 +5994,44 @@ static void perf_log_throttle(struct perf_event *event, int enable) | |||
5763 | perf_output_end(&handle); | 5994 | perf_output_end(&handle); |
5764 | } | 5995 | } |
5765 | 5996 | ||
5997 | static void perf_log_itrace_start(struct perf_event *event) | ||
5998 | { | ||
5999 | struct perf_output_handle handle; | ||
6000 | struct perf_sample_data sample; | ||
6001 | struct perf_aux_event { | ||
6002 | struct perf_event_header header; | ||
6003 | u32 pid; | ||
6004 | u32 tid; | ||
6005 | } rec; | ||
6006 | int ret; | ||
6007 | |||
6008 | if (event->parent) | ||
6009 | event = event->parent; | ||
6010 | |||
6011 | if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) || | ||
6012 | event->hw.itrace_started) | ||
6013 | return; | ||
6014 | |||
6015 | event->hw.itrace_started = 1; | ||
6016 | |||
6017 | rec.header.type = PERF_RECORD_ITRACE_START; | ||
6018 | rec.header.misc = 0; | ||
6019 | rec.header.size = sizeof(rec); | ||
6020 | rec.pid = perf_event_pid(event, current); | ||
6021 | rec.tid = perf_event_tid(event, current); | ||
6022 | |||
6023 | perf_event_header__init_id(&rec.header, &sample, event); | ||
6024 | ret = perf_output_begin(&handle, event, rec.header.size); | ||
6025 | |||
6026 | if (ret) | ||
6027 | return; | ||
6028 | |||
6029 | perf_output_put(&handle, rec); | ||
6030 | perf_event__output_id_sample(event, &handle, &sample); | ||
6031 | |||
6032 | perf_output_end(&handle); | ||
6033 | } | ||
6034 | |||
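The corresponding record a reader sees carries only the pid/tid of the task that started instruction tracing (struct name illustrative; PERF_RECORD_ITRACE_START is defined by the matching uapi change):

#include <linux/perf_event.h>
#include <linux/types.h>

struct perf_record_itrace_start {
	struct perf_event_header header;	/* .type == PERF_RECORD_ITRACE_START */
	__u32 pid;				/* task that started tracing */
	__u32 tid;
	/* followed by the sample_id trailer when requested */
};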
5766 | /* | 6035 | /* |
5767 | * Generic event overflow handling, sampling. | 6036 | * Generic event overflow handling, sampling. |
5768 | */ | 6037 | */ |
@@ -6123,6 +6392,7 @@ static int perf_swevent_add(struct perf_event *event, int flags) | |||
6123 | } | 6392 | } |
6124 | 6393 | ||
6125 | hlist_add_head_rcu(&event->hlist_entry, head); | 6394 | hlist_add_head_rcu(&event->hlist_entry, head); |
6395 | perf_event_update_userpage(event); | ||
6126 | 6396 | ||
6127 | return 0; | 6397 | return 0; |
6128 | } | 6398 | } |
@@ -6286,6 +6556,8 @@ static int perf_swevent_init(struct perf_event *event) | |||
6286 | static struct pmu perf_swevent = { | 6556 | static struct pmu perf_swevent = { |
6287 | .task_ctx_nr = perf_sw_context, | 6557 | .task_ctx_nr = perf_sw_context, |
6288 | 6558 | ||
6559 | .capabilities = PERF_PMU_CAP_NO_NMI, | ||
6560 | |||
6289 | .event_init = perf_swevent_init, | 6561 | .event_init = perf_swevent_init, |
6290 | .add = perf_swevent_add, | 6562 | .add = perf_swevent_add, |
6291 | .del = perf_swevent_del, | 6563 | .del = perf_swevent_del, |
@@ -6439,6 +6711,49 @@ static void perf_event_free_filter(struct perf_event *event) | |||
6439 | ftrace_profile_free_filter(event); | 6711 | ftrace_profile_free_filter(event); |
6440 | } | 6712 | } |
6441 | 6713 | ||
6714 | static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) | ||
6715 | { | ||
6716 | struct bpf_prog *prog; | ||
6717 | |||
6718 | if (event->attr.type != PERF_TYPE_TRACEPOINT) | ||
6719 | return -EINVAL; | ||
6720 | |||
6721 | if (event->tp_event->prog) | ||
6722 | return -EEXIST; | ||
6723 | |||
6724 | if (!(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) | ||
6725 | /* bpf programs can only be attached to kprobes */ | ||
6726 | return -EINVAL; | ||
6727 | |||
6728 | prog = bpf_prog_get(prog_fd); | ||
6729 | if (IS_ERR(prog)) | ||
6730 | return PTR_ERR(prog); | ||
6731 | |||
6732 | if (prog->type != BPF_PROG_TYPE_KPROBE) { | ||
6733 | /* valid fd, but invalid bpf program type */ | ||
6734 | bpf_prog_put(prog); | ||
6735 | return -EINVAL; | ||
6736 | } | ||
6737 | |||
6738 | event->tp_event->prog = prog; | ||
6739 | |||
6740 | return 0; | ||
6741 | } | ||
6742 | |||
6743 | static void perf_event_free_bpf_prog(struct perf_event *event) | ||
6744 | { | ||
6745 | struct bpf_prog *prog; | ||
6746 | |||
6747 | if (!event->tp_event) | ||
6748 | return; | ||
6749 | |||
6750 | prog = event->tp_event->prog; | ||
6751 | if (prog) { | ||
6752 | event->tp_event->prog = NULL; | ||
6753 | bpf_prog_put(prog); | ||
6754 | } | ||
6755 | } | ||
6756 | |||
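The new ioctl path above attaches a BPF_PROG_TYPE_KPROBE program to a kprobe-backed tracepoint event, refusing any other event or program type. From user space the flow is roughly the following (a hedged sketch; prog_fd is assumed to come from a bpf(BPF_PROG_LOAD, ...) call, and PERF_EVENT_IOC_SET_BPF is the ioctl number added by the matching uapi change):

#include <linux/perf_event.h>
#include <sys/ioctl.h>

/* Attach a kprobe BPF program to a kprobe perf event. */
static int attach_bpf_to_kprobe(int event_fd, int prog_fd)
{
	/* The kernel rejects this with EEXIST if a program is already
	 * attached, and EINVAL if the event is not a kprobe tracepoint
	 * or the program has the wrong type. */
	return ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
}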
6442 | #else | 6757 | #else |
6443 | 6758 | ||
6444 | static inline void perf_tp_register(void) | 6759 | static inline void perf_tp_register(void) |
@@ -6454,6 +6769,14 @@ static void perf_event_free_filter(struct perf_event *event) | |||
6454 | { | 6769 | { |
6455 | } | 6770 | } |
6456 | 6771 | ||
6772 | static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) | ||
6773 | { | ||
6774 | return -ENOENT; | ||
6775 | } | ||
6776 | |||
6777 | static void perf_event_free_bpf_prog(struct perf_event *event) | ||
6778 | { | ||
6779 | } | ||
6457 | #endif /* CONFIG_EVENT_TRACING */ | 6780 | #endif /* CONFIG_EVENT_TRACING */ |
6458 | 6781 | ||
6459 | #ifdef CONFIG_HAVE_HW_BREAKPOINT | 6782 | #ifdef CONFIG_HAVE_HW_BREAKPOINT |
@@ -6592,6 +6915,7 @@ static int cpu_clock_event_add(struct perf_event *event, int flags) | |||
6592 | { | 6915 | { |
6593 | if (flags & PERF_EF_START) | 6916 | if (flags & PERF_EF_START) |
6594 | cpu_clock_event_start(event, flags); | 6917 | cpu_clock_event_start(event, flags); |
6918 | perf_event_update_userpage(event); | ||
6595 | 6919 | ||
6596 | return 0; | 6920 | return 0; |
6597 | } | 6921 | } |
@@ -6628,6 +6952,8 @@ static int cpu_clock_event_init(struct perf_event *event) | |||
6628 | static struct pmu perf_cpu_clock = { | 6952 | static struct pmu perf_cpu_clock = { |
6629 | .task_ctx_nr = perf_sw_context, | 6953 | .task_ctx_nr = perf_sw_context, |
6630 | 6954 | ||
6955 | .capabilities = PERF_PMU_CAP_NO_NMI, | ||
6956 | |||
6631 | .event_init = cpu_clock_event_init, | 6957 | .event_init = cpu_clock_event_init, |
6632 | .add = cpu_clock_event_add, | 6958 | .add = cpu_clock_event_add, |
6633 | .del = cpu_clock_event_del, | 6959 | .del = cpu_clock_event_del, |
@@ -6666,6 +6992,7 @@ static int task_clock_event_add(struct perf_event *event, int flags) | |||
6666 | { | 6992 | { |
6667 | if (flags & PERF_EF_START) | 6993 | if (flags & PERF_EF_START) |
6668 | task_clock_event_start(event, flags); | 6994 | task_clock_event_start(event, flags); |
6995 | perf_event_update_userpage(event); | ||
6669 | 6996 | ||
6670 | return 0; | 6997 | return 0; |
6671 | } | 6998 | } |
@@ -6706,6 +7033,8 @@ static int task_clock_event_init(struct perf_event *event) | |||
6706 | static struct pmu perf_task_clock = { | 7033 | static struct pmu perf_task_clock = { |
6707 | .task_ctx_nr = perf_sw_context, | 7034 | .task_ctx_nr = perf_sw_context, |
6708 | 7035 | ||
7036 | .capabilities = PERF_PMU_CAP_NO_NMI, | ||
7037 | |||
6709 | .event_init = task_clock_event_init, | 7038 | .event_init = task_clock_event_init, |
6710 | .add = task_clock_event_add, | 7039 | .add = task_clock_event_add, |
6711 | .del = task_clock_event_del, | 7040 | .del = task_clock_event_del, |
@@ -6983,6 +7312,7 @@ got_cpu_context: | |||
6983 | pmu->event_idx = perf_event_idx_default; | 7312 | pmu->event_idx = perf_event_idx_default; |
6984 | 7313 | ||
6985 | list_add_rcu(&pmu->entry, &pmus); | 7314 | list_add_rcu(&pmu->entry, &pmus); |
7315 | atomic_set(&pmu->exclusive_cnt, 0); | ||
6986 | ret = 0; | 7316 | ret = 0; |
6987 | unlock: | 7317 | unlock: |
6988 | mutex_unlock(&pmus_lock); | 7318 | mutex_unlock(&pmus_lock); |
@@ -7027,12 +7357,23 @@ EXPORT_SYMBOL_GPL(perf_pmu_unregister); | |||
7027 | 7357 | ||
7028 | static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) | 7358 | static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) |
7029 | { | 7359 | { |
7360 | struct perf_event_context *ctx = NULL; | ||
7030 | int ret; | 7361 | int ret; |
7031 | 7362 | ||
7032 | if (!try_module_get(pmu->module)) | 7363 | if (!try_module_get(pmu->module)) |
7033 | return -ENODEV; | 7364 | return -ENODEV; |
7365 | |||
7366 | if (event->group_leader != event) { | ||
7367 | ctx = perf_event_ctx_lock(event->group_leader); | ||
7368 | BUG_ON(!ctx); | ||
7369 | } | ||
7370 | |||
7034 | event->pmu = pmu; | 7371 | event->pmu = pmu; |
7035 | ret = pmu->event_init(event); | 7372 | ret = pmu->event_init(event); |
7373 | |||
7374 | if (ctx) | ||
7375 | perf_event_ctx_unlock(event->group_leader, ctx); | ||
7376 | |||
7036 | if (ret) | 7377 | if (ret) |
7037 | module_put(pmu->module); | 7378 | module_put(pmu->module); |
7038 | 7379 | ||
@@ -7079,10 +7420,6 @@ static void account_event_cpu(struct perf_event *event, int cpu) | |||
7079 | if (event->parent) | 7420 | if (event->parent) |
7080 | return; | 7421 | return; |
7081 | 7422 | ||
7082 | if (has_branch_stack(event)) { | ||
7083 | if (!(event->attach_state & PERF_ATTACH_TASK)) | ||
7084 | atomic_inc(&per_cpu(perf_branch_stack_events, cpu)); | ||
7085 | } | ||
7086 | if (is_cgroup_event(event)) | 7423 | if (is_cgroup_event(event)) |
7087 | atomic_inc(&per_cpu(perf_cgroup_events, cpu)); | 7424 | atomic_inc(&per_cpu(perf_cgroup_events, cpu)); |
7088 | } | 7425 | } |
@@ -7121,7 +7458,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, | |||
7121 | struct perf_event *group_leader, | 7458 | struct perf_event *group_leader, |
7122 | struct perf_event *parent_event, | 7459 | struct perf_event *parent_event, |
7123 | perf_overflow_handler_t overflow_handler, | 7460 | perf_overflow_handler_t overflow_handler, |
7124 | void *context) | 7461 | void *context, int cgroup_fd) |
7125 | { | 7462 | { |
7126 | struct pmu *pmu; | 7463 | struct pmu *pmu; |
7127 | struct perf_event *event; | 7464 | struct perf_event *event; |
@@ -7176,18 +7513,18 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, | |||
7176 | 7513 | ||
7177 | if (task) { | 7514 | if (task) { |
7178 | event->attach_state = PERF_ATTACH_TASK; | 7515 | event->attach_state = PERF_ATTACH_TASK; |
7179 | |||
7180 | if (attr->type == PERF_TYPE_TRACEPOINT) | ||
7181 | event->hw.tp_target = task; | ||
7182 | #ifdef CONFIG_HAVE_HW_BREAKPOINT | ||
7183 | /* | 7516 | /* |
7184 | * hw_breakpoint is a bit difficult here.. | 7517 | * XXX pmu::event_init needs to know what task to account to |
7518 | * and we cannot use the ctx information because we need the | ||
7519 | * pmu before we get a ctx. | ||
7185 | */ | 7520 | */ |
7186 | else if (attr->type == PERF_TYPE_BREAKPOINT) | 7521 | event->hw.target = task; |
7187 | event->hw.bp_target = task; | ||
7188 | #endif | ||
7189 | } | 7522 | } |
7190 | 7523 | ||
7524 | event->clock = &local_clock; | ||
7525 | if (parent_event) | ||
7526 | event->clock = parent_event->clock; | ||
7527 | |||
7191 | if (!overflow_handler && parent_event) { | 7528 | if (!overflow_handler && parent_event) { |
7192 | overflow_handler = parent_event->overflow_handler; | 7529 | overflow_handler = parent_event->overflow_handler; |
7193 | context = parent_event->overflow_handler_context; | 7530 | context = parent_event->overflow_handler_context; |
@@ -7214,6 +7551,15 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, | |||
7214 | if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) | 7551 | if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) |
7215 | goto err_ns; | 7552 | goto err_ns; |
7216 | 7553 | ||
7554 | if (!has_branch_stack(event)) | ||
7555 | event->attr.branch_sample_type = 0; | ||
7556 | |||
7557 | if (cgroup_fd != -1) { | ||
7558 | err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader); | ||
7559 | if (err) | ||
7560 | goto err_ns; | ||
7561 | } | ||
7562 | |||
7217 | pmu = perf_init_event(event); | 7563 | pmu = perf_init_event(event); |
7218 | if (!pmu) | 7564 | if (!pmu) |
7219 | goto err_ns; | 7565 | goto err_ns; |
@@ -7222,21 +7568,30 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, | |||
7222 | goto err_ns; | 7568 | goto err_ns; |
7223 | } | 7569 | } |
7224 | 7570 | ||
7571 | err = exclusive_event_init(event); | ||
7572 | if (err) | ||
7573 | goto err_pmu; | ||
7574 | |||
7225 | if (!event->parent) { | 7575 | if (!event->parent) { |
7226 | if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { | 7576 | if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { |
7227 | err = get_callchain_buffers(); | 7577 | err = get_callchain_buffers(); |
7228 | if (err) | 7578 | if (err) |
7229 | goto err_pmu; | 7579 | goto err_per_task; |
7230 | } | 7580 | } |
7231 | } | 7581 | } |
7232 | 7582 | ||
7233 | return event; | 7583 | return event; |
7234 | 7584 | ||
7585 | err_per_task: | ||
7586 | exclusive_event_destroy(event); | ||
7587 | |||
7235 | err_pmu: | 7588 | err_pmu: |
7236 | if (event->destroy) | 7589 | if (event->destroy) |
7237 | event->destroy(event); | 7590 | event->destroy(event); |
7238 | module_put(pmu->module); | 7591 | module_put(pmu->module); |
7239 | err_ns: | 7592 | err_ns: |
7593 | if (is_cgroup_event(event)) | ||
7594 | perf_detach_cgroup(event); | ||
7240 | if (event->ns) | 7595 | if (event->ns) |
7241 | put_pid_ns(event->ns); | 7596 | put_pid_ns(event->ns); |
7242 | kfree(event); | 7597 | kfree(event); |
@@ -7399,6 +7754,19 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event) | |||
7399 | if (output_event->cpu == -1 && output_event->ctx != event->ctx) | 7754 | if (output_event->cpu == -1 && output_event->ctx != event->ctx) |
7400 | goto out; | 7755 | goto out; |
7401 | 7756 | ||
7757 | /* | ||
7758 | * Mixing clocks in the same buffer is trouble you don't need. | ||
7759 | */ | ||
7760 | if (output_event->clock != event->clock) | ||
7761 | goto out; | ||
7762 | |||
7763 | /* | ||
7764 | * If both events generate aux data, they must be on the same PMU | ||
7765 | */ | ||
7766 | if (has_aux(event) && has_aux(output_event) && | ||
7767 | event->pmu != output_event->pmu) | ||
7768 | goto out; | ||
7769 | |||
7402 | set: | 7770 | set: |
7403 | mutex_lock(&event->mmap_mutex); | 7771 | mutex_lock(&event->mmap_mutex); |
7404 | /* Can't redirect output if we've got an active mmap() */ | 7772 | /* Can't redirect output if we've got an active mmap() */ |
@@ -7431,6 +7799,43 @@ static void mutex_lock_double(struct mutex *a, struct mutex *b) | |||
7431 | mutex_lock_nested(b, SINGLE_DEPTH_NESTING); | 7799 | mutex_lock_nested(b, SINGLE_DEPTH_NESTING); |
7432 | } | 7800 | } |
7433 | 7801 | ||
7802 | static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id) | ||
7803 | { | ||
7804 | bool nmi_safe = false; | ||
7805 | |||
7806 | switch (clk_id) { | ||
7807 | case CLOCK_MONOTONIC: | ||
7808 | event->clock = &ktime_get_mono_fast_ns; | ||
7809 | nmi_safe = true; | ||
7810 | break; | ||
7811 | |||
7812 | case CLOCK_MONOTONIC_RAW: | ||
7813 | event->clock = &ktime_get_raw_fast_ns; | ||
7814 | nmi_safe = true; | ||
7815 | break; | ||
7816 | |||
7817 | case CLOCK_REALTIME: | ||
7818 | event->clock = &ktime_get_real_ns; | ||
7819 | break; | ||
7820 | |||
7821 | case CLOCK_BOOTTIME: | ||
7822 | event->clock = &ktime_get_boot_ns; | ||
7823 | break; | ||
7824 | |||
7825 | case CLOCK_TAI: | ||
7826 | event->clock = &ktime_get_tai_ns; | ||
7827 | break; | ||
7828 | |||
7829 | default: | ||
7830 | return -EINVAL; | ||
7831 | } | ||
7832 | |||
7833 | if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI)) | ||
7834 | return -EINVAL; | ||
7835 | |||
7836 | return 0; | ||
7837 | } | ||
7838 | |||
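perf_event_set_clock() above lets user space choose which clock stamps the event's records, and rejects clocks that are not NMI-safe unless the PMU advertises PERF_PMU_CAP_NO_NMI. A sketch of the user-space side (the use_clockid/clockid attribute fields come from the matching uapi change, not shown in this hunk):

#include <linux/perf_event.h>
#include <string.h>
#include <sys/syscall.h>
#include <time.h>
#include <unistd.h>

/* Open a software event whose timestamps use CLOCK_MONOTONIC. */
static int open_monotonic_event(void)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_CPU_CLOCK;
	attr.use_clockid = 1;			/* new attribute bit */
	attr.clockid = CLOCK_MONOTONIC;		/* NMI-safe, so accepted for any PMU */

	return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
}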
7434 | /** | 7839 | /** |
7435 | * sys_perf_event_open - open a performance event, associate it to a task/cpu | 7840 | * sys_perf_event_open - open a performance event, associate it to a task/cpu |
7436 | * | 7841 | * |
@@ -7455,6 +7860,7 @@ SYSCALL_DEFINE5(perf_event_open, | |||
7455 | int move_group = 0; | 7860 | int move_group = 0; |
7456 | int err; | 7861 | int err; |
7457 | int f_flags = O_RDWR; | 7862 | int f_flags = O_RDWR; |
7863 | int cgroup_fd = -1; | ||
7458 | 7864 | ||
7459 | /* for future expandability... */ | 7865 | /* for future expandability... */ |
7460 | if (flags & ~PERF_FLAG_ALL) | 7866 | if (flags & ~PERF_FLAG_ALL) |
@@ -7520,21 +7926,16 @@ SYSCALL_DEFINE5(perf_event_open, | |||
7520 | 7926 | ||
7521 | get_online_cpus(); | 7927 | get_online_cpus(); |
7522 | 7928 | ||
7929 | if (flags & PERF_FLAG_PID_CGROUP) | ||
7930 | cgroup_fd = pid; | ||
7931 | |||
7523 | event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, | 7932 | event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, |
7524 | NULL, NULL); | 7933 | NULL, NULL, cgroup_fd); |
7525 | if (IS_ERR(event)) { | 7934 | if (IS_ERR(event)) { |
7526 | err = PTR_ERR(event); | 7935 | err = PTR_ERR(event); |
7527 | goto err_cpus; | 7936 | goto err_cpus; |
7528 | } | 7937 | } |
7529 | 7938 | ||
7530 | if (flags & PERF_FLAG_PID_CGROUP) { | ||
7531 | err = perf_cgroup_connect(pid, event, &attr, group_leader); | ||
7532 | if (err) { | ||
7533 | __free_event(event); | ||
7534 | goto err_cpus; | ||
7535 | } | ||
7536 | } | ||
7537 | |||
7538 | if (is_sampling_event(event)) { | 7939 | if (is_sampling_event(event)) { |
7539 | if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) { | 7940 | if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) { |
7540 | err = -ENOTSUPP; | 7941 | err = -ENOTSUPP; |
@@ -7550,6 +7951,12 @@ SYSCALL_DEFINE5(perf_event_open, | |||
7550 | */ | 7951 | */ |
7551 | pmu = event->pmu; | 7952 | pmu = event->pmu; |
7552 | 7953 | ||
7954 | if (attr.use_clockid) { | ||
7955 | err = perf_event_set_clock(event, attr.clockid); | ||
7956 | if (err) | ||
7957 | goto err_alloc; | ||
7958 | } | ||
7959 | |||
7553 | if (group_leader && | 7960 | if (group_leader && |
7554 | (is_software_event(event) != is_software_event(group_leader))) { | 7961 | (is_software_event(event) != is_software_event(group_leader))) { |
7555 | if (is_software_event(event)) { | 7962 | if (is_software_event(event)) { |
@@ -7576,12 +7983,17 @@ SYSCALL_DEFINE5(perf_event_open, | |||
7576 | /* | 7983 | /* |
7577 | * Get the target context (task or percpu): | 7984 | * Get the target context (task or percpu): |
7578 | */ | 7985 | */ |
7579 | ctx = find_get_context(pmu, task, event->cpu); | 7986 | ctx = find_get_context(pmu, task, event); |
7580 | if (IS_ERR(ctx)) { | 7987 | if (IS_ERR(ctx)) { |
7581 | err = PTR_ERR(ctx); | 7988 | err = PTR_ERR(ctx); |
7582 | goto err_alloc; | 7989 | goto err_alloc; |
7583 | } | 7990 | } |
7584 | 7991 | ||
7992 | if ((pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && group_leader) { | ||
7993 | err = -EBUSY; | ||
7994 | goto err_context; | ||
7995 | } | ||
7996 | |||
7585 | if (task) { | 7997 | if (task) { |
7586 | put_task_struct(task); | 7998 | put_task_struct(task); |
7587 | task = NULL; | 7999 | task = NULL; |
@@ -7599,6 +8011,11 @@ SYSCALL_DEFINE5(perf_event_open, | |||
7599 | */ | 8011 | */ |
7600 | if (group_leader->group_leader != group_leader) | 8012 | if (group_leader->group_leader != group_leader) |
7601 | goto err_context; | 8013 | goto err_context; |
8014 | |||
8015 | /* All events in a group should have the same clock */ | ||
8016 | if (group_leader->clock != event->clock) | ||
8017 | goto err_context; | ||
8018 | |||
7602 | /* | 8019 | /* |
7603 | * Do not allow to attach to a group in a different | 8020 | * Do not allow to attach to a group in a different |
7604 | * task or CPU context: | 8021 | * task or CPU context: |
@@ -7699,6 +8116,13 @@ SYSCALL_DEFINE5(perf_event_open, | |||
7699 | get_ctx(ctx); | 8116 | get_ctx(ctx); |
7700 | } | 8117 | } |
7701 | 8118 | ||
8119 | if (!exclusive_event_installable(event, ctx)) { | ||
8120 | err = -EBUSY; | ||
8121 | mutex_unlock(&ctx->mutex); | ||
8122 | fput(event_file); | ||
8123 | goto err_context; | ||
8124 | } | ||
8125 | |||
7702 | perf_install_in_context(ctx, event, event->cpu); | 8126 | perf_install_in_context(ctx, event, event->cpu); |
7703 | perf_unpin_context(ctx); | 8127 | perf_unpin_context(ctx); |
7704 | 8128 | ||
@@ -7771,7 +8195,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, | |||
7771 | */ | 8195 | */ |
7772 | 8196 | ||
7773 | event = perf_event_alloc(attr, cpu, task, NULL, NULL, | 8197 | event = perf_event_alloc(attr, cpu, task, NULL, NULL, |
7774 | overflow_handler, context); | 8198 | overflow_handler, context, -1); |
7775 | if (IS_ERR(event)) { | 8199 | if (IS_ERR(event)) { |
7776 | err = PTR_ERR(event); | 8200 | err = PTR_ERR(event); |
7777 | goto err; | 8201 | goto err; |
@@ -7782,7 +8206,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, | |||
7782 | 8206 | ||
7783 | account_event(event); | 8207 | account_event(event); |
7784 | 8208 | ||
7785 | ctx = find_get_context(event->pmu, task, cpu); | 8209 | ctx = find_get_context(event->pmu, task, event); |
7786 | if (IS_ERR(ctx)) { | 8210 | if (IS_ERR(ctx)) { |
7787 | err = PTR_ERR(ctx); | 8211 | err = PTR_ERR(ctx); |
7788 | goto err_free; | 8212 | goto err_free; |
@@ -7790,6 +8214,14 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, | |||
7790 | 8214 | ||
7791 | WARN_ON_ONCE(ctx->parent_ctx); | 8215 | WARN_ON_ONCE(ctx->parent_ctx); |
7792 | mutex_lock(&ctx->mutex); | 8216 | mutex_lock(&ctx->mutex); |
8217 | if (!exclusive_event_installable(event, ctx)) { | ||
8218 | mutex_unlock(&ctx->mutex); | ||
8219 | perf_unpin_context(ctx); | ||
8220 | put_ctx(ctx); | ||
8221 | err = -EBUSY; | ||
8222 | goto err_free; | ||
8223 | } | ||
8224 | |||
7793 | perf_install_in_context(ctx, event, cpu); | 8225 | perf_install_in_context(ctx, event, cpu); |
7794 | perf_unpin_context(ctx); | 8226 | perf_unpin_context(ctx); |
7795 | mutex_unlock(&ctx->mutex); | 8227 | mutex_unlock(&ctx->mutex); |
@@ -8132,7 +8564,7 @@ inherit_event(struct perf_event *parent_event, | |||
8132 | parent_event->cpu, | 8564 | parent_event->cpu, |
8133 | child, | 8565 | child, |
8134 | group_leader, parent_event, | 8566 | group_leader, parent_event, |
8135 | NULL, NULL); | 8567 | NULL, NULL, -1); |
8136 | if (IS_ERR(child_event)) | 8568 | if (IS_ERR(child_event)) |
8137 | return child_event; | 8569 | return child_event; |
8138 | 8570 | ||
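The attr.use_clockid/attr.clockid fields validated by perf_event_set_clock() above are set from userspace through perf_event_open(). A minimal sketch, assuming uapi headers that already carry these fields; which clocks are accepted is decided by perf_event_set_clock(), which is not part of this hunk:

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>
#include <time.h>

/* Hedged example: open a software counter whose timestamps use
 * CLOCK_MONOTONIC_RAW instead of the default perf clock. */
int main(void)
{
	struct perf_event_attr attr;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_CPU_CLOCK;
	attr.use_clockid = 1;                   /* request a POSIX clock */
	attr.clockid = CLOCK_MONOTONIC_RAW;     /* checked by perf_event_set_clock() */

	fd = syscall(__NR_perf_event_open, &attr, 0 /* self */, -1 /* any cpu */,
		     -1 /* no group */, 0 /* no flags */);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}
	close(fd);
	return 0;
}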
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 9803a6600d49..92ce5f4ccc26 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c | |||
@@ -116,12 +116,12 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) | |||
116 | */ | 116 | */ |
117 | static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type) | 117 | static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type) |
118 | { | 118 | { |
119 | struct task_struct *tsk = bp->hw.bp_target; | 119 | struct task_struct *tsk = bp->hw.target; |
120 | struct perf_event *iter; | 120 | struct perf_event *iter; |
121 | int count = 0; | 121 | int count = 0; |
122 | 122 | ||
123 | list_for_each_entry(iter, &bp_task_head, hw.bp_list) { | 123 | list_for_each_entry(iter, &bp_task_head, hw.bp_list) { |
124 | if (iter->hw.bp_target == tsk && | 124 | if (iter->hw.target == tsk && |
125 | find_slot_idx(iter) == type && | 125 | find_slot_idx(iter) == type && |
126 | (iter->cpu < 0 || cpu == iter->cpu)) | 126 | (iter->cpu < 0 || cpu == iter->cpu)) |
127 | count += hw_breakpoint_weight(iter); | 127 | count += hw_breakpoint_weight(iter); |
@@ -153,7 +153,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, | |||
153 | int nr; | 153 | int nr; |
154 | 154 | ||
155 | nr = info->cpu_pinned; | 155 | nr = info->cpu_pinned; |
156 | if (!bp->hw.bp_target) | 156 | if (!bp->hw.target) |
157 | nr += max_task_bp_pinned(cpu, type); | 157 | nr += max_task_bp_pinned(cpu, type); |
158 | else | 158 | else |
159 | nr += task_bp_pinned(cpu, bp, type); | 159 | nr += task_bp_pinned(cpu, bp, type); |
@@ -210,7 +210,7 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, | |||
210 | weight = -weight; | 210 | weight = -weight; |
211 | 211 | ||
212 | /* Pinned counter cpu profiling */ | 212 | /* Pinned counter cpu profiling */ |
213 | if (!bp->hw.bp_target) { | 213 | if (!bp->hw.target) { |
214 | get_bp_info(bp->cpu, type)->cpu_pinned += weight; | 214 | get_bp_info(bp->cpu, type)->cpu_pinned += weight; |
215 | return; | 215 | return; |
216 | } | 216 | } |
diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 569b218782ad..9f6ce9ba4a04 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h | |||
@@ -27,6 +27,7 @@ struct ring_buffer { | |||
27 | local_t lost; /* nr records lost */ | 27 | local_t lost; /* nr records lost */ |
28 | 28 | ||
29 | long watermark; /* wakeup watermark */ | 29 | long watermark; /* wakeup watermark */ |
30 | long aux_watermark; | ||
30 | /* poll crap */ | 31 | /* poll crap */ |
31 | spinlock_t event_lock; | 32 | spinlock_t event_lock; |
32 | struct list_head event_list; | 33 | struct list_head event_list; |
@@ -35,6 +36,20 @@ struct ring_buffer { | |||
35 | unsigned long mmap_locked; | 36 | unsigned long mmap_locked; |
36 | struct user_struct *mmap_user; | 37 | struct user_struct *mmap_user; |
37 | 38 | ||
39 | /* AUX area */ | ||
40 | local_t aux_head; | ||
41 | local_t aux_nest; | ||
42 | local_t aux_wakeup; | ||
43 | unsigned long aux_pgoff; | ||
44 | int aux_nr_pages; | ||
45 | int aux_overwrite; | ||
46 | atomic_t aux_mmap_count; | ||
47 | unsigned long aux_mmap_locked; | ||
48 | void (*free_aux)(void *); | ||
49 | atomic_t aux_refcount; | ||
50 | void **aux_pages; | ||
51 | void *aux_priv; | ||
52 | |||
38 | struct perf_event_mmap_page *user_page; | 53 | struct perf_event_mmap_page *user_page; |
39 | void *data_pages[0]; | 54 | void *data_pages[0]; |
40 | }; | 55 | }; |
@@ -43,6 +58,19 @@ extern void rb_free(struct ring_buffer *rb); | |||
43 | extern struct ring_buffer * | 58 | extern struct ring_buffer * |
44 | rb_alloc(int nr_pages, long watermark, int cpu, int flags); | 59 | rb_alloc(int nr_pages, long watermark, int cpu, int flags); |
45 | extern void perf_event_wakeup(struct perf_event *event); | 60 | extern void perf_event_wakeup(struct perf_event *event); |
61 | extern int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, | ||
62 | pgoff_t pgoff, int nr_pages, long watermark, int flags); | ||
63 | extern void rb_free_aux(struct ring_buffer *rb); | ||
64 | extern struct ring_buffer *ring_buffer_get(struct perf_event *event); | ||
65 | extern void ring_buffer_put(struct ring_buffer *rb); | ||
66 | |||
67 | static inline bool rb_has_aux(struct ring_buffer *rb) | ||
68 | { | ||
69 | return !!rb->aux_nr_pages; | ||
70 | } | ||
71 | |||
72 | void perf_event_aux_event(struct perf_event *event, unsigned long head, | ||
73 | unsigned long size, u64 flags); | ||
46 | 74 | ||
47 | extern void | 75 | extern void |
48 | perf_event_header__init_id(struct perf_event_header *header, | 76 | perf_event_header__init_id(struct perf_event_header *header, |
@@ -81,6 +109,11 @@ static inline unsigned long perf_data_size(struct ring_buffer *rb) | |||
81 | return rb->nr_pages << (PAGE_SHIFT + page_order(rb)); | 109 | return rb->nr_pages << (PAGE_SHIFT + page_order(rb)); |
82 | } | 110 | } |
83 | 111 | ||
112 | static inline unsigned long perf_aux_size(struct ring_buffer *rb) | ||
113 | { | ||
114 | return rb->aux_nr_pages << PAGE_SHIFT; | ||
115 | } | ||
116 | |||
84 | #define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ | 117 | #define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ |
85 | static inline unsigned long \ | 118 | static inline unsigned long \ |
86 | func_name(struct perf_output_handle *handle, \ | 119 | func_name(struct perf_output_handle *handle, \ |
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index eadb95ce7aac..232f00f273cb 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c | |||
@@ -243,14 +243,317 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) | |||
243 | spin_lock_init(&rb->event_lock); | 243 | spin_lock_init(&rb->event_lock); |
244 | } | 244 | } |
245 | 245 | ||
246 | /* | ||
247 | * This is called before hardware starts writing to the AUX area to | ||
248 | * obtain an output handle and make sure there's room in the buffer. | ||
249 | * When the capture completes, call perf_aux_output_end() to commit | ||
250 | * the recorded data to the buffer. | ||
251 | * | ||
252 | * The ordering is similar to that of perf_output_{begin,end}, with | ||
253 | * the exception of (B), which should be taken care of by the pmu | ||
254 | * driver, since ordering rules will differ depending on hardware. | ||
255 | */ | ||
256 | void *perf_aux_output_begin(struct perf_output_handle *handle, | ||
257 | struct perf_event *event) | ||
258 | { | ||
259 | struct perf_event *output_event = event; | ||
260 | unsigned long aux_head, aux_tail; | ||
261 | struct ring_buffer *rb; | ||
262 | |||
263 | if (output_event->parent) | ||
264 | output_event = output_event->parent; | ||
265 | |||
266 | /* | ||
267 | * Since this will typically be open across pmu::add/pmu::del, we | ||
268 | * grab ring_buffer's refcount instead of holding rcu read lock | ||
269 | * to make sure it doesn't disappear under us. | ||
270 | */ | ||
271 | rb = ring_buffer_get(output_event); | ||
272 | if (!rb) | ||
273 | return NULL; | ||
274 | |||
275 | if (!rb_has_aux(rb) || !atomic_inc_not_zero(&rb->aux_refcount)) | ||
276 | goto err; | ||
277 | |||
278 | /* | ||
279 | * Nesting is not supported for the AUX area; make sure nested | ||
280 | * writers are caught early | ||
281 | */ | ||
282 | if (WARN_ON_ONCE(local_xchg(&rb->aux_nest, 1))) | ||
283 | goto err_put; | ||
284 | |||
285 | aux_head = local_read(&rb->aux_head); | ||
286 | |||
287 | handle->rb = rb; | ||
288 | handle->event = event; | ||
289 | handle->head = aux_head; | ||
290 | handle->size = 0; | ||
291 | |||
292 | /* | ||
293 | * In overwrite mode, AUX data stores do not depend on aux_tail, | ||
294 | * therefore (A) control dependency barrier does not exist. The | ||
295 | * (B) <-> (C) ordering is still observed by the pmu driver. | ||
296 | */ | ||
297 | if (!rb->aux_overwrite) { | ||
298 | aux_tail = ACCESS_ONCE(rb->user_page->aux_tail); | ||
299 | handle->wakeup = local_read(&rb->aux_wakeup) + rb->aux_watermark; | ||
300 | if (aux_head - aux_tail < perf_aux_size(rb)) | ||
301 | handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb)); | ||
302 | |||
303 | /* | ||
304 | * handle->size computation depends on aux_tail load; this forms a | ||
305 | * control dependency barrier separating aux_tail load from aux data | ||
306 | * store that will be enabled on successful return | ||
307 | */ | ||
308 | if (!handle->size) { /* A, matches D */ | ||
309 | event->pending_disable = 1; | ||
310 | perf_output_wakeup(handle); | ||
311 | local_set(&rb->aux_nest, 0); | ||
312 | goto err_put; | ||
313 | } | ||
314 | } | ||
315 | |||
316 | return handle->rb->aux_priv; | ||
317 | |||
318 | err_put: | ||
319 | rb_free_aux(rb); | ||
320 | |||
321 | err: | ||
322 | ring_buffer_put(rb); | ||
323 | handle->event = NULL; | ||
324 | |||
325 | return NULL; | ||
326 | } | ||
327 | |||
328 | /* | ||
329 | * Commit the data written by hardware into the ring buffer by adjusting | ||
330 | * aux_head and posting a PERF_RECORD_AUX into the perf buffer. It is the | ||
331 | * pmu driver's responsibility to observe ordering rules of the hardware, | ||
332 | * so that all the data is externally visible before this is called. | ||
333 | */ | ||
334 | void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size, | ||
335 | bool truncated) | ||
336 | { | ||
337 | struct ring_buffer *rb = handle->rb; | ||
338 | unsigned long aux_head; | ||
339 | u64 flags = 0; | ||
340 | |||
341 | if (truncated) | ||
342 | flags |= PERF_AUX_FLAG_TRUNCATED; | ||
343 | |||
344 | /* in overwrite mode, driver provides aux_head via handle */ | ||
345 | if (rb->aux_overwrite) { | ||
346 | flags |= PERF_AUX_FLAG_OVERWRITE; | ||
347 | |||
348 | aux_head = handle->head; | ||
349 | local_set(&rb->aux_head, aux_head); | ||
350 | } else { | ||
351 | aux_head = local_read(&rb->aux_head); | ||
352 | local_add(size, &rb->aux_head); | ||
353 | } | ||
354 | |||
355 | if (size || flags) { | ||
356 | /* | ||
357 | * Only send RECORD_AUX if we have something useful to communicate | ||
358 | */ | ||
359 | |||
360 | perf_event_aux_event(handle->event, aux_head, size, flags); | ||
361 | } | ||
362 | |||
363 | aux_head = rb->user_page->aux_head = local_read(&rb->aux_head); | ||
364 | |||
365 | if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) { | ||
366 | perf_output_wakeup(handle); | ||
367 | local_add(rb->aux_watermark, &rb->aux_wakeup); | ||
368 | } | ||
369 | handle->event = NULL; | ||
370 | |||
371 | local_set(&rb->aux_nest, 0); | ||
372 | rb_free_aux(rb); | ||
373 | ring_buffer_put(rb); | ||
374 | } | ||
375 | |||
376 | /* | ||
377 | * Skip over a given number of bytes in the AUX buffer, due to, for example, | ||
378 | * hardware's alignment constraints. | ||
379 | */ | ||
380 | int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size) | ||
381 | { | ||
382 | struct ring_buffer *rb = handle->rb; | ||
383 | unsigned long aux_head; | ||
384 | |||
385 | if (size > handle->size) | ||
386 | return -ENOSPC; | ||
387 | |||
388 | local_add(size, &rb->aux_head); | ||
389 | |||
390 | aux_head = rb->user_page->aux_head = local_read(&rb->aux_head); | ||
391 | if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) { | ||
392 | perf_output_wakeup(handle); | ||
393 | local_add(rb->aux_watermark, &rb->aux_wakeup); | ||
394 | handle->wakeup = local_read(&rb->aux_wakeup) + | ||
395 | rb->aux_watermark; | ||
396 | } | ||
397 | |||
398 | handle->head = aux_head; | ||
399 | handle->size -= size; | ||
400 | |||
401 | return 0; | ||
402 | } | ||
403 | |||
404 | void *perf_get_aux(struct perf_output_handle *handle) | ||
405 | { | ||
406 | /* this is only valid between perf_aux_output_begin and *_end */ | ||
407 | if (!handle->event) | ||
408 | return NULL; | ||
409 | |||
410 | return handle->rb->aux_priv; | ||
411 | } | ||
412 | |||
413 | #define PERF_AUX_GFP (GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY) | ||
414 | |||
415 | static struct page *rb_alloc_aux_page(int node, int order) | ||
416 | { | ||
417 | struct page *page; | ||
418 | |||
419 | if (order > MAX_ORDER) | ||
420 | order = MAX_ORDER; | ||
421 | |||
422 | do { | ||
423 | page = alloc_pages_node(node, PERF_AUX_GFP, order); | ||
424 | } while (!page && order--); | ||
425 | |||
426 | if (page && order) { | ||
427 | /* | ||
428 | * Communicate the allocation size to the driver | ||
429 | */ | ||
430 | split_page(page, order); | ||
431 | SetPagePrivate(page); | ||
432 | set_page_private(page, order); | ||
433 | } | ||
434 | |||
435 | return page; | ||
436 | } | ||
437 | |||
438 | static void rb_free_aux_page(struct ring_buffer *rb, int idx) | ||
439 | { | ||
440 | struct page *page = virt_to_page(rb->aux_pages[idx]); | ||
441 | |||
442 | ClearPagePrivate(page); | ||
443 | page->mapping = NULL; | ||
444 | __free_page(page); | ||
445 | } | ||
446 | |||
447 | int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, | ||
448 | pgoff_t pgoff, int nr_pages, long watermark, int flags) | ||
449 | { | ||
450 | bool overwrite = !(flags & RING_BUFFER_WRITABLE); | ||
451 | int node = (event->cpu == -1) ? -1 : cpu_to_node(event->cpu); | ||
452 | int ret = -ENOMEM, max_order = 0; | ||
453 | |||
454 | if (!has_aux(event)) | ||
455 | return -ENOTSUPP; | ||
456 | |||
457 | if (event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) { | ||
458 | /* | ||
459 | * We need to start with the max_order that fits in nr_pages, | ||
460 | * not the other way around, hence ilog2() and not get_order. | ||
461 | */ | ||
462 | max_order = ilog2(nr_pages); | ||
463 | |||
464 | /* | ||
465 | * PMU requests more than one contiguous chunk of memory | ||
466 | * for SW double buffering | ||
467 | */ | ||
468 | if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_SW_DOUBLEBUF) && | ||
469 | !overwrite) { | ||
470 | if (!max_order) | ||
471 | return -EINVAL; | ||
472 | |||
473 | max_order--; | ||
474 | } | ||
475 | } | ||
476 | |||
477 | rb->aux_pages = kzalloc_node(nr_pages * sizeof(void *), GFP_KERNEL, node); | ||
478 | if (!rb->aux_pages) | ||
479 | return -ENOMEM; | ||
480 | |||
481 | rb->free_aux = event->pmu->free_aux; | ||
482 | for (rb->aux_nr_pages = 0; rb->aux_nr_pages < nr_pages;) { | ||
483 | struct page *page; | ||
484 | int last, order; | ||
485 | |||
486 | order = min(max_order, ilog2(nr_pages - rb->aux_nr_pages)); | ||
487 | page = rb_alloc_aux_page(node, order); | ||
488 | if (!page) | ||
489 | goto out; | ||
490 | |||
491 | for (last = rb->aux_nr_pages + (1 << page_private(page)); | ||
492 | last > rb->aux_nr_pages; rb->aux_nr_pages++) | ||
493 | rb->aux_pages[rb->aux_nr_pages] = page_address(page++); | ||
494 | } | ||
495 | |||
496 | rb->aux_priv = event->pmu->setup_aux(event->cpu, rb->aux_pages, nr_pages, | ||
497 | overwrite); | ||
498 | if (!rb->aux_priv) | ||
499 | goto out; | ||
500 | |||
501 | ret = 0; | ||
502 | |||
503 | /* | ||
504 | * aux_pages (and pmu driver's private data, aux_priv) will be | ||
505 | * referenced in both producer's and consumer's contexts, thus | ||
506 | * we keep a refcount here to make sure either of the two can | ||
507 | * reference them safely. | ||
508 | */ | ||
509 | atomic_set(&rb->aux_refcount, 1); | ||
510 | |||
511 | rb->aux_overwrite = overwrite; | ||
512 | rb->aux_watermark = watermark; | ||
513 | |||
514 | if (!rb->aux_watermark && !rb->aux_overwrite) | ||
515 | rb->aux_watermark = nr_pages << (PAGE_SHIFT - 1); | ||
516 | |||
517 | out: | ||
518 | if (!ret) | ||
519 | rb->aux_pgoff = pgoff; | ||
520 | else | ||
521 | rb_free_aux(rb); | ||
522 | |||
523 | return ret; | ||
524 | } | ||
525 | |||
526 | static void __rb_free_aux(struct ring_buffer *rb) | ||
527 | { | ||
528 | int pg; | ||
529 | |||
530 | if (rb->aux_priv) { | ||
531 | rb->free_aux(rb->aux_priv); | ||
532 | rb->free_aux = NULL; | ||
533 | rb->aux_priv = NULL; | ||
534 | } | ||
535 | |||
536 | for (pg = 0; pg < rb->aux_nr_pages; pg++) | ||
537 | rb_free_aux_page(rb, pg); | ||
538 | |||
539 | kfree(rb->aux_pages); | ||
540 | rb->aux_nr_pages = 0; | ||
541 | } | ||
542 | |||
543 | void rb_free_aux(struct ring_buffer *rb) | ||
544 | { | ||
545 | if (atomic_dec_and_test(&rb->aux_refcount)) | ||
546 | __rb_free_aux(rb); | ||
547 | } | ||
548 | |||
246 | #ifndef CONFIG_PERF_USE_VMALLOC | 549 | #ifndef CONFIG_PERF_USE_VMALLOC |
247 | 550 | ||
248 | /* | 551 | /* |
249 | * Back perf_mmap() with regular GFP_KERNEL-0 pages. | 552 | * Back perf_mmap() with regular GFP_KERNEL-0 pages. |
250 | */ | 553 | */ |
251 | 554 | ||
252 | struct page * | 555 | static struct page * |
253 | perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) | 556 | __perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) |
254 | { | 557 | { |
255 | if (pgoff > rb->nr_pages) | 558 | if (pgoff > rb->nr_pages) |
256 | return NULL; | 559 | return NULL; |
@@ -340,8 +643,8 @@ static int data_page_nr(struct ring_buffer *rb) | |||
340 | return rb->nr_pages << page_order(rb); | 643 | return rb->nr_pages << page_order(rb); |
341 | } | 644 | } |
342 | 645 | ||
343 | struct page * | 646 | static struct page * |
344 | perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) | 647 | __perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) |
345 | { | 648 | { |
346 | /* The '>' counts in the user page. */ | 649 | /* The '>' counts in the user page. */ |
347 | if (pgoff > data_page_nr(rb)) | 650 | if (pgoff > data_page_nr(rb)) |
@@ -416,3 +719,19 @@ fail: | |||
416 | } | 719 | } |
417 | 720 | ||
418 | #endif | 721 | #endif |
722 | |||
723 | struct page * | ||
724 | perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) | ||
725 | { | ||
726 | if (rb->aux_nr_pages) { | ||
727 | /* above AUX space */ | ||
728 | if (pgoff > rb->aux_pgoff + rb->aux_nr_pages) | ||
729 | return NULL; | ||
730 | |||
731 | /* AUX space */ | ||
732 | if (pgoff >= rb->aux_pgoff) | ||
733 | return virt_to_page(rb->aux_pages[pgoff - rb->aux_pgoff]); | ||
734 | } | ||
735 | |||
736 | return __perf_mmap_to_page(rb, pgoff); | ||
737 | } | ||
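Taken together, perf_aux_output_begin()/perf_aux_output_end() define the contract a PMU driver is expected to follow around a hardware trace capture. A minimal sketch under that assumption; every mypmu_* name is invented, only struct perf_output_handle and the perf_aux_output_*() calls come from the code above:

#include <linux/perf_event.h>

/* Everything prefixed mypmu_ is invented for illustration. */
struct mypmu {
	struct perf_output_handle handle;
};

static void mypmu_start(struct mypmu *pmu, struct perf_event *event)
{
	/* Claim the AUX buffer; returns the pmu::setup_aux() private data. */
	void *buf = perf_aux_output_begin(&pmu->handle, event);

	if (!buf)
		return; /* no AUX buffer, no free space, or a nested writer */

	/*
	 * Program the hardware to write at most pmu->handle.size bytes
	 * into the area described by buf before raising an interrupt.
	 */
}

static void mypmu_stop(struct mypmu *pmu, unsigned long bytes, bool truncated)
{
	/*
	 * Commit what the hardware wrote: advances aux_head, posts a
	 * PERF_RECORD_AUX record and wakes the consumer once aux_watermark
	 * worth of new data is available.
	 */
	perf_aux_output_end(&pmu->handle, bytes, truncated);
}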
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index 83d4382f5699..6873bb3e6b7e 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c | |||
@@ -20,145 +20,10 @@ | |||
20 | #include <linux/types.h> | 20 | #include <linux/types.h> |
21 | #include <linux/fs_struct.h> | 21 | #include <linux/fs_struct.h> |
22 | 22 | ||
23 | |||
24 | static void default_handler(int, struct pt_regs *); | ||
25 | |||
26 | static struct exec_domain *exec_domains = &default_exec_domain; | ||
27 | static DEFINE_RWLOCK(exec_domains_lock); | ||
28 | |||
29 | |||
30 | static unsigned long ident_map[32] = { | ||
31 | 0, 1, 2, 3, 4, 5, 6, 7, | ||
32 | 8, 9, 10, 11, 12, 13, 14, 15, | ||
33 | 16, 17, 18, 19, 20, 21, 22, 23, | ||
34 | 24, 25, 26, 27, 28, 29, 30, 31 | ||
35 | }; | ||
36 | |||
37 | struct exec_domain default_exec_domain = { | ||
38 | .name = "Linux", /* name */ | ||
39 | .handler = default_handler, /* lcall7 causes a seg fault. */ | ||
40 | .pers_low = 0, /* PER_LINUX personality. */ | ||
41 | .pers_high = 0, /* PER_LINUX personality. */ | ||
42 | .signal_map = ident_map, /* Identity map signals. */ | ||
43 | .signal_invmap = ident_map, /* - both ways. */ | ||
44 | }; | ||
45 | |||
46 | |||
47 | static void | ||
48 | default_handler(int segment, struct pt_regs *regp) | ||
49 | { | ||
50 | set_personality(0); | ||
51 | |||
52 | if (current_thread_info()->exec_domain->handler != default_handler) | ||
53 | current_thread_info()->exec_domain->handler(segment, regp); | ||
54 | else | ||
55 | send_sig(SIGSEGV, current, 1); | ||
56 | } | ||
57 | |||
58 | static struct exec_domain * | ||
59 | lookup_exec_domain(unsigned int personality) | ||
60 | { | ||
61 | unsigned int pers = personality(personality); | ||
62 | struct exec_domain *ep; | ||
63 | |||
64 | read_lock(&exec_domains_lock); | ||
65 | for (ep = exec_domains; ep; ep = ep->next) { | ||
66 | if (pers >= ep->pers_low && pers <= ep->pers_high) | ||
67 | if (try_module_get(ep->module)) | ||
68 | goto out; | ||
69 | } | ||
70 | |||
71 | #ifdef CONFIG_MODULES | ||
72 | read_unlock(&exec_domains_lock); | ||
73 | request_module("personality-%d", pers); | ||
74 | read_lock(&exec_domains_lock); | ||
75 | |||
76 | for (ep = exec_domains; ep; ep = ep->next) { | ||
77 | if (pers >= ep->pers_low && pers <= ep->pers_high) | ||
78 | if (try_module_get(ep->module)) | ||
79 | goto out; | ||
80 | } | ||
81 | #endif | ||
82 | |||
83 | ep = &default_exec_domain; | ||
84 | out: | ||
85 | read_unlock(&exec_domains_lock); | ||
86 | return ep; | ||
87 | } | ||
88 | |||
89 | int | ||
90 | register_exec_domain(struct exec_domain *ep) | ||
91 | { | ||
92 | struct exec_domain *tmp; | ||
93 | int err = -EBUSY; | ||
94 | |||
95 | if (ep == NULL) | ||
96 | return -EINVAL; | ||
97 | |||
98 | if (ep->next != NULL) | ||
99 | return -EBUSY; | ||
100 | |||
101 | write_lock(&exec_domains_lock); | ||
102 | for (tmp = exec_domains; tmp; tmp = tmp->next) { | ||
103 | if (tmp == ep) | ||
104 | goto out; | ||
105 | } | ||
106 | |||
107 | ep->next = exec_domains; | ||
108 | exec_domains = ep; | ||
109 | err = 0; | ||
110 | |||
111 | out: | ||
112 | write_unlock(&exec_domains_lock); | ||
113 | return err; | ||
114 | } | ||
115 | EXPORT_SYMBOL(register_exec_domain); | ||
116 | |||
117 | int | ||
118 | unregister_exec_domain(struct exec_domain *ep) | ||
119 | { | ||
120 | struct exec_domain **epp; | ||
121 | |||
122 | epp = &exec_domains; | ||
123 | write_lock(&exec_domains_lock); | ||
124 | for (epp = &exec_domains; *epp; epp = &(*epp)->next) { | ||
125 | if (ep == *epp) | ||
126 | goto unregister; | ||
127 | } | ||
128 | write_unlock(&exec_domains_lock); | ||
129 | return -EINVAL; | ||
130 | |||
131 | unregister: | ||
132 | *epp = ep->next; | ||
133 | ep->next = NULL; | ||
134 | write_unlock(&exec_domains_lock); | ||
135 | return 0; | ||
136 | } | ||
137 | EXPORT_SYMBOL(unregister_exec_domain); | ||
138 | |||
139 | int __set_personality(unsigned int personality) | ||
140 | { | ||
141 | struct exec_domain *oep = current_thread_info()->exec_domain; | ||
142 | |||
143 | current_thread_info()->exec_domain = lookup_exec_domain(personality); | ||
144 | current->personality = personality; | ||
145 | module_put(oep->module); | ||
146 | |||
147 | return 0; | ||
148 | } | ||
149 | EXPORT_SYMBOL(__set_personality); | ||
150 | |||
151 | #ifdef CONFIG_PROC_FS | 23 | #ifdef CONFIG_PROC_FS |
152 | static int execdomains_proc_show(struct seq_file *m, void *v) | 24 | static int execdomains_proc_show(struct seq_file *m, void *v) |
153 | { | 25 | { |
154 | struct exec_domain *ep; | 26 | seq_puts(m, "0-0\tLinux \t[kernel]\n"); |
155 | |||
156 | read_lock(&exec_domains_lock); | ||
157 | for (ep = exec_domains; ep; ep = ep->next) | ||
158 | seq_printf(m, "%d-%d\t%-16s\t[%s]\n", | ||
159 | ep->pers_low, ep->pers_high, ep->name, | ||
160 | module_name(ep->module)); | ||
161 | read_unlock(&exec_domains_lock); | ||
162 | return 0; | 27 | return 0; |
163 | } | 28 | } |
164 | 29 | ||
diff --git a/kernel/exit.c b/kernel/exit.c index feff10bbb307..22fcc05dec40 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -756,8 +756,6 @@ void do_exit(long code) | |||
756 | 756 | ||
757 | cgroup_exit(tsk); | 757 | cgroup_exit(tsk); |
758 | 758 | ||
759 | module_put(task_thread_info(tsk)->exec_domain->module); | ||
760 | |||
761 | /* | 759 | /* |
762 | * FIXME: do that only when needed, using sched_exit tracepoint | 760 | * FIXME: do that only when needed, using sched_exit tracepoint |
763 | */ | 761 | */ |
diff --git a/kernel/fork.c b/kernel/fork.c index cf65139615a0..03c1eaaa6ef5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -74,6 +74,7 @@ | |||
74 | #include <linux/uprobes.h> | 74 | #include <linux/uprobes.h> |
75 | #include <linux/aio.h> | 75 | #include <linux/aio.h> |
76 | #include <linux/compiler.h> | 76 | #include <linux/compiler.h> |
77 | #include <linux/sysctl.h> | ||
77 | 78 | ||
78 | #include <asm/pgtable.h> | 79 | #include <asm/pgtable.h> |
79 | #include <asm/pgalloc.h> | 80 | #include <asm/pgalloc.h> |
@@ -88,6 +89,16 @@ | |||
88 | #include <trace/events/task.h> | 89 | #include <trace/events/task.h> |
89 | 90 | ||
90 | /* | 91 | /* |
92 | * Minimum number of threads to boot the kernel | ||
93 | */ | ||
94 | #define MIN_THREADS 20 | ||
95 | |||
96 | /* | ||
97 | * Maximum number of threads | ||
98 | */ | ||
99 | #define MAX_THREADS FUTEX_TID_MASK | ||
100 | |||
101 | /* | ||
91 | * Protected counters by write_lock_irq(&tasklist_lock) | 102 | * Protected counters by write_lock_irq(&tasklist_lock) |
92 | */ | 103 | */ |
93 | unsigned long total_forks; /* Handle normal Linux uptimes. */ | 104 | unsigned long total_forks; /* Handle normal Linux uptimes. */ |
@@ -253,7 +264,30 @@ EXPORT_SYMBOL_GPL(__put_task_struct); | |||
253 | 264 | ||
254 | void __init __weak arch_task_cache_init(void) { } | 265 | void __init __weak arch_task_cache_init(void) { } |
255 | 266 | ||
256 | void __init fork_init(unsigned long mempages) | 267 | /* |
268 | * set_max_threads | ||
269 | */ | ||
270 | static void set_max_threads(unsigned int max_threads_suggested) | ||
271 | { | ||
272 | u64 threads; | ||
273 | |||
274 | /* | ||
275 | * The number of threads shall be limited such that the thread | ||
276 | * structures may only consume a small part of the available memory. | ||
277 | */ | ||
278 | if (fls64(totalram_pages) + fls64(PAGE_SIZE) > 64) | ||
279 | threads = MAX_THREADS; | ||
280 | else | ||
281 | threads = div64_u64((u64) totalram_pages * (u64) PAGE_SIZE, | ||
282 | (u64) THREAD_SIZE * 8UL); | ||
283 | |||
284 | if (threads > max_threads_suggested) | ||
285 | threads = max_threads_suggested; | ||
286 | |||
287 | max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS); | ||
288 | } | ||
289 | |||
290 | void __init fork_init(void) | ||
257 | { | 291 | { |
258 | #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR | 292 | #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR |
259 | #ifndef ARCH_MIN_TASKALIGN | 293 | #ifndef ARCH_MIN_TASKALIGN |
@@ -268,18 +302,7 @@ void __init fork_init(unsigned long mempages) | |||
268 | /* do the arch specific task caches init */ | 302 | /* do the arch specific task caches init */ |
269 | arch_task_cache_init(); | 303 | arch_task_cache_init(); |
270 | 304 | ||
271 | /* | 305 | set_max_threads(MAX_THREADS); |
272 | * The default maximum number of threads is set to a safe | ||
273 | * value: the thread structures can take up at most half | ||
274 | * of memory. | ||
275 | */ | ||
276 | max_threads = mempages / (8 * THREAD_SIZE / PAGE_SIZE); | ||
277 | |||
278 | /* | ||
279 | * we need to allow at least 20 threads to boot a system | ||
280 | */ | ||
281 | if (max_threads < 20) | ||
282 | max_threads = 20; | ||
283 | 306 | ||
284 | init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2; | 307 | init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2; |
285 | init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2; | 308 | init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2; |
@@ -380,6 +403,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
380 | */ | 403 | */ |
381 | down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); | 404 | down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); |
382 | 405 | ||
406 | /* No ordering required: file already has been exposed. */ | ||
407 | RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); | ||
408 | |||
383 | mm->total_vm = oldmm->total_vm; | 409 | mm->total_vm = oldmm->total_vm; |
384 | mm->shared_vm = oldmm->shared_vm; | 410 | mm->shared_vm = oldmm->shared_vm; |
385 | mm->exec_vm = oldmm->exec_vm; | 411 | mm->exec_vm = oldmm->exec_vm; |
@@ -505,7 +531,13 @@ static inline void mm_free_pgd(struct mm_struct *mm) | |||
505 | pgd_free(mm, mm->pgd); | 531 | pgd_free(mm, mm->pgd); |
506 | } | 532 | } |
507 | #else | 533 | #else |
508 | #define dup_mmap(mm, oldmm) (0) | 534 | static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) |
535 | { | ||
536 | down_write(&oldmm->mmap_sem); | ||
537 | RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); | ||
538 | up_write(&oldmm->mmap_sem); | ||
539 | return 0; | ||
540 | } | ||
509 | #define mm_alloc_pgd(mm) (0) | 541 | #define mm_alloc_pgd(mm) (0) |
510 | #define mm_free_pgd(mm) | 542 | #define mm_free_pgd(mm) |
511 | #endif /* CONFIG_MMU */ | 543 | #endif /* CONFIG_MMU */ |
@@ -674,34 +706,53 @@ void mmput(struct mm_struct *mm) | |||
674 | } | 706 | } |
675 | EXPORT_SYMBOL_GPL(mmput); | 707 | EXPORT_SYMBOL_GPL(mmput); |
676 | 708 | ||
709 | /** | ||
710 | * set_mm_exe_file - change a reference to the mm's executable file | ||
711 | * | ||
712 | * This changes mm's executable file (shown as symlink /proc/[pid]/exe). | ||
713 | * | ||
714 | * Main users are mmput() and sys_execve(). Callers prevent concurrent | ||
715 | * invocations: in mmput() nobody alive left, in execve task is single | ||
716 | * threaded. sys_prctl(PR_SET_MM_MAP/EXE_FILE) also needs to set the | ||
717 | * mm->exe_file, but does so without using set_mm_exe_file() in order | ||
718 | * to avoid the need for any locks. | ||
719 | */ | ||
677 | void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) | 720 | void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) |
678 | { | 721 | { |
722 | struct file *old_exe_file; | ||
723 | |||
724 | /* | ||
725 | * It is safe to dereference the exe_file without RCU as | ||
726 | * this function is only called if nobody else can access | ||
727 | * this mm -- see comment above for justification. | ||
728 | */ | ||
729 | old_exe_file = rcu_dereference_raw(mm->exe_file); | ||
730 | |||
679 | if (new_exe_file) | 731 | if (new_exe_file) |
680 | get_file(new_exe_file); | 732 | get_file(new_exe_file); |
681 | if (mm->exe_file) | 733 | rcu_assign_pointer(mm->exe_file, new_exe_file); |
682 | fput(mm->exe_file); | 734 | if (old_exe_file) |
683 | mm->exe_file = new_exe_file; | 735 | fput(old_exe_file); |
684 | } | 736 | } |
685 | 737 | ||
738 | /** | ||
739 | * get_mm_exe_file - acquire a reference to the mm's executable file | ||
740 | * | ||
741 | * Returns %NULL if mm has no associated executable file. | ||
742 | * User must release file via fput(). | ||
743 | */ | ||
686 | struct file *get_mm_exe_file(struct mm_struct *mm) | 744 | struct file *get_mm_exe_file(struct mm_struct *mm) |
687 | { | 745 | { |
688 | struct file *exe_file; | 746 | struct file *exe_file; |
689 | 747 | ||
690 | /* We need mmap_sem to protect against races with removal of exe_file */ | 748 | rcu_read_lock(); |
691 | down_read(&mm->mmap_sem); | 749 | exe_file = rcu_dereference(mm->exe_file); |
692 | exe_file = mm->exe_file; | 750 | if (exe_file && !get_file_rcu(exe_file)) |
693 | if (exe_file) | 751 | exe_file = NULL; |
694 | get_file(exe_file); | 752 | rcu_read_unlock(); |
695 | up_read(&mm->mmap_sem); | ||
696 | return exe_file; | 753 | return exe_file; |
697 | } | 754 | } |
698 | 755 | EXPORT_SYMBOL(get_mm_exe_file); | |
699 | static void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm) | ||
700 | { | ||
701 | /* It's safe to write the exe_file pointer without exe_file_lock because | ||
702 | * this is called during fork when the task is not yet in /proc */ | ||
703 | newmm->exe_file = get_mm_exe_file(oldmm); | ||
704 | } | ||
705 | 756 | ||
706 | /** | 757 | /** |
707 | * get_task_mm - acquire a reference to the task's mm | 758 | * get_task_mm - acquire a reference to the task's mm |
@@ -864,8 +915,6 @@ static struct mm_struct *dup_mm(struct task_struct *tsk) | |||
864 | if (!mm_init(mm, tsk)) | 915 | if (!mm_init(mm, tsk)) |
865 | goto fail_nomem; | 916 | goto fail_nomem; |
866 | 917 | ||
867 | dup_mm_exe_file(oldmm, mm); | ||
868 | |||
869 | err = dup_mmap(mm, oldmm); | 918 | err = dup_mmap(mm, oldmm); |
870 | if (err) | 919 | if (err) |
871 | goto free_pt; | 920 | goto free_pt; |
@@ -1279,9 +1328,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1279 | if (nr_threads >= max_threads) | 1328 | if (nr_threads >= max_threads) |
1280 | goto bad_fork_cleanup_count; | 1329 | goto bad_fork_cleanup_count; |
1281 | 1330 | ||
1282 | if (!try_module_get(task_thread_info(p)->exec_domain->module)) | ||
1283 | goto bad_fork_cleanup_count; | ||
1284 | |||
1285 | delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ | 1331 | delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ |
1286 | p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER); | 1332 | p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER); |
1287 | p->flags |= PF_FORKNOEXEC; | 1333 | p->flags |= PF_FORKNOEXEC; |
@@ -1406,10 +1452,11 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1406 | goto bad_fork_cleanup_io; | 1452 | goto bad_fork_cleanup_io; |
1407 | 1453 | ||
1408 | if (pid != &init_struct_pid) { | 1454 | if (pid != &init_struct_pid) { |
1409 | retval = -ENOMEM; | ||
1410 | pid = alloc_pid(p->nsproxy->pid_ns_for_children); | 1455 | pid = alloc_pid(p->nsproxy->pid_ns_for_children); |
1411 | if (!pid) | 1456 | if (IS_ERR(pid)) { |
1457 | retval = PTR_ERR(pid); | ||
1412 | goto bad_fork_cleanup_io; | 1458 | goto bad_fork_cleanup_io; |
1459 | } | ||
1413 | } | 1460 | } |
1414 | 1461 | ||
1415 | p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; | 1462 | p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; |
@@ -1590,7 +1637,6 @@ bad_fork_cleanup_threadgroup_lock: | |||
1590 | if (clone_flags & CLONE_THREAD) | 1637 | if (clone_flags & CLONE_THREAD) |
1591 | threadgroup_change_end(current); | 1638 | threadgroup_change_end(current); |
1592 | delayacct_tsk_free(p); | 1639 | delayacct_tsk_free(p); |
1593 | module_put(task_thread_info(p)->exec_domain->module); | ||
1594 | bad_fork_cleanup_count: | 1640 | bad_fork_cleanup_count: |
1595 | atomic_dec(&p->cred->user->processes); | 1641 | atomic_dec(&p->cred->user->processes); |
1596 | exit_creds(p); | 1642 | exit_creds(p); |
@@ -2004,3 +2050,26 @@ int unshare_files(struct files_struct **displaced) | |||
2004 | task_unlock(task); | 2050 | task_unlock(task); |
2005 | return 0; | 2051 | return 0; |
2006 | } | 2052 | } |
2053 | |||
2054 | int sysctl_max_threads(struct ctl_table *table, int write, | ||
2055 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
2056 | { | ||
2057 | struct ctl_table t; | ||
2058 | int ret; | ||
2059 | int threads = max_threads; | ||
2060 | int min = MIN_THREADS; | ||
2061 | int max = MAX_THREADS; | ||
2062 | |||
2063 | t = *table; | ||
2064 | t.data = &threads; | ||
2065 | t.extra1 = &min; | ||
2066 | t.extra2 = &max; | ||
2067 | |||
2068 | ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); | ||
2069 | if (ret || !write) | ||
2070 | return ret; | ||
2071 | |||
2072 | set_max_threads(threads); | ||
2073 | |||
2074 | return 0; | ||
2075 | } | ||
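set_max_threads() caps the thread count at totalram_pages * PAGE_SIZE / (THREAD_SIZE * 8), clamped to [MIN_THREADS, MAX_THREADS]; for example, 4 GiB of RAM with a 16 KiB THREAD_SIZE yields 2^32 / 2^17 = 32768 threads. sysctl_max_threads() lets that limit be re-tuned at runtime; the sketch below assumes it is hooked up to the existing kernel.threads-max sysctl, which is not part of this hunk:

#include <stdio.h>

/* Hedged userspace example; the /proc path assumes sysctl_max_threads()
 * is wired to kernel.threads-max. */
int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/threads-max", "r");
	unsigned long threads_max;

	if (!f) {
		perror("threads-max");
		return 1;
	}
	if (fscanf(f, "%lu", &threads_max) != 1) {
		fclose(f);
		return 1;
	}
	fclose(f);

	/* Writes to the same file are clamped to [MIN_THREADS, MAX_THREADS]. */
	printf("threads-max = %lu\n", threads_max);
	return 0;
}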
diff --git a/kernel/futex.c b/kernel/futex.c index 2a5e3830e953..2579e407ff67 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
@@ -900,7 +900,7 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key, | |||
900 | if (!p) | 900 | if (!p) |
901 | return -ESRCH; | 901 | return -ESRCH; |
902 | 902 | ||
903 | if (!p->mm) { | 903 | if (unlikely(p->flags & PF_KTHREAD)) { |
904 | put_task_struct(p); | 904 | put_task_struct(p); |
905 | return -EPERM; | 905 | return -EPERM; |
906 | } | 906 | } |
diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c index b358a802fd18..a744098e4eb7 100644 --- a/kernel/gcov/base.c +++ b/kernel/gcov/base.c | |||
@@ -18,6 +18,7 @@ | |||
18 | #include <linux/init.h> | 18 | #include <linux/init.h> |
19 | #include <linux/module.h> | 19 | #include <linux/module.h> |
20 | #include <linux/mutex.h> | 20 | #include <linux/mutex.h> |
21 | #include <linux/sched.h> | ||
21 | #include "gcov.h" | 22 | #include "gcov.h" |
22 | 23 | ||
23 | static int gcov_events_enabled; | 24 | static int gcov_events_enabled; |
@@ -107,8 +108,10 @@ void gcov_enable_events(void) | |||
107 | gcov_events_enabled = 1; | 108 | gcov_events_enabled = 1; |
108 | 109 | ||
109 | /* Perform event callback for previously registered entries. */ | 110 | /* Perform event callback for previously registered entries. */ |
110 | while ((info = gcov_info_next(info))) | 111 | while ((info = gcov_info_next(info))) { |
111 | gcov_event(GCOV_ADD, info); | 112 | gcov_event(GCOV_ADD, info); |
113 | cond_resched(); | ||
114 | } | ||
112 | 115 | ||
113 | mutex_unlock(&gcov_lock); | 116 | mutex_unlock(&gcov_lock); |
114 | } | 117 | } |
diff --git a/kernel/groups.c b/kernel/groups.c index 664411f171b5..74d431d25251 100644 --- a/kernel/groups.c +++ b/kernel/groups.c | |||
@@ -9,9 +9,6 @@ | |||
9 | #include <linux/user_namespace.h> | 9 | #include <linux/user_namespace.h> |
10 | #include <asm/uaccess.h> | 10 | #include <asm/uaccess.h> |
11 | 11 | ||
12 | /* init to 2 - one for init_task, one to ensure it is never freed */ | ||
13 | struct group_info init_groups = { .usage = ATOMIC_INIT(2) }; | ||
14 | |||
15 | struct group_info *groups_alloc(int gidsetsize) | 12 | struct group_info *groups_alloc(int gidsetsize) |
16 | { | 13 | { |
17 | struct group_info *group_info; | 14 | struct group_info *group_info; |
diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 06db12434d72..e0f90c2b57aa 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c | |||
@@ -169,7 +169,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) | |||
169 | return; | 169 | return; |
170 | 170 | ||
171 | rcu_read_lock(); | 171 | rcu_read_lock(); |
172 | do_each_thread(g, t) { | 172 | for_each_process_thread(g, t) { |
173 | if (!max_count--) | 173 | if (!max_count--) |
174 | goto unlock; | 174 | goto unlock; |
175 | if (!--batch_count) { | 175 | if (!--batch_count) { |
@@ -180,7 +180,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) | |||
180 | /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ | 180 | /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ |
181 | if (t->state == TASK_UNINTERRUPTIBLE) | 181 | if (t->state == TASK_UNINTERRUPTIBLE) |
182 | check_hung_task(t, timeout); | 182 | check_hung_task(t, timeout); |
183 | } while_each_thread(g, t); | 183 | } |
184 | unlock: | 184 | unlock: |
185 | rcu_read_unlock(); | 185 | rcu_read_unlock(); |
186 | } | 186 | } |
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 6f1c7a566b95..eb9a4ea394ab 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c | |||
@@ -948,6 +948,22 @@ int irq_chip_retrigger_hierarchy(struct irq_data *data) | |||
948 | 948 | ||
949 | return -ENOSYS; | 949 | return -ENOSYS; |
950 | } | 950 | } |
951 | |||
952 | /** | ||
953 | * irq_chip_set_wake_parent - Set/reset wake-up on the parent interrupt | ||
954 | * @data: Pointer to interrupt specific data | ||
955 | * @on: Whether to set or reset the wake-up capability of this irq | ||
956 | * | ||
957 | * Conditional, as the underlying parent chip might not implement it. | ||
958 | */ | ||
959 | int irq_chip_set_wake_parent(struct irq_data *data, unsigned int on) | ||
960 | { | ||
961 | data = data->parent_data; | ||
962 | if (data->chip->irq_set_wake) | ||
963 | return data->chip->irq_set_wake(data, on); | ||
964 | |||
965 | return -ENOSYS; | ||
966 | } | ||
951 | #endif | 967 | #endif |
952 | 968 | ||
953 | /** | 969 | /** |
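irq_chip_set_wake_parent() follows the pattern of the other *_parent helpers: a child chip in a hierarchical irq domain that has no wake-up logic of its own simply delegates. A sketch with an invented chip; only the callback names and irq_chip_set_wake_parent() are real kernel API:

#include <linux/irq.h>

/* Invented child chip in a hierarchical irq domain; the callbacks shown
 * all delegate to the parent chip. */
static struct irq_chip my_child_chip = {
	.name		= "my-child",
	.irq_mask	= irq_chip_mask_parent,
	.irq_unmask	= irq_chip_unmask_parent,
	.irq_eoi	= irq_chip_eoi_parent,
	/* No local wake logic: hand the request to the parent domain. */
	.irq_set_wake	= irq_chip_set_wake_parent,
};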
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 886d09e691d5..e68932bb308e 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
@@ -68,14 +68,20 @@ static void __synchronize_hardirq(struct irq_desc *desc) | |||
68 | * Do not use this for shutdown scenarios where you must be sure | 68 | * Do not use this for shutdown scenarios where you must be sure |
69 | * that all parts (hardirq and threaded handler) have completed. | 69 | * that all parts (hardirq and threaded handler) have completed. |
70 | * | 70 | * |
71 | * Returns: false if a threaded handler is active. | ||
72 | * | ||
71 | * This function may be called - with care - from IRQ context. | 73 | * This function may be called - with care - from IRQ context. |
72 | */ | 74 | */ |
73 | void synchronize_hardirq(unsigned int irq) | 75 | bool synchronize_hardirq(unsigned int irq) |
74 | { | 76 | { |
75 | struct irq_desc *desc = irq_to_desc(irq); | 77 | struct irq_desc *desc = irq_to_desc(irq); |
76 | 78 | ||
77 | if (desc) | 79 | if (desc) { |
78 | __synchronize_hardirq(desc); | 80 | __synchronize_hardirq(desc); |
81 | return !atomic_read(&desc->threads_active); | ||
82 | } | ||
83 | |||
84 | return true; | ||
79 | } | 85 | } |
80 | EXPORT_SYMBOL(synchronize_hardirq); | 86 | EXPORT_SYMBOL(synchronize_hardirq); |
81 | 87 | ||
@@ -440,6 +446,32 @@ void disable_irq(unsigned int irq) | |||
440 | } | 446 | } |
441 | EXPORT_SYMBOL(disable_irq); | 447 | EXPORT_SYMBOL(disable_irq); |
442 | 448 | ||
449 | /** | ||
450 | * disable_hardirq - disables an irq and waits for hardirq completion | ||
451 | * @irq: Interrupt to disable | ||
452 | * | ||
453 | * Disable the selected interrupt line. Enables and Disables are | ||
454 | * nested. | ||
455 | * This function waits for any pending hard IRQ handlers for this | ||
456 | * interrupt to complete before returning. If you use this function while | ||
457 | * holding a resource the hard IRQ handler may need you will deadlock. | ||
458 | * | ||
459 | * When used to optimistically disable an interrupt from atomic context | ||
460 | * the return value must be checked. | ||
461 | * | ||
462 | * Returns: false if a threaded handler is active. | ||
463 | * | ||
464 | * This function may be called - with care - from IRQ context. | ||
465 | */ | ||
466 | bool disable_hardirq(unsigned int irq) | ||
467 | { | ||
468 | if (!__disable_irq_nosync(irq)) | ||
469 | return synchronize_hardirq(irq); | ||
470 | |||
471 | return false; | ||
472 | } | ||
473 | EXPORT_SYMBOL_GPL(disable_hardirq); | ||
474 | |||
443 | void __enable_irq(struct irq_desc *desc, unsigned int irq) | 475 | void __enable_irq(struct irq_desc *desc, unsigned int irq) |
444 | { | 476 | { |
445 | switch (desc->depth) { | 477 | switch (desc->depth) { |
@@ -1766,3 +1798,94 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler, | |||
1766 | 1798 | ||
1767 | return retval; | 1799 | return retval; |
1768 | } | 1800 | } |
1801 | |||
1802 | /** | ||
1803 | * irq_get_irqchip_state - returns the irqchip state of an interrupt. | ||
1804 | * @irq: Interrupt line that is forwarded to a VM | ||
1805 | * @which: One of IRQCHIP_STATE_* the caller wants to know about | ||
1806 | * @state: a pointer to a boolean where the state is to be stored | ||
1807 | * | ||
1808 | * This call snapshots the internal irqchip state of an | ||
1809 | * interrupt, returning into @state the bit corresponding to | ||
1810 | * stage @which | ||
1811 | * | ||
1812 | * This function should be called with preemption disabled if the | ||
1813 | * interrupt controller has per-cpu registers. | ||
1814 | */ | ||
1815 | int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which, | ||
1816 | bool *state) | ||
1817 | { | ||
1818 | struct irq_desc *desc; | ||
1819 | struct irq_data *data; | ||
1820 | struct irq_chip *chip; | ||
1821 | unsigned long flags; | ||
1822 | int err = -EINVAL; | ||
1823 | |||
1824 | desc = irq_get_desc_buslock(irq, &flags, 0); | ||
1825 | if (!desc) | ||
1826 | return err; | ||
1827 | |||
1828 | data = irq_desc_get_irq_data(desc); | ||
1829 | |||
1830 | do { | ||
1831 | chip = irq_data_get_irq_chip(data); | ||
1832 | if (chip->irq_get_irqchip_state) | ||
1833 | break; | ||
1834 | #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY | ||
1835 | data = data->parent_data; | ||
1836 | #else | ||
1837 | data = NULL; | ||
1838 | #endif | ||
1839 | } while (data); | ||
1840 | |||
1841 | if (data) | ||
1842 | err = chip->irq_get_irqchip_state(data, which, state); | ||
1843 | |||
1844 | irq_put_desc_busunlock(desc, flags); | ||
1845 | return err; | ||
1846 | } | ||
1847 | |||
1848 | /** | ||
1849 | * irq_set_irqchip_state - set the state of a forwarded interrupt. | ||
1850 | * @irq: Interrupt line that is forwarded to a VM | ||
1851 | * @which: State to be restored (one of IRQCHIP_STATE_*) | ||
1852 | * @val: Value corresponding to @which | ||
1853 | * | ||
1854 | * This call sets the internal irqchip state of an interrupt, | ||
1855 | * depending on the value of @which. | ||
1856 | * | ||
1857 | * This function should be called with preemption disabled if the | ||
1858 | * interrupt controller has per-cpu registers. | ||
1859 | */ | ||
1860 | int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, | ||
1861 | bool val) | ||
1862 | { | ||
1863 | struct irq_desc *desc; | ||
1864 | struct irq_data *data; | ||
1865 | struct irq_chip *chip; | ||
1866 | unsigned long flags; | ||
1867 | int err = -EINVAL; | ||
1868 | |||
1869 | desc = irq_get_desc_buslock(irq, &flags, 0); | ||
1870 | if (!desc) | ||
1871 | return err; | ||
1872 | |||
1873 | data = irq_desc_get_irq_data(desc); | ||
1874 | |||
1875 | do { | ||
1876 | chip = irq_data_get_irq_chip(data); | ||
1877 | if (chip->irq_set_irqchip_state) | ||
1878 | break; | ||
1879 | #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY | ||
1880 | data = data->parent_data; | ||
1881 | #else | ||
1882 | data = NULL; | ||
1883 | #endif | ||
1884 | } while (data); | ||
1885 | |||
1886 | if (data) | ||
1887 | err = chip->irq_set_irqchip_state(data, which, val); | ||
1888 | |||
1889 | irq_put_desc_busunlock(desc, flags); | ||
1890 | return err; | ||
1891 | } | ||
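Two usage patterns follow from the additions above: disable_hardirq() is only trustworthy from atomic context if its return value is checked, and irq_{get,set}_irqchip_state() exist for code (typically a hypervisor) that forwards an interrupt line into a guest. A hedged sketch; the my_* helpers are placeholders, only disable_hardirq(), enable_irq() and irq_set_irqchip_state() are kernel API:

#include <linux/interrupt.h>
#include <linux/irq.h>

static void my_quiesce_from_atomic(unsigned int irq)
{
	/*
	 * disable_hardirq() returns false while a threaded handler is
	 * still running; in that case the line must not be treated as
	 * quiescent.
	 */
	if (!disable_hardirq(irq)) {
		enable_irq(irq);
		return;
	}

	/* ... touch hardware knowing no hard IRQ handler is in flight ... */

	enable_irq(irq);
}

static int my_restore_pending(unsigned int irq, bool pending)
{
	/* Restore the pending latch of a line forwarded to a guest. */
	return irq_set_irqchip_state(irq, IRQCHIP_STATE_PENDING, pending);
}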
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 3e18163f336f..474de5cb394d 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c | |||
@@ -310,8 +310,15 @@ void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) | |||
310 | struct msi_desc *desc; | 310 | struct msi_desc *desc; |
311 | 311 | ||
312 | for_each_msi_entry(desc, dev) { | 312 | for_each_msi_entry(desc, dev) { |
313 | irq_domain_free_irqs(desc->irq, desc->nvec_used); | 313 | /* |
314 | desc->irq = 0; | 314 | * We might have failed to allocate an MSI early |
315 | * enough that there is no IRQ associated to this | ||
316 | * entry. If that's the case, don't do anything. | ||
317 | */ | ||
318 | if (desc->irq) { | ||
319 | irq_domain_free_irqs(desc->irq, desc->nvec_used); | ||
320 | desc->irq = 0; | ||
321 | } | ||
315 | } | 322 | } |
316 | } | 323 | } |
317 | 324 | ||
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 3f9f1d6b4c2e..284e2691e380 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c | |||
@@ -335,32 +335,20 @@ unlock: | |||
335 | rcu_read_unlock(); | 335 | rcu_read_unlock(); |
336 | } | 336 | } |
337 | 337 | ||
338 | static int klp_disable_func(struct klp_func *func) | 338 | static void klp_disable_func(struct klp_func *func) |
339 | { | 339 | { |
340 | struct klp_ops *ops; | 340 | struct klp_ops *ops; |
341 | int ret; | ||
342 | |||
343 | if (WARN_ON(func->state != KLP_ENABLED)) | ||
344 | return -EINVAL; | ||
345 | 341 | ||
346 | if (WARN_ON(!func->old_addr)) | 342 | WARN_ON(func->state != KLP_ENABLED); |
347 | return -EINVAL; | 343 | WARN_ON(!func->old_addr); |
348 | 344 | ||
349 | ops = klp_find_ops(func->old_addr); | 345 | ops = klp_find_ops(func->old_addr); |
350 | if (WARN_ON(!ops)) | 346 | if (WARN_ON(!ops)) |
351 | return -EINVAL; | 347 | return; |
352 | 348 | ||
353 | if (list_is_singular(&ops->func_stack)) { | 349 | if (list_is_singular(&ops->func_stack)) { |
354 | ret = unregister_ftrace_function(&ops->fops); | 350 | WARN_ON(unregister_ftrace_function(&ops->fops)); |
355 | if (ret) { | 351 | WARN_ON(ftrace_set_filter_ip(&ops->fops, func->old_addr, 1, 0)); |
356 | pr_err("failed to unregister ftrace handler for function '%s' (%d)\n", | ||
357 | func->old_name, ret); | ||
358 | return ret; | ||
359 | } | ||
360 | |||
361 | ret = ftrace_set_filter_ip(&ops->fops, func->old_addr, 1, 0); | ||
362 | if (ret) | ||
363 | pr_warn("function unregister succeeded but failed to clear the filter\n"); | ||
364 | 352 | ||
365 | list_del_rcu(&func->stack_node); | 353 | list_del_rcu(&func->stack_node); |
366 | list_del(&ops->node); | 354 | list_del(&ops->node); |
@@ -370,8 +358,6 @@ static int klp_disable_func(struct klp_func *func) | |||
370 | } | 358 | } |
371 | 359 | ||
372 | func->state = KLP_DISABLED; | 360 | func->state = KLP_DISABLED; |
373 | |||
374 | return 0; | ||
375 | } | 361 | } |
376 | 362 | ||
377 | static int klp_enable_func(struct klp_func *func) | 363 | static int klp_enable_func(struct klp_func *func) |
@@ -432,23 +418,15 @@ err: | |||
432 | return ret; | 418 | return ret; |
433 | } | 419 | } |
434 | 420 | ||
435 | static int klp_disable_object(struct klp_object *obj) | 421 | static void klp_disable_object(struct klp_object *obj) |
436 | { | 422 | { |
437 | struct klp_func *func; | 423 | struct klp_func *func; |
438 | int ret; | ||
439 | 424 | ||
440 | for (func = obj->funcs; func->old_name; func++) { | 425 | for (func = obj->funcs; func->old_name; func++) |
441 | if (func->state != KLP_ENABLED) | 426 | if (func->state == KLP_ENABLED) |
442 | continue; | 427 | klp_disable_func(func); |
443 | |||
444 | ret = klp_disable_func(func); | ||
445 | if (ret) | ||
446 | return ret; | ||
447 | } | ||
448 | 428 | ||
449 | obj->state = KLP_DISABLED; | 429 | obj->state = KLP_DISABLED; |
450 | |||
451 | return 0; | ||
452 | } | 430 | } |
453 | 431 | ||
454 | static int klp_enable_object(struct klp_object *obj) | 432 | static int klp_enable_object(struct klp_object *obj) |
@@ -464,22 +442,19 @@ static int klp_enable_object(struct klp_object *obj) | |||
464 | 442 | ||
465 | for (func = obj->funcs; func->old_name; func++) { | 443 | for (func = obj->funcs; func->old_name; func++) { |
466 | ret = klp_enable_func(func); | 444 | ret = klp_enable_func(func); |
467 | if (ret) | 445 | if (ret) { |
468 | goto unregister; | 446 | klp_disable_object(obj); |
447 | return ret; | ||
448 | } | ||
469 | } | 449 | } |
470 | obj->state = KLP_ENABLED; | 450 | obj->state = KLP_ENABLED; |
471 | 451 | ||
472 | return 0; | 452 | return 0; |
473 | |||
474 | unregister: | ||
475 | WARN_ON(klp_disable_object(obj)); | ||
476 | return ret; | ||
477 | } | 453 | } |
478 | 454 | ||
479 | static int __klp_disable_patch(struct klp_patch *patch) | 455 | static int __klp_disable_patch(struct klp_patch *patch) |
480 | { | 456 | { |
481 | struct klp_object *obj; | 457 | struct klp_object *obj; |
482 | int ret; | ||
483 | 458 | ||
484 | /* enforce stacking: only the last enabled patch can be disabled */ | 459 | /* enforce stacking: only the last enabled patch can be disabled */ |
485 | if (!list_is_last(&patch->list, &klp_patches) && | 460 | if (!list_is_last(&patch->list, &klp_patches) && |
@@ -489,12 +464,8 @@ static int __klp_disable_patch(struct klp_patch *patch) | |||
489 | pr_notice("disabling patch '%s'\n", patch->mod->name); | 464 | pr_notice("disabling patch '%s'\n", patch->mod->name); |
490 | 465 | ||
491 | for (obj = patch->objs; obj->funcs; obj++) { | 466 | for (obj = patch->objs; obj->funcs; obj++) { |
492 | if (obj->state != KLP_ENABLED) | 467 | if (obj->state == KLP_ENABLED) |
493 | continue; | 468 | klp_disable_object(obj); |
494 | |||
495 | ret = klp_disable_object(obj); | ||
496 | if (ret) | ||
497 | return ret; | ||
498 | } | 469 | } |
499 | 470 | ||
500 | patch->state = KLP_DISABLED; | 471 | patch->state = KLP_DISABLED; |
@@ -553,8 +524,6 @@ static int __klp_enable_patch(struct klp_patch *patch) | |||
553 | pr_notice("enabling patch '%s'\n", patch->mod->name); | 524 | pr_notice("enabling patch '%s'\n", patch->mod->name); |
554 | 525 | ||
555 | for (obj = patch->objs; obj->funcs; obj++) { | 526 | for (obj = patch->objs; obj->funcs; obj++) { |
556 | klp_find_object_module(obj); | ||
557 | |||
558 | if (!klp_is_object_loaded(obj)) | 527 | if (!klp_is_object_loaded(obj)) |
559 | continue; | 528 | continue; |
560 | 529 | ||
@@ -945,7 +914,6 @@ static void klp_module_notify_going(struct klp_patch *patch, | |||
945 | { | 914 | { |
946 | struct module *pmod = patch->mod; | 915 | struct module *pmod = patch->mod; |
947 | struct module *mod = obj->mod; | 916 | struct module *mod = obj->mod; |
948 | int ret; | ||
949 | 917 | ||
950 | if (patch->state == KLP_DISABLED) | 918 | if (patch->state == KLP_DISABLED) |
951 | goto disabled; | 919 | goto disabled; |
@@ -953,10 +921,7 @@ static void klp_module_notify_going(struct klp_patch *patch, | |||
953 | pr_notice("reverting patch '%s' on unloading module '%s'\n", | 921 | pr_notice("reverting patch '%s' on unloading module '%s'\n", |
954 | pmod->name, mod->name); | 922 | pmod->name, mod->name); |
955 | 923 | ||
956 | ret = klp_disable_object(obj); | 924 | klp_disable_object(obj); |
957 | if (ret) | ||
958 | pr_warn("failed to revert patch '%s' on module '%s' (%d)\n", | ||
959 | pmod->name, mod->name, ret); | ||
960 | 925 | ||
961 | disabled: | 926 | disabled: |
962 | klp_free_object_loaded(obj); | 927 | klp_free_object_loaded(obj); |
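klp_disable_func()/klp_disable_object() are now void because the disable path is not expected to fail: problems are reported with WARN_ON() instead of being propagated, and only __klp_disable_patch() still returns an error, for the stacking check. From a patch module's point of view nothing changes; a sketch of such a consumer, with an invented replacement function, follows:

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/seq_file.h>
#include <linux/livepatch.h>

/* my_cmdline_proc_show() is an invented replacement function; the klp_*
 * structures and calls are the API served by kernel/livepatch/core.c. */
static int my_cmdline_proc_show(struct seq_file *m, void *v)
{
	seq_puts(m, "patched\n");
	return 0;
}

static struct klp_func funcs[] = {
	{ .old_name = "cmdline_proc_show", .new_func = my_cmdline_proc_show, },
	{ }
};

static struct klp_object objs[] = {
	{ /* name == NULL patches vmlinux */ .funcs = funcs, },
	{ }
};

static struct klp_patch patch = {
	.mod = THIS_MODULE,
	.objs = objs,
};

static int __init my_patch_init(void)
{
	int ret = klp_register_patch(&patch);

	if (ret)
		return ret;
	ret = klp_enable_patch(&patch);
	if (ret)
		klp_unregister_patch(&patch);
	return ret;
}

static void __exit my_patch_exit(void)
{
	WARN_ON(klp_disable_patch(&patch));
	klp_unregister_patch(&patch);
}

module_init(my_patch_init);
module_exit(my_patch_exit);
MODULE_LICENSE("GPL");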
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 88d0d4420ad2..a0831e1b99f4 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c | |||
@@ -551,7 +551,21 @@ static void print_lockdep_cache(struct lockdep_map *lock) | |||
551 | 551 | ||
552 | static void print_lock(struct held_lock *hlock) | 552 | static void print_lock(struct held_lock *hlock) |
553 | { | 553 | { |
554 | print_lock_name(hlock_class(hlock)); | 554 | /* |
555 | * We can be called locklessly through debug_show_all_locks() so be | ||
556 | * extra careful, the hlock might have been released and cleared. | ||
557 | */ | ||
558 | unsigned int class_idx = hlock->class_idx; | ||
559 | |||
560 | /* Don't re-read hlock->class_idx, can't use READ_ONCE() on bitfields: */ | ||
561 | barrier(); | ||
562 | |||
563 | if (!class_idx || (class_idx - 1) >= MAX_LOCKDEP_KEYS) { | ||
564 | printk("<RELEASED>\n"); | ||
565 | return; | ||
566 | } | ||
567 | |||
568 | print_lock_name(lock_classes + class_idx - 1); | ||
555 | printk(", at: "); | 569 | printk(", at: "); |
556 | print_ip_sym(hlock->acquire_ip); | 570 | print_ip_sym(hlock->acquire_ip); |
557 | } | 571 | } |
@@ -633,7 +647,7 @@ static int count_matching_names(struct lock_class *new_class) | |||
633 | if (!new_class->name) | 647 | if (!new_class->name) |
634 | return 0; | 648 | return 0; |
635 | 649 | ||
636 | list_for_each_entry(class, &all_lock_classes, lock_entry) { | 650 | list_for_each_entry_rcu(class, &all_lock_classes, lock_entry) { |
637 | if (new_class->key - new_class->subclass == class->key) | 651 | if (new_class->key - new_class->subclass == class->key) |
638 | return class->name_version; | 652 | return class->name_version; |
639 | if (class->name && !strcmp(class->name, new_class->name)) | 653 | if (class->name && !strcmp(class->name, new_class->name)) |
@@ -700,10 +714,12 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) | |||
700 | hash_head = classhashentry(key); | 714 | hash_head = classhashentry(key); |
701 | 715 | ||
702 | /* | 716 | /* |
703 | * We can walk the hash lockfree, because the hash only | 717 | * We do an RCU walk of the hash, see lockdep_free_key_range(). |
704 | * grows, and we are careful when adding entries to the end: | ||
705 | */ | 718 | */ |
706 | list_for_each_entry(class, hash_head, hash_entry) { | 719 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) |
720 | return NULL; | ||
721 | |||
722 | list_for_each_entry_rcu(class, hash_head, hash_entry) { | ||
707 | if (class->key == key) { | 723 | if (class->key == key) { |
708 | /* | 724 | /* |
709 | * Huh! same key, different name? Did someone trample | 725 | * Huh! same key, different name? Did someone trample |
@@ -728,7 +744,8 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) | |||
728 | struct lockdep_subclass_key *key; | 744 | struct lockdep_subclass_key *key; |
729 | struct list_head *hash_head; | 745 | struct list_head *hash_head; |
730 | struct lock_class *class; | 746 | struct lock_class *class; |
731 | unsigned long flags; | 747 | |
748 | DEBUG_LOCKS_WARN_ON(!irqs_disabled()); | ||
732 | 749 | ||
733 | class = look_up_lock_class(lock, subclass); | 750 | class = look_up_lock_class(lock, subclass); |
734 | if (likely(class)) | 751 | if (likely(class)) |
@@ -750,28 +767,26 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) | |||
750 | key = lock->key->subkeys + subclass; | 767 | key = lock->key->subkeys + subclass; |
751 | hash_head = classhashentry(key); | 768 | hash_head = classhashentry(key); |
752 | 769 | ||
753 | raw_local_irq_save(flags); | ||
754 | if (!graph_lock()) { | 770 | if (!graph_lock()) { |
755 | raw_local_irq_restore(flags); | ||
756 | return NULL; | 771 | return NULL; |
757 | } | 772 | } |
758 | /* | 773 | /* |
759 | * We have to do the hash-walk again, to avoid races | 774 | * We have to do the hash-walk again, to avoid races |
760 | * with another CPU: | 775 | * with another CPU: |
761 | */ | 776 | */ |
762 | list_for_each_entry(class, hash_head, hash_entry) | 777 | list_for_each_entry_rcu(class, hash_head, hash_entry) { |
763 | if (class->key == key) | 778 | if (class->key == key) |
764 | goto out_unlock_set; | 779 | goto out_unlock_set; |
780 | } | ||
781 | |||
765 | /* | 782 | /* |
766 | * Allocate a new key from the static array, and add it to | 783 | * Allocate a new key from the static array, and add it to |
767 | * the hash: | 784 | * the hash: |
768 | */ | 785 | */ |
769 | if (nr_lock_classes >= MAX_LOCKDEP_KEYS) { | 786 | if (nr_lock_classes >= MAX_LOCKDEP_KEYS) { |
770 | if (!debug_locks_off_graph_unlock()) { | 787 | if (!debug_locks_off_graph_unlock()) { |
771 | raw_local_irq_restore(flags); | ||
772 | return NULL; | 788 | return NULL; |
773 | } | 789 | } |
774 | raw_local_irq_restore(flags); | ||
775 | 790 | ||
776 | print_lockdep_off("BUG: MAX_LOCKDEP_KEYS too low!"); | 791 | print_lockdep_off("BUG: MAX_LOCKDEP_KEYS too low!"); |
777 | dump_stack(); | 792 | dump_stack(); |
@@ -798,7 +813,6 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) | |||
798 | 813 | ||
799 | if (verbose(class)) { | 814 | if (verbose(class)) { |
800 | graph_unlock(); | 815 | graph_unlock(); |
801 | raw_local_irq_restore(flags); | ||
802 | 816 | ||
803 | printk("\nnew class %p: %s", class->key, class->name); | 817 | printk("\nnew class %p: %s", class->key, class->name); |
804 | if (class->name_version > 1) | 818 | if (class->name_version > 1) |
@@ -806,15 +820,12 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) | |||
806 | printk("\n"); | 820 | printk("\n"); |
807 | dump_stack(); | 821 | dump_stack(); |
808 | 822 | ||
809 | raw_local_irq_save(flags); | ||
810 | if (!graph_lock()) { | 823 | if (!graph_lock()) { |
811 | raw_local_irq_restore(flags); | ||
812 | return NULL; | 824 | return NULL; |
813 | } | 825 | } |
814 | } | 826 | } |
815 | out_unlock_set: | 827 | out_unlock_set: |
816 | graph_unlock(); | 828 | graph_unlock(); |
817 | raw_local_irq_restore(flags); | ||
818 | 829 | ||
819 | out_set_class_cache: | 830 | out_set_class_cache: |
820 | if (!subclass || force) | 831 | if (!subclass || force) |
@@ -870,11 +881,9 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this, | |||
870 | entry->distance = distance; | 881 | entry->distance = distance; |
871 | entry->trace = *trace; | 882 | entry->trace = *trace; |
872 | /* | 883 | /* |
873 | * Since we never remove from the dependency list, the list can | 884 | * Both allocation and removal are done under the graph lock; but |
874 | * be walked lockless by other CPUs, it's only allocation | 885 | * iteration is under RCU-sched; see look_up_lock_class() and |
875 | * that must be protected by the spinlock. But this also means | 886 | * lockdep_free_key_range(). |
876 | * we must make new entries visible only once writes to the | ||
877 | * entry become visible - hence the RCU op: | ||
878 | */ | 887 | */ |
879 | list_add_tail_rcu(&entry->entry, head); | 888 | list_add_tail_rcu(&entry->entry, head); |
880 | 889 | ||
@@ -1025,7 +1034,9 @@ static int __bfs(struct lock_list *source_entry, | |||
1025 | else | 1034 | else |
1026 | head = &lock->class->locks_before; | 1035 | head = &lock->class->locks_before; |
1027 | 1036 | ||
1028 | list_for_each_entry(entry, head, entry) { | 1037 | DEBUG_LOCKS_WARN_ON(!irqs_disabled()); |
1038 | |||
1039 | list_for_each_entry_rcu(entry, head, entry) { | ||
1029 | if (!lock_accessed(entry)) { | 1040 | if (!lock_accessed(entry)) { |
1030 | unsigned int cq_depth; | 1041 | unsigned int cq_depth; |
1031 | mark_lock_accessed(entry, lock); | 1042 | mark_lock_accessed(entry, lock); |
@@ -2022,7 +2033,7 @@ static inline int lookup_chain_cache(struct task_struct *curr, | |||
2022 | * We can walk it lock-free, because entries only get added | 2033 | * We can walk it lock-free, because entries only get added |
2023 | * to the hash: | 2034 | * to the hash: |
2024 | */ | 2035 | */ |
2025 | list_for_each_entry(chain, hash_head, entry) { | 2036 | list_for_each_entry_rcu(chain, hash_head, entry) { |
2026 | if (chain->chain_key == chain_key) { | 2037 | if (chain->chain_key == chain_key) { |
2027 | cache_hit: | 2038 | cache_hit: |
2028 | debug_atomic_inc(chain_lookup_hits); | 2039 | debug_atomic_inc(chain_lookup_hits); |
@@ -2996,8 +3007,18 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, | |||
2996 | if (unlikely(!debug_locks)) | 3007 | if (unlikely(!debug_locks)) |
2997 | return; | 3008 | return; |
2998 | 3009 | ||
2999 | if (subclass) | 3010 | if (subclass) { |
3011 | unsigned long flags; | ||
3012 | |||
3013 | if (DEBUG_LOCKS_WARN_ON(current->lockdep_recursion)) | ||
3014 | return; | ||
3015 | |||
3016 | raw_local_irq_save(flags); | ||
3017 | current->lockdep_recursion = 1; | ||
3000 | register_lock_class(lock, subclass, 1); | 3018 | register_lock_class(lock, subclass, 1); |
3019 | current->lockdep_recursion = 0; | ||
3020 | raw_local_irq_restore(flags); | ||
3021 | } | ||
3001 | } | 3022 | } |
3002 | EXPORT_SYMBOL_GPL(lockdep_init_map); | 3023 | EXPORT_SYMBOL_GPL(lockdep_init_map); |
3003 | 3024 | ||
@@ -3887,9 +3908,17 @@ static inline int within(const void *addr, void *start, unsigned long size) | |||
3887 | return addr >= start && addr < start + size; | 3908 | return addr >= start && addr < start + size; |
3888 | } | 3909 | } |
3889 | 3910 | ||
3911 | /* | ||
3912 | * Used in module.c to remove lock classes from memory that is going to be | ||
3913 | * freed; and possibly re-used by other modules. | ||
3914 | * | ||
3915 | * We will have had one sync_sched() before getting here, so we're guaranteed | ||
3916 | * nobody will look up these exact classes -- they're properly dead but still | ||
3917 | * allocated. | ||
3918 | */ | ||
3890 | void lockdep_free_key_range(void *start, unsigned long size) | 3919 | void lockdep_free_key_range(void *start, unsigned long size) |
3891 | { | 3920 | { |
3892 | struct lock_class *class, *next; | 3921 | struct lock_class *class; |
3893 | struct list_head *head; | 3922 | struct list_head *head; |
3894 | unsigned long flags; | 3923 | unsigned long flags; |
3895 | int i; | 3924 | int i; |
@@ -3905,7 +3934,7 @@ void lockdep_free_key_range(void *start, unsigned long size) | |||
3905 | head = classhash_table + i; | 3934 | head = classhash_table + i; |
3906 | if (list_empty(head)) | 3935 | if (list_empty(head)) |
3907 | continue; | 3936 | continue; |
3908 | list_for_each_entry_safe(class, next, head, hash_entry) { | 3937 | list_for_each_entry_rcu(class, head, hash_entry) { |
3909 | if (within(class->key, start, size)) | 3938 | if (within(class->key, start, size)) |
3910 | zap_class(class); | 3939 | zap_class(class); |
3911 | else if (within(class->name, start, size)) | 3940 | else if (within(class->name, start, size)) |
@@ -3916,11 +3945,25 @@ void lockdep_free_key_range(void *start, unsigned long size) | |||
3916 | if (locked) | 3945 | if (locked) |
3917 | graph_unlock(); | 3946 | graph_unlock(); |
3918 | raw_local_irq_restore(flags); | 3947 | raw_local_irq_restore(flags); |
3948 | |||
3949 | /* | ||
3950 | * Wait for any possible iterators from look_up_lock_class() to pass | ||
3951 | * before continuing to free the memory they refer to. | ||
3952 | * | ||
3953 | * sync_sched() is sufficient because the read-side runs with IRQs disabled. | ||
3954 | */ | ||
3955 | synchronize_sched(); | ||
3956 | |||
3957 | /* | ||
3958 | * XXX at this point we could return the resources to the pool; | ||
3959 | * instead we leak them. We would need to change to bitmap allocators | ||
3960 | * instead of the linear allocators we have now. | ||
3961 | */ | ||
3919 | } | 3962 | } |
3920 | 3963 | ||
3921 | void lockdep_reset_lock(struct lockdep_map *lock) | 3964 | void lockdep_reset_lock(struct lockdep_map *lock) |
3922 | { | 3965 | { |
3923 | struct lock_class *class, *next; | 3966 | struct lock_class *class; |
3924 | struct list_head *head; | 3967 | struct list_head *head; |
3925 | unsigned long flags; | 3968 | unsigned long flags; |
3926 | int i, j; | 3969 | int i, j; |
@@ -3948,7 +3991,7 @@ void lockdep_reset_lock(struct lockdep_map *lock) | |||
3948 | head = classhash_table + i; | 3991 | head = classhash_table + i; |
3949 | if (list_empty(head)) | 3992 | if (list_empty(head)) |
3950 | continue; | 3993 | continue; |
3951 | list_for_each_entry_safe(class, next, head, hash_entry) { | 3994 | list_for_each_entry_rcu(class, head, hash_entry) { |
3952 | int match = 0; | 3995 | int match = 0; |
3953 | 3996 | ||
3954 | for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++) | 3997 | for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++) |
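
The lockdep hunks above switch the class-hash and dependency-list walks to list_for_each_entry_rcu() with IRQs disabled as the read side, and lockdep_free_key_range() now waits in synchronize_sched() before the backing memory may be reused. The print_lock() hunk is a separate hardening: it copies hlock->class_idx into a local (READ_ONCE() cannot be applied to a bitfield, hence the barrier()), validates the snapshot, and only ever uses the local copy, so a concurrent release prints "<RELEASED>" instead of dereferencing out of bounds. Below is a minimal userspace sketch of that snapshot-then-validate idea; it uses a C11 atomic in place of the bitfield-plus-barrier() trick, and the names (held_lock_model, class_names, print_lock_model) are invented for the illustration.

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>
    #include <unistd.h>

    #define MAX_KEYS 4
    static const char *class_names[MAX_KEYS] = { "lockA", "lockB", "lockC", "lockD" };

    /* 0 means released; 1..MAX_KEYS index class_names[idx - 1]. */
    struct held_lock_model { _Atomic unsigned int class_idx; };

    static void print_lock_model(struct held_lock_model *hlock)
    {
    	/* Snapshot once; the holder may clear it concurrently. */
    	unsigned int idx = atomic_load(&hlock->class_idx);

    	if (!idx || idx - 1 >= MAX_KEYS) {
    		puts("<RELEASED>");
    		return;
    	}
    	/* Only the validated local copy is used from here on. */
    	puts(class_names[idx - 1]);
    }

    static struct held_lock_model hl = { .class_idx = 2 };

    static void *releaser(void *arg)
    {
    	(void)arg;
    	usleep(200);
    	atomic_store(&hl.class_idx, 0);	/* lock released, index cleared */
    	return NULL;
    }

    int main(void)
    {
    	pthread_t t;

    	pthread_create(&t, NULL, releaser, NULL);
    	for (int i = 0; i < 5; i++) {
    		print_lock_model(&hl);
    		usleep(100);
    	}
    	pthread_join(t, NULL);
    	return 0;
    }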
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index d1fe2ba5bac9..75e114bdf3f2 100644 --- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h | |||
@@ -78,7 +78,7 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node) | |||
78 | */ | 78 | */ |
79 | return; | 79 | return; |
80 | } | 80 | } |
81 | ACCESS_ONCE(prev->next) = node; | 81 | WRITE_ONCE(prev->next, node); |
82 | 82 | ||
83 | /* Wait until the lock holder passes the lock down. */ | 83 | /* Wait until the lock holder passes the lock down. */ |
84 | arch_mcs_spin_lock_contended(&node->locked); | 84 | arch_mcs_spin_lock_contended(&node->locked); |
@@ -91,7 +91,7 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node) | |||
91 | static inline | 91 | static inline |
92 | void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) | 92 | void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) |
93 | { | 93 | { |
94 | struct mcs_spinlock *next = ACCESS_ONCE(node->next); | 94 | struct mcs_spinlock *next = READ_ONCE(node->next); |
95 | 95 | ||
96 | if (likely(!next)) { | 96 | if (likely(!next)) { |
97 | /* | 97 | /* |
@@ -100,7 +100,7 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) | |||
100 | if (likely(cmpxchg(lock, node, NULL) == node)) | 100 | if (likely(cmpxchg(lock, node, NULL) == node)) |
101 | return; | 101 | return; |
102 | /* Wait until the next pointer is set */ | 102 | /* Wait until the next pointer is set */ |
103 | while (!(next = ACCESS_ONCE(node->next))) | 103 | while (!(next = READ_ONCE(node->next))) |
104 | cpu_relax_lowlatency(); | 104 | cpu_relax_lowlatency(); |
105 | } | 105 | } |
106 | 106 | ||
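
The ACCESS_ONCE(x) lvalue macro gives way to the directional READ_ONCE()/WRITE_ONCE() pair here, and the same substitution repeats in the mutex.c, osq_lock.c and rwsem-xadd.c hunks below. A rough userspace approximation of what the pair provides — exactly one load or store per use, never cached or elided by the compiler — assuming a volatile access is enough for scalar types (the kernel's real macros also cope with other sizes and give no extra memory ordering); everything apart from the two macro names is invented for the demo.

    /* Userspace approximation; not the kernel's actual implementation. */
    #define READ_ONCE(x)	 (*(const volatile __typeof__(x) *)&(x))
    #define WRITE_ONCE(x, v) (*(volatile __typeof__(x) *)&(x) = (v))

    #include <pthread.h>
    #include <stdio.h>

    static int flag;

    static void *waiter(void *arg)
    {
    	(void)arg;
    	while (!READ_ONCE(flag))	/* forced to reload on every pass */
    		;
    	puts("flag observed");
    	return NULL;
    }

    int main(void)
    {
    	pthread_t t;

    	pthread_create(&t, NULL, waiter, NULL);
    	WRITE_ONCE(flag, 1);		/* store cannot be optimized away */
    	pthread_join(t, NULL);
    	return 0;
    }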
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 94674e5919cb..4cccea6b8934 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c | |||
@@ -25,7 +25,7 @@ | |||
25 | #include <linux/spinlock.h> | 25 | #include <linux/spinlock.h> |
26 | #include <linux/interrupt.h> | 26 | #include <linux/interrupt.h> |
27 | #include <linux/debug_locks.h> | 27 | #include <linux/debug_locks.h> |
28 | #include "mcs_spinlock.h" | 28 | #include <linux/osq_lock.h> |
29 | 29 | ||
30 | /* | 30 | /* |
31 | * In the DEBUG case we are using the "NULL fastpath" for mutexes, | 31 | * In the DEBUG case we are using the "NULL fastpath" for mutexes, |
@@ -217,44 +217,35 @@ ww_mutex_set_context_slowpath(struct ww_mutex *lock, | |||
217 | } | 217 | } |
218 | 218 | ||
219 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER | 219 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER |
220 | static inline bool owner_running(struct mutex *lock, struct task_struct *owner) | ||
221 | { | ||
222 | if (lock->owner != owner) | ||
223 | return false; | ||
224 | |||
225 | /* | ||
226 | * Ensure we emit the owner->on_cpu, dereference _after_ checking | ||
227 | * lock->owner still matches owner, if that fails, owner might | ||
228 | * point to free()d memory, if it still matches, the rcu_read_lock() | ||
229 | * ensures the memory stays valid. | ||
230 | */ | ||
231 | barrier(); | ||
232 | |||
233 | return owner->on_cpu; | ||
234 | } | ||
235 | |||
236 | /* | 220 | /* |
237 | * Look out! "owner" is an entirely speculative pointer | 221 | * Look out! "owner" is an entirely speculative pointer |
238 | * access and not reliable. | 222 | * access and not reliable. |
239 | */ | 223 | */ |
240 | static noinline | 224 | static noinline |
241 | int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) | 225 | bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) |
242 | { | 226 | { |
227 | bool ret = true; | ||
228 | |||
243 | rcu_read_lock(); | 229 | rcu_read_lock(); |
244 | while (owner_running(lock, owner)) { | 230 | while (lock->owner == owner) { |
245 | if (need_resched()) | 231 | /* |
232 | * Ensure we emit the owner->on_cpu, dereference _after_ | ||
233 | * checking lock->owner still matches owner. If that fails, | ||
234 | * owner might point to freed memory. If it still matches, | ||
235 | * the rcu_read_lock() ensures the memory stays valid. | ||
236 | */ | ||
237 | barrier(); | ||
238 | |||
239 | if (!owner->on_cpu || need_resched()) { | ||
240 | ret = false; | ||
246 | break; | 241 | break; |
242 | } | ||
247 | 243 | ||
248 | cpu_relax_lowlatency(); | 244 | cpu_relax_lowlatency(); |
249 | } | 245 | } |
250 | rcu_read_unlock(); | 246 | rcu_read_unlock(); |
251 | 247 | ||
252 | /* | 248 | return ret; |
253 | * We break out the loop above on need_resched() and when the | ||
254 | * owner changed, which is a sign for heavy contention. Return | ||
255 | * success only when lock->owner is NULL. | ||
256 | */ | ||
257 | return lock->owner == NULL; | ||
258 | } | 249 | } |
259 | 250 | ||
260 | /* | 251 | /* |
@@ -269,7 +260,7 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock) | |||
269 | return 0; | 260 | return 0; |
270 | 261 | ||
271 | rcu_read_lock(); | 262 | rcu_read_lock(); |
272 | owner = ACCESS_ONCE(lock->owner); | 263 | owner = READ_ONCE(lock->owner); |
273 | if (owner) | 264 | if (owner) |
274 | retval = owner->on_cpu; | 265 | retval = owner->on_cpu; |
275 | rcu_read_unlock(); | 266 | rcu_read_unlock(); |
@@ -343,7 +334,7 @@ static bool mutex_optimistic_spin(struct mutex *lock, | |||
343 | * As such, when deadlock detection needs to be | 334 | * As such, when deadlock detection needs to be |
344 | * performed the optimistic spinning cannot be done. | 335 | * performed the optimistic spinning cannot be done. |
345 | */ | 336 | */ |
346 | if (ACCESS_ONCE(ww->ctx)) | 337 | if (READ_ONCE(ww->ctx)) |
347 | break; | 338 | break; |
348 | } | 339 | } |
349 | 340 | ||
@@ -351,7 +342,7 @@ static bool mutex_optimistic_spin(struct mutex *lock, | |||
351 | * If there's an owner, wait for it to either | 342 | * If there's an owner, wait for it to either |
352 | * release the lock or go to sleep. | 343 | * release the lock or go to sleep. |
353 | */ | 344 | */ |
354 | owner = ACCESS_ONCE(lock->owner); | 345 | owner = READ_ONCE(lock->owner); |
355 | if (owner && !mutex_spin_on_owner(lock, owner)) | 346 | if (owner && !mutex_spin_on_owner(lock, owner)) |
356 | break; | 347 | break; |
357 | 348 | ||
@@ -490,7 +481,7 @@ static inline int __sched | |||
490 | __ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx) | 481 | __ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx) |
491 | { | 482 | { |
492 | struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); | 483 | struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); |
493 | struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx); | 484 | struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx); |
494 | 485 | ||
495 | if (!hold_ctx) | 486 | if (!hold_ctx) |
496 | return 0; | 487 | return 0; |
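
mutex_spin_on_owner() now returns a bool, folds the old owner_running() helper into the loop, and distinguishes the two exit reasons: it returns true when lock->owner stops matching (released or handed to a new owner), so the outer optimistic-spin loop keeps going, and false when the owner goes off-CPU or the spinner needs to reschedule. A hypothetical userspace model of just that control flow is sketched below; task_model, lock_model, fake_need_resched and spin_on_owner are invented names, and the kernel relies on rcu_read_lock() rather than atomics to keep the owner pointer valid.

    #include <pthread.h>
    #include <sched.h>
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>
    #include <unistd.h>

    struct task_model { _Atomic int on_cpu; };
    struct lock_model { _Atomic(struct task_model *) owner; };

    static _Atomic int fake_need_resched;	/* stands in for need_resched() */

    /* true: owner changed/released, worth retrying the lock;
     * false: owner off-CPU or we must reschedule, so go to sleep. */
    static bool spin_on_owner(struct lock_model *lock, struct task_model *owner)
    {
    	bool ret = true;

    	while (atomic_load(&lock->owner) == owner) {
    		/* check on_cpu only while lock->owner still matches */
    		if (!atomic_load(&owner->on_cpu) ||
    		    atomic_load(&fake_need_resched)) {
    			ret = false;
    			break;
    		}
    		sched_yield();		/* cpu_relax_lowlatency() stand-in */
    	}
    	return ret;
    }

    static struct task_model holder = { .on_cpu = 1 };
    static struct lock_model lock = { .owner = &holder };

    static void *release_later(void *arg)
    {
    	(void)arg;
    	usleep(1000);
    	atomic_store(&lock.owner, (struct task_model *)NULL);
    	return NULL;
    }

    int main(void)
    {
    	pthread_t t;

    	pthread_create(&t, NULL, release_later, NULL);
    	printf("spin result: %s\n",
    	       spin_on_owner(&lock, &holder) ? "retry acquisition" : "sleep");
    	pthread_join(t, NULL);
    	return 0;
    }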
diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c index c112d00341b0..dc85ee23a26f 100644 --- a/kernel/locking/osq_lock.c +++ b/kernel/locking/osq_lock.c | |||
@@ -98,7 +98,7 @@ bool osq_lock(struct optimistic_spin_queue *lock) | |||
98 | 98 | ||
99 | prev = decode_cpu(old); | 99 | prev = decode_cpu(old); |
100 | node->prev = prev; | 100 | node->prev = prev; |
101 | ACCESS_ONCE(prev->next) = node; | 101 | WRITE_ONCE(prev->next, node); |
102 | 102 | ||
103 | /* | 103 | /* |
104 | * Normally @prev is untouchable after the above store; because at that | 104 | * Normally @prev is untouchable after the above store; because at that |
@@ -109,7 +109,7 @@ bool osq_lock(struct optimistic_spin_queue *lock) | |||
109 | * cmpxchg in an attempt to undo our queueing. | 109 | * cmpxchg in an attempt to undo our queueing. |
110 | */ | 110 | */ |
111 | 111 | ||
112 | while (!ACCESS_ONCE(node->locked)) { | 112 | while (!READ_ONCE(node->locked)) { |
113 | /* | 113 | /* |
114 | * If we need to reschedule bail... so we can block. | 114 | * If we need to reschedule bail... so we can block. |
115 | */ | 115 | */ |
@@ -148,7 +148,7 @@ unqueue: | |||
148 | * Or we race against a concurrent unqueue()'s step-B, in which | 148 | * Or we race against a concurrent unqueue()'s step-B, in which |
149 | * case its step-C will write us a new @node->prev pointer. | 149 | * case its step-C will write us a new @node->prev pointer. |
150 | */ | 150 | */ |
151 | prev = ACCESS_ONCE(node->prev); | 151 | prev = READ_ONCE(node->prev); |
152 | } | 152 | } |
153 | 153 | ||
154 | /* | 154 | /* |
@@ -170,8 +170,8 @@ unqueue: | |||
170 | * it will wait in Step-A. | 170 | * it will wait in Step-A. |
171 | */ | 171 | */ |
172 | 172 | ||
173 | ACCESS_ONCE(next->prev) = prev; | 173 | WRITE_ONCE(next->prev, prev); |
174 | ACCESS_ONCE(prev->next) = next; | 174 | WRITE_ONCE(prev->next, next); |
175 | 175 | ||
176 | return false; | 176 | return false; |
177 | } | 177 | } |
@@ -193,11 +193,11 @@ void osq_unlock(struct optimistic_spin_queue *lock) | |||
193 | node = this_cpu_ptr(&osq_node); | 193 | node = this_cpu_ptr(&osq_node); |
194 | next = xchg(&node->next, NULL); | 194 | next = xchg(&node->next, NULL); |
195 | if (next) { | 195 | if (next) { |
196 | ACCESS_ONCE(next->locked) = 1; | 196 | WRITE_ONCE(next->locked, 1); |
197 | return; | 197 | return; |
198 | } | 198 | } |
199 | 199 | ||
200 | next = osq_wait_next(lock, node, NULL); | 200 | next = osq_wait_next(lock, node, NULL); |
201 | if (next) | 201 | if (next) |
202 | ACCESS_ONCE(next->locked) = 1; | 202 | WRITE_ONCE(next->locked, 1); |
203 | } | 203 | } |
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 6357265a31ad..b73279367087 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c | |||
@@ -349,7 +349,7 @@ static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p) | |||
349 | * | 349 | * |
350 | * @task: the task owning the mutex (owner) for which a chain walk is | 350 | * @task: the task owning the mutex (owner) for which a chain walk is |
351 | * probably needed | 351 | * probably needed |
352 | * @deadlock_detect: do we have to carry out deadlock detection? | 352 | * @chwalk: do we have to carry out deadlock detection? |
353 | * @orig_lock: the mutex (can be NULL if we are walking the chain to recheck | 353 | * @orig_lock: the mutex (can be NULL if we are walking the chain to recheck |
354 | * things for a task that has just got its priority adjusted, and | 354 | * things for a task that has just got its priority adjusted, and |
355 | * is waiting on a mutex) | 355 | * is waiting on a mutex) |
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c index 2555ae15ec14..3a5048572065 100644 --- a/kernel/locking/rwsem-spinlock.c +++ b/kernel/locking/rwsem-spinlock.c | |||
@@ -85,6 +85,13 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) | |||
85 | 85 | ||
86 | list_del(&waiter->list); | 86 | list_del(&waiter->list); |
87 | tsk = waiter->task; | 87 | tsk = waiter->task; |
88 | /* | ||
89 | * Make sure we do not wakeup the next reader before | ||
90 | * setting the nil condition to grant the next reader; | ||
91 | * otherwise we could miss the wakeup on the other | ||
92 | * side and end up sleeping again. See the pairing | ||
93 | * in rwsem_down_read_failed(). | ||
94 | */ | ||
88 | smp_mb(); | 95 | smp_mb(); |
89 | waiter->task = NULL; | 96 | waiter->task = NULL; |
90 | wake_up_process(tsk); | 97 | wake_up_process(tsk); |
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 2f7cc4076f50..3417d0172a5d 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c | |||
@@ -14,8 +14,9 @@ | |||
14 | #include <linux/init.h> | 14 | #include <linux/init.h> |
15 | #include <linux/export.h> | 15 | #include <linux/export.h> |
16 | #include <linux/sched/rt.h> | 16 | #include <linux/sched/rt.h> |
17 | #include <linux/osq_lock.h> | ||
17 | 18 | ||
18 | #include "mcs_spinlock.h" | 19 | #include "rwsem.h" |
19 | 20 | ||
20 | /* | 21 | /* |
21 | * Guide to the rw_semaphore's count field for common values. | 22 | * Guide to the rw_semaphore's count field for common values. |
@@ -186,6 +187,13 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) | |||
186 | waiter = list_entry(next, struct rwsem_waiter, list); | 187 | waiter = list_entry(next, struct rwsem_waiter, list); |
187 | next = waiter->list.next; | 188 | next = waiter->list.next; |
188 | tsk = waiter->task; | 189 | tsk = waiter->task; |
190 | /* | ||
191 | * Make sure we do not wakeup the next reader before | ||
192 | * setting the nil condition to grant the next reader; | ||
193 | * otherwise we could miss the wakeup on the other | ||
194 | * side and end up sleeping again. See the pairing | ||
195 | * in rwsem_down_read_failed(). | ||
196 | */ | ||
189 | smp_mb(); | 197 | smp_mb(); |
190 | waiter->task = NULL; | 198 | waiter->task = NULL; |
191 | wake_up_process(tsk); | 199 | wake_up_process(tsk); |
@@ -258,6 +266,7 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) | |||
258 | RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) { | 266 | RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) { |
259 | if (!list_is_singular(&sem->wait_list)) | 267 | if (!list_is_singular(&sem->wait_list)) |
260 | rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); | 268 | rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); |
269 | rwsem_set_owner(sem); | ||
261 | return true; | 270 | return true; |
262 | } | 271 | } |
263 | 272 | ||
@@ -270,15 +279,17 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) | |||
270 | */ | 279 | */ |
271 | static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) | 280 | static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) |
272 | { | 281 | { |
273 | long old, count = ACCESS_ONCE(sem->count); | 282 | long old, count = READ_ONCE(sem->count); |
274 | 283 | ||
275 | while (true) { | 284 | while (true) { |
276 | if (!(count == 0 || count == RWSEM_WAITING_BIAS)) | 285 | if (!(count == 0 || count == RWSEM_WAITING_BIAS)) |
277 | return false; | 286 | return false; |
278 | 287 | ||
279 | old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS); | 288 | old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS); |
280 | if (old == count) | 289 | if (old == count) { |
290 | rwsem_set_owner(sem); | ||
281 | return true; | 291 | return true; |
292 | } | ||
282 | 293 | ||
283 | count = old; | 294 | count = old; |
284 | } | 295 | } |
@@ -287,60 +298,67 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) | |||
287 | static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) | 298 | static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) |
288 | { | 299 | { |
289 | struct task_struct *owner; | 300 | struct task_struct *owner; |
290 | bool on_cpu = false; | 301 | bool ret = true; |
291 | 302 | ||
292 | if (need_resched()) | 303 | if (need_resched()) |
293 | return false; | 304 | return false; |
294 | 305 | ||
295 | rcu_read_lock(); | 306 | rcu_read_lock(); |
296 | owner = ACCESS_ONCE(sem->owner); | 307 | owner = READ_ONCE(sem->owner); |
297 | if (owner) | 308 | if (!owner) { |
298 | on_cpu = owner->on_cpu; | 309 | long count = READ_ONCE(sem->count); |
299 | rcu_read_unlock(); | 310 | /* |
300 | 311 | * If sem->owner is not set, yet we have just recently entered the | |
301 | /* | 312 | * slowpath with the lock being active, then there is a possibility |
302 | * If sem->owner is not set, yet we have just recently entered the | 313 | * reader(s) may have the lock. To be safe, bail spinning in these |
303 | * slowpath, then there is a possibility reader(s) may have the lock. | 314 | * situations. |
304 | * To be safe, avoid spinning in these situations. | 315 | */ |
305 | */ | 316 | if (count & RWSEM_ACTIVE_MASK) |
306 | return on_cpu; | 317 | ret = false; |
307 | } | 318 | goto done; |
308 | 319 | } | |
309 | static inline bool owner_running(struct rw_semaphore *sem, | ||
310 | struct task_struct *owner) | ||
311 | { | ||
312 | if (sem->owner != owner) | ||
313 | return false; | ||
314 | |||
315 | /* | ||
316 | * Ensure we emit the owner->on_cpu, dereference _after_ checking | ||
317 | * sem->owner still matches owner, if that fails, owner might | ||
318 | * point to free()d memory, if it still matches, the rcu_read_lock() | ||
319 | * ensures the memory stays valid. | ||
320 | */ | ||
321 | barrier(); | ||
322 | 320 | ||
323 | return owner->on_cpu; | 321 | ret = owner->on_cpu; |
322 | done: | ||
323 | rcu_read_unlock(); | ||
324 | return ret; | ||
324 | } | 325 | } |
325 | 326 | ||
326 | static noinline | 327 | static noinline |
327 | bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner) | 328 | bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner) |
328 | { | 329 | { |
330 | long count; | ||
331 | |||
329 | rcu_read_lock(); | 332 | rcu_read_lock(); |
330 | while (owner_running(sem, owner)) { | 333 | while (sem->owner == owner) { |
331 | if (need_resched()) | 334 | /* |
332 | break; | 335 | * Ensure we emit the owner->on_cpu, dereference _after_ |
336 | * checking sem->owner still matches owner, if that fails, | ||

337 | * owner might point to free()d memory, if it still matches, | ||
338 | * the rcu_read_lock() ensures the memory stays valid. | ||
339 | */ | ||
340 | barrier(); | ||
341 | |||
342 | /* abort spinning when need_resched or owner is not running */ | ||
343 | if (!owner->on_cpu || need_resched()) { | ||
344 | rcu_read_unlock(); | ||
345 | return false; | ||
346 | } | ||
333 | 347 | ||
334 | cpu_relax_lowlatency(); | 348 | cpu_relax_lowlatency(); |
335 | } | 349 | } |
336 | rcu_read_unlock(); | 350 | rcu_read_unlock(); |
337 | 351 | ||
352 | if (READ_ONCE(sem->owner)) | ||
353 | return true; /* new owner, continue spinning */ | ||
354 | |||
338 | /* | 355 | /* |
339 | * We break out the loop above on need_resched() or when the | 356 | * When the owner is not set, the lock could be free or |
340 | * owner changed, which is a sign for heavy contention. Return | 357 | * held by readers. Check the counter to verify the |
341 | * success only when sem->owner is NULL. | 358 | * state. |
342 | */ | 359 | */ |
343 | return sem->owner == NULL; | 360 | count = READ_ONCE(sem->count); |
361 | return (count == 0 || count == RWSEM_WAITING_BIAS); | ||
344 | } | 362 | } |
345 | 363 | ||
346 | static bool rwsem_optimistic_spin(struct rw_semaphore *sem) | 364 | static bool rwsem_optimistic_spin(struct rw_semaphore *sem) |
@@ -358,7 +376,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) | |||
358 | goto done; | 376 | goto done; |
359 | 377 | ||
360 | while (true) { | 378 | while (true) { |
361 | owner = ACCESS_ONCE(sem->owner); | 379 | owner = READ_ONCE(sem->owner); |
362 | if (owner && !rwsem_spin_on_owner(sem, owner)) | 380 | if (owner && !rwsem_spin_on_owner(sem, owner)) |
363 | break; | 381 | break; |
364 | 382 | ||
@@ -432,7 +450,7 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) | |||
432 | 450 | ||
433 | /* we're now waiting on the lock, but no longer actively locking */ | 451 | /* we're now waiting on the lock, but no longer actively locking */ |
434 | if (waiting) { | 452 | if (waiting) { |
435 | count = ACCESS_ONCE(sem->count); | 453 | count = READ_ONCE(sem->count); |
436 | 454 | ||
437 | /* | 455 | /* |
438 | * If there were already threads queued before us and there are | 456 | * If there were already threads queued before us and there are |
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index e2d3bc7f03b4..205be0ce34de 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c | |||
@@ -9,29 +9,9 @@ | |||
9 | #include <linux/sched.h> | 9 | #include <linux/sched.h> |
10 | #include <linux/export.h> | 10 | #include <linux/export.h> |
11 | #include <linux/rwsem.h> | 11 | #include <linux/rwsem.h> |
12 | |||
13 | #include <linux/atomic.h> | 12 | #include <linux/atomic.h> |
14 | 13 | ||
15 | #ifdef CONFIG_RWSEM_SPIN_ON_OWNER | 14 | #include "rwsem.h" |
16 | static inline void rwsem_set_owner(struct rw_semaphore *sem) | ||
17 | { | ||
18 | sem->owner = current; | ||
19 | } | ||
20 | |||
21 | static inline void rwsem_clear_owner(struct rw_semaphore *sem) | ||
22 | { | ||
23 | sem->owner = NULL; | ||
24 | } | ||
25 | |||
26 | #else | ||
27 | static inline void rwsem_set_owner(struct rw_semaphore *sem) | ||
28 | { | ||
29 | } | ||
30 | |||
31 | static inline void rwsem_clear_owner(struct rw_semaphore *sem) | ||
32 | { | ||
33 | } | ||
34 | #endif | ||
35 | 15 | ||
36 | /* | 16 | /* |
37 | * lock for reading | 17 | * lock for reading |
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h new file mode 100644 index 000000000000..870ed9a5b426 --- /dev/null +++ b/kernel/locking/rwsem.h | |||
@@ -0,0 +1,20 @@ | |||
1 | #ifdef CONFIG_RWSEM_SPIN_ON_OWNER | ||
2 | static inline void rwsem_set_owner(struct rw_semaphore *sem) | ||
3 | { | ||
4 | sem->owner = current; | ||
5 | } | ||
6 | |||
7 | static inline void rwsem_clear_owner(struct rw_semaphore *sem) | ||
8 | { | ||
9 | sem->owner = NULL; | ||
10 | } | ||
11 | |||
12 | #else | ||
13 | static inline void rwsem_set_owner(struct rw_semaphore *sem) | ||
14 | { | ||
15 | } | ||
16 | |||
17 | static inline void rwsem_clear_owner(struct rw_semaphore *sem) | ||
18 | { | ||
19 | } | ||
20 | #endif | ||
diff --git a/kernel/module.c b/kernel/module.c index b3d634ed06c9..650b038ae520 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -1865,7 +1865,7 @@ static void free_module(struct module *mod) | |||
1865 | kfree(mod->args); | 1865 | kfree(mod->args); |
1866 | percpu_modfree(mod); | 1866 | percpu_modfree(mod); |
1867 | 1867 | ||
1868 | /* Free lock-classes: */ | 1868 | /* Free lock-classes; relies on the preceding sync_rcu(). */ |
1869 | lockdep_free_key_range(mod->module_core, mod->core_size); | 1869 | lockdep_free_key_range(mod->module_core, mod->core_size); |
1870 | 1870 | ||
1871 | /* Finally, free the core (containing the module structure) */ | 1871 | /* Finally, free the core (containing the module structure) */ |
@@ -2479,6 +2479,23 @@ static int elf_header_check(struct load_info *info) | |||
2479 | return 0; | 2479 | return 0; |
2480 | } | 2480 | } |
2481 | 2481 | ||
2482 | #define COPY_CHUNK_SIZE (16*PAGE_SIZE) | ||
2483 | |||
2484 | static int copy_chunked_from_user(void *dst, const void __user *usrc, unsigned long len) | ||
2485 | { | ||
2486 | do { | ||
2487 | unsigned long n = min(len, COPY_CHUNK_SIZE); | ||
2488 | |||
2489 | if (copy_from_user(dst, usrc, n) != 0) | ||
2490 | return -EFAULT; | ||
2491 | cond_resched(); | ||
2492 | dst += n; | ||
2493 | usrc += n; | ||
2494 | len -= n; | ||
2495 | } while (len); | ||
2496 | return 0; | ||
2497 | } | ||
2498 | |||
2482 | /* Sets info->hdr and info->len. */ | 2499 | /* Sets info->hdr and info->len. */ |
2483 | static int copy_module_from_user(const void __user *umod, unsigned long len, | 2500 | static int copy_module_from_user(const void __user *umod, unsigned long len, |
2484 | struct load_info *info) | 2501 | struct load_info *info) |
@@ -2498,7 +2515,7 @@ static int copy_module_from_user(const void __user *umod, unsigned long len, | |||
2498 | if (!info->hdr) | 2515 | if (!info->hdr) |
2499 | return -ENOMEM; | 2516 | return -ENOMEM; |
2500 | 2517 | ||
2501 | if (copy_from_user(info->hdr, umod, info->len) != 0) { | 2518 | if (copy_chunked_from_user(info->hdr, umod, info->len) != 0) { |
2502 | vfree(info->hdr); | 2519 | vfree(info->hdr); |
2503 | return -EFAULT; | 2520 | return -EFAULT; |
2504 | } | 2521 | } |
@@ -2753,6 +2770,9 @@ static int find_module_sections(struct module *mod, struct load_info *info) | |||
2753 | mod->trace_events = section_objs(info, "_ftrace_events", | 2770 | mod->trace_events = section_objs(info, "_ftrace_events", |
2754 | sizeof(*mod->trace_events), | 2771 | sizeof(*mod->trace_events), |
2755 | &mod->num_trace_events); | 2772 | &mod->num_trace_events); |
2773 | mod->trace_enums = section_objs(info, "_ftrace_enum_map", | ||
2774 | sizeof(*mod->trace_enums), | ||
2775 | &mod->num_trace_enums); | ||
2756 | #endif | 2776 | #endif |
2757 | #ifdef CONFIG_TRACING | 2777 | #ifdef CONFIG_TRACING |
2758 | mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt", | 2778 | mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt", |
@@ -3349,9 +3369,6 @@ static int load_module(struct load_info *info, const char __user *uargs, | |||
3349 | module_bug_cleanup(mod); | 3369 | module_bug_cleanup(mod); |
3350 | mutex_unlock(&module_mutex); | 3370 | mutex_unlock(&module_mutex); |
3351 | 3371 | ||
3352 | /* Free lock-classes: */ | ||
3353 | lockdep_free_key_range(mod->module_core, mod->core_size); | ||
3354 | |||
3355 | /* we can't deallocate the module until we clear memory protection */ | 3372 | /* we can't deallocate the module until we clear memory protection */ |
3356 | unset_module_init_ro_nx(mod); | 3373 | unset_module_init_ro_nx(mod); |
3357 | unset_module_core_ro_nx(mod); | 3374 | unset_module_core_ro_nx(mod); |
@@ -3375,6 +3392,9 @@ static int load_module(struct load_info *info, const char __user *uargs, | |||
3375 | synchronize_rcu(); | 3392 | synchronize_rcu(); |
3376 | mutex_unlock(&module_mutex); | 3393 | mutex_unlock(&module_mutex); |
3377 | free_module: | 3394 | free_module: |
3395 | /* Free lock-classes; relies on the preceding sync_rcu() */ | ||
3396 | lockdep_free_key_range(mod->module_core, mod->core_size); | ||
3397 | |||
3378 | module_deallocate(mod, info); | 3398 | module_deallocate(mod, info); |
3379 | free_copy: | 3399 | free_copy: |
3380 | free_copy(info); | 3400 | free_copy(info); |
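
copy_module_from_user() now pulls the module image in through the new copy_chunked_from_user(), 16 pages at a time with a cond_resched() between chunks, so copying a very large image gets periodic scheduling points instead of one long uninterrupted copy. A userspace stand-in is sketched below, assuming 4 KiB pages; copy_chunked(), the buffer sizes and sched_yield()-as-cond_resched() are all illustrative.

    #include <sched.h>
    #include <stddef.h>
    #include <string.h>

    #define COPY_CHUNK_SIZE (16 * 4096)	/* 16 pages, assuming 4 KiB pages */

    static int copy_chunked(void *dst, const void *src, size_t len)
    {
    	while (len) {
    		size_t n = len < COPY_CHUNK_SIZE ? len : COPY_CHUNK_SIZE;

    		memcpy(dst, src, n);	/* copy_from_user() in the kernel */
    		sched_yield();		/* cond_resched() stand-in */
    		dst = (char *)dst + n;
    		src = (const char *)src + n;
    		len -= n;
    	}
    	return 0;
    }

    int main(void)
    {
    	static char src[1 << 20], dst[1 << 20];

    	return copy_chunked(dst, src, sizeof(src));
    }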
diff --git a/kernel/pid.c b/kernel/pid.c index cd36a5e0d173..4fd07d5b7baf 100644 --- a/kernel/pid.c +++ b/kernel/pid.c | |||
@@ -182,7 +182,7 @@ static int alloc_pidmap(struct pid_namespace *pid_ns) | |||
182 | spin_unlock_irq(&pidmap_lock); | 182 | spin_unlock_irq(&pidmap_lock); |
183 | kfree(page); | 183 | kfree(page); |
184 | if (unlikely(!map->page)) | 184 | if (unlikely(!map->page)) |
185 | break; | 185 | return -ENOMEM; |
186 | } | 186 | } |
187 | if (likely(atomic_read(&map->nr_free))) { | 187 | if (likely(atomic_read(&map->nr_free))) { |
188 | for ( ; ; ) { | 188 | for ( ; ; ) { |
@@ -210,7 +210,7 @@ static int alloc_pidmap(struct pid_namespace *pid_ns) | |||
210 | } | 210 | } |
211 | pid = mk_pid(pid_ns, map, offset); | 211 | pid = mk_pid(pid_ns, map, offset); |
212 | } | 212 | } |
213 | return -1; | 213 | return -EAGAIN; |
214 | } | 214 | } |
215 | 215 | ||
216 | int next_pidmap(struct pid_namespace *pid_ns, unsigned int last) | 216 | int next_pidmap(struct pid_namespace *pid_ns, unsigned int last) |
@@ -301,17 +301,20 @@ struct pid *alloc_pid(struct pid_namespace *ns) | |||
301 | int i, nr; | 301 | int i, nr; |
302 | struct pid_namespace *tmp; | 302 | struct pid_namespace *tmp; |
303 | struct upid *upid; | 303 | struct upid *upid; |
304 | int retval = -ENOMEM; | ||
304 | 305 | ||
305 | pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL); | 306 | pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL); |
306 | if (!pid) | 307 | if (!pid) |
307 | goto out; | 308 | return ERR_PTR(retval); |
308 | 309 | ||
309 | tmp = ns; | 310 | tmp = ns; |
310 | pid->level = ns->level; | 311 | pid->level = ns->level; |
311 | for (i = ns->level; i >= 0; i--) { | 312 | for (i = ns->level; i >= 0; i--) { |
312 | nr = alloc_pidmap(tmp); | 313 | nr = alloc_pidmap(tmp); |
313 | if (nr < 0) | 314 | if (IS_ERR_VALUE(nr)) { |
315 | retval = nr; | ||
314 | goto out_free; | 316 | goto out_free; |
317 | } | ||
315 | 318 | ||
316 | pid->numbers[i].nr = nr; | 319 | pid->numbers[i].nr = nr; |
317 | pid->numbers[i].ns = tmp; | 320 | pid->numbers[i].ns = tmp; |
@@ -339,7 +342,6 @@ struct pid *alloc_pid(struct pid_namespace *ns) | |||
339 | } | 342 | } |
340 | spin_unlock_irq(&pidmap_lock); | 343 | spin_unlock_irq(&pidmap_lock); |
341 | 344 | ||
342 | out: | ||
343 | return pid; | 345 | return pid; |
344 | 346 | ||
345 | out_unlock: | 347 | out_unlock: |
@@ -351,8 +353,7 @@ out_free: | |||
351 | free_pidmap(pid->numbers + i); | 353 | free_pidmap(pid->numbers + i); |
352 | 354 | ||
353 | kmem_cache_free(ns->pid_cachep, pid); | 355 | kmem_cache_free(ns->pid_cachep, pid); |
354 | pid = NULL; | 356 | return ERR_PTR(retval); |
355 | goto out; | ||
356 | } | 357 | } |
357 | 358 | ||
358 | void disable_pid_allocation(struct pid_namespace *ns) | 359 | void disable_pid_allocation(struct pid_namespace *ns) |
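
alloc_pid() now reports why it failed — ERR_PTR(-ENOMEM) vs ERR_PTR(-EAGAIN) — instead of collapsing every failure into NULL, with alloc_pidmap() returning -ENOMEM/-EAGAIN rather than -1 to match. Below is a simplified userspace rendering of the error-pointer convention; MAX_ERRNO, the macro bodies and alloc_pidish() are abbreviated or invented for the sketch, and the kernel's real definitions live in include/linux/err.h.

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Errno values are small negatives, so they fold into the top of the
     * pointer range and can be told apart from real pointers. */
    #define MAX_ERRNO	4095
    #define ERR_PTR(err)	((void *)(long)(err))
    #define PTR_ERR(ptr)	((long)(ptr))
    #define IS_ERR(ptr)	((unsigned long)(ptr) >= (unsigned long)-MAX_ERRNO)

    struct pidish { int nr; };

    static struct pidish *alloc_pidish(int want)
    {
    	struct pidish *p = malloc(sizeof(*p));

    	if (!p)
    		return ERR_PTR(-ENOMEM);
    	if (want < 0) {			/* pretend the pidmap is exhausted */
    		free(p);
    		return ERR_PTR(-EAGAIN);
    	}
    	p->nr = want;
    	return p;
    }

    int main(void)
    {
    	struct pidish *p = alloc_pidish(-1);

    	if (IS_ERR(p))
    		printf("alloc failed: %ld\n", PTR_ERR(p));
    	else
    		free(p);
    	return 0;
    }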
diff --git a/kernel/power/main.c b/kernel/power/main.c index 9a59d042ea84..86e8157a450f 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
@@ -11,7 +11,7 @@ | |||
11 | #include <linux/export.h> | 11 | #include <linux/export.h> |
12 | #include <linux/kobject.h> | 12 | #include <linux/kobject.h> |
13 | #include <linux/string.h> | 13 | #include <linux/string.h> |
14 | #include <linux/resume-trace.h> | 14 | #include <linux/pm-trace.h> |
15 | #include <linux/workqueue.h> | 15 | #include <linux/workqueue.h> |
16 | #include <linux/debugfs.h> | 16 | #include <linux/debugfs.h> |
17 | #include <linux/seq_file.h> | 17 | #include <linux/seq_file.h> |
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index c24d5a23bf93..5235dd4e1e2f 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
@@ -955,25 +955,6 @@ static void mark_nosave_pages(struct memory_bitmap *bm) | |||
955 | } | 955 | } |
956 | } | 956 | } |
957 | 957 | ||
958 | static bool is_nosave_page(unsigned long pfn) | ||
959 | { | ||
960 | struct nosave_region *region; | ||
961 | |||
962 | list_for_each_entry(region, &nosave_regions, list) { | ||
963 | if (pfn >= region->start_pfn && pfn < region->end_pfn) { | ||
964 | pr_err("PM: %#010llx in e820 nosave region: " | ||
965 | "[mem %#010llx-%#010llx]\n", | ||
966 | (unsigned long long) pfn << PAGE_SHIFT, | ||
967 | (unsigned long long) region->start_pfn << PAGE_SHIFT, | ||
968 | ((unsigned long long) region->end_pfn << PAGE_SHIFT) | ||
969 | - 1); | ||
970 | return true; | ||
971 | } | ||
972 | } | ||
973 | |||
974 | return false; | ||
975 | } | ||
976 | |||
977 | /** | 958 | /** |
978 | * create_basic_memory_bitmaps - create bitmaps needed for marking page | 959 | * create_basic_memory_bitmaps - create bitmaps needed for marking page |
979 | * frames that should not be saved and free page frames. The pointers | 960 | * frames that should not be saved and free page frames. The pointers |
@@ -2042,7 +2023,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm) | |||
2042 | do { | 2023 | do { |
2043 | pfn = memory_bm_next_pfn(bm); | 2024 | pfn = memory_bm_next_pfn(bm); |
2044 | if (likely(pfn != BM_END_OF_MAP)) { | 2025 | if (likely(pfn != BM_END_OF_MAP)) { |
2045 | if (likely(pfn_valid(pfn)) && !is_nosave_page(pfn)) | 2026 | if (likely(pfn_valid(pfn))) |
2046 | swsusp_set_page_free(pfn_to_page(pfn)); | 2027 | swsusp_set_page_free(pfn_to_page(pfn)); |
2047 | else | 2028 | else |
2048 | return -EFAULT; | 2029 | return -EFAULT; |
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index b7d6b3a721b1..8d7a1ef72758 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c | |||
@@ -28,6 +28,7 @@ | |||
28 | #include <linux/ftrace.h> | 28 | #include <linux/ftrace.h> |
29 | #include <trace/events/power.h> | 29 | #include <trace/events/power.h> |
30 | #include <linux/compiler.h> | 30 | #include <linux/compiler.h> |
31 | #include <linux/moduleparam.h> | ||
31 | 32 | ||
32 | #include "power.h" | 33 | #include "power.h" |
33 | 34 | ||
@@ -233,12 +234,20 @@ static bool platform_suspend_again(suspend_state_t state) | |||
233 | suspend_ops->suspend_again() : false; | 234 | suspend_ops->suspend_again() : false; |
234 | } | 235 | } |
235 | 236 | ||
237 | #ifdef CONFIG_PM_DEBUG | ||
238 | static unsigned int pm_test_delay = 5; | ||
239 | module_param(pm_test_delay, uint, 0644); | ||
240 | MODULE_PARM_DESC(pm_test_delay, | ||
241 | "Number of seconds to wait before resuming from suspend test"); | ||
242 | #endif | ||
243 | |||
236 | static int suspend_test(int level) | 244 | static int suspend_test(int level) |
237 | { | 245 | { |
238 | #ifdef CONFIG_PM_DEBUG | 246 | #ifdef CONFIG_PM_DEBUG |
239 | if (pm_test_level == level) { | 247 | if (pm_test_level == level) { |
240 | printk(KERN_INFO "suspend debug: Waiting for 5 seconds.\n"); | 248 | printk(KERN_INFO "suspend debug: Waiting for %d second(s).\n", |
241 | mdelay(5000); | 249 | pm_test_delay); |
250 | mdelay(pm_test_delay * 1000); | ||
242 | return 1; | 251 | return 1; |
243 | } | 252 | } |
244 | #endif /* !CONFIG_PM_DEBUG */ | 253 | #endif /* !CONFIG_PM_DEBUG */ |
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index bb0635bd74f2..c099b082cd02 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c | |||
@@ -32,7 +32,6 @@ | |||
32 | #include <linux/security.h> | 32 | #include <linux/security.h> |
33 | #include <linux/bootmem.h> | 33 | #include <linux/bootmem.h> |
34 | #include <linux/memblock.h> | 34 | #include <linux/memblock.h> |
35 | #include <linux/aio.h> | ||
36 | #include <linux/syscalls.h> | 35 | #include <linux/syscalls.h> |
37 | #include <linux/kexec.h> | 36 | #include <linux/kexec.h> |
38 | #include <linux/kdb.h> | 37 | #include <linux/kdb.h> |
@@ -46,6 +45,7 @@ | |||
46 | #include <linux/irq_work.h> | 45 | #include <linux/irq_work.h> |
47 | #include <linux/utsname.h> | 46 | #include <linux/utsname.h> |
48 | #include <linux/ctype.h> | 47 | #include <linux/ctype.h> |
48 | #include <linux/uio.h> | ||
49 | 49 | ||
50 | #include <asm/uaccess.h> | 50 | #include <asm/uaccess.h> |
51 | 51 | ||
@@ -521,7 +521,7 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) | |||
521 | int i; | 521 | int i; |
522 | int level = default_message_loglevel; | 522 | int level = default_message_loglevel; |
523 | int facility = 1; /* LOG_USER */ | 523 | int facility = 1; /* LOG_USER */ |
524 | size_t len = iocb->ki_nbytes; | 524 | size_t len = iov_iter_count(from); |
525 | ssize_t ret = len; | 525 | ssize_t ret = len; |
526 | 526 | ||
527 | if (len > LOG_LINE_MAX) | 527 | if (len > LOG_LINE_MAX) |
@@ -2017,24 +2017,6 @@ int add_preferred_console(char *name, int idx, char *options) | |||
2017 | return __add_preferred_console(name, idx, options, NULL); | 2017 | return __add_preferred_console(name, idx, options, NULL); |
2018 | } | 2018 | } |
2019 | 2019 | ||
2020 | int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, char *options) | ||
2021 | { | ||
2022 | struct console_cmdline *c; | ||
2023 | int i; | ||
2024 | |||
2025 | for (i = 0, c = console_cmdline; | ||
2026 | i < MAX_CMDLINECONSOLES && c->name[0]; | ||
2027 | i++, c++) | ||
2028 | if (strcmp(c->name, name) == 0 && c->index == idx) { | ||
2029 | strlcpy(c->name, name_new, sizeof(c->name)); | ||
2030 | c->options = options; | ||
2031 | c->index = idx_new; | ||
2032 | return i; | ||
2033 | } | ||
2034 | /* not found */ | ||
2035 | return -1; | ||
2036 | } | ||
2037 | |||
2038 | bool console_suspend_enabled = true; | 2020 | bool console_suspend_enabled = true; |
2039 | EXPORT_SYMBOL(console_suspend_enabled); | 2021 | EXPORT_SYMBOL(console_suspend_enabled); |
2040 | 2022 | ||
@@ -2436,9 +2418,6 @@ void register_console(struct console *newcon) | |||
2436 | if (preferred_console < 0 || bcon || !console_drivers) | 2418 | if (preferred_console < 0 || bcon || !console_drivers) |
2437 | preferred_console = selected_console; | 2419 | preferred_console = selected_console; |
2438 | 2420 | ||
2439 | if (newcon->early_setup) | ||
2440 | newcon->early_setup(); | ||
2441 | |||
2442 | /* | 2421 | /* |
2443 | * See if we want to use this console driver. If we | 2422 | * See if we want to use this console driver. If we |
2444 | * didn't select a console we take the first one | 2423 | * didn't select a console we take the first one |
@@ -2464,23 +2443,27 @@ void register_console(struct console *newcon) | |||
2464 | for (i = 0, c = console_cmdline; | 2443 | for (i = 0, c = console_cmdline; |
2465 | i < MAX_CMDLINECONSOLES && c->name[0]; | 2444 | i < MAX_CMDLINECONSOLES && c->name[0]; |
2466 | i++, c++) { | 2445 | i++, c++) { |
2467 | BUILD_BUG_ON(sizeof(c->name) != sizeof(newcon->name)); | 2446 | if (!newcon->match || |
2468 | if (strcmp(c->name, newcon->name) != 0) | 2447 | newcon->match(newcon, c->name, c->index, c->options) != 0) { |
2469 | continue; | 2448 | /* default matching */ |
2470 | if (newcon->index >= 0 && | 2449 | BUILD_BUG_ON(sizeof(c->name) != sizeof(newcon->name)); |
2471 | newcon->index != c->index) | 2450 | if (strcmp(c->name, newcon->name) != 0) |
2472 | continue; | 2451 | continue; |
2473 | if (newcon->index < 0) | 2452 | if (newcon->index >= 0 && |
2474 | newcon->index = c->index; | 2453 | newcon->index != c->index) |
2454 | continue; | ||
2455 | if (newcon->index < 0) | ||
2456 | newcon->index = c->index; | ||
2475 | 2457 | ||
2476 | if (_braille_register_console(newcon, c)) | 2458 | if (_braille_register_console(newcon, c)) |
2477 | return; | 2459 | return; |
2460 | |||
2461 | if (newcon->setup && | ||
2462 | newcon->setup(newcon, c->options) != 0) | ||
2463 | break; | ||
2464 | } | ||
2478 | 2465 | ||
2479 | if (newcon->setup && | ||
2480 | newcon->setup(newcon, console_cmdline[i].options) != 0) | ||
2481 | break; | ||
2482 | newcon->flags |= CON_ENABLED; | 2466 | newcon->flags |= CON_ENABLED; |
2483 | newcon->index = c->index; | ||
2484 | if (i == selected_console) { | 2467 | if (i == selected_console) { |
2485 | newcon->flags |= CON_CONSDEV; | 2468 | newcon->flags |= CON_CONSDEV; |
2486 | preferred_console = selected_console; | 2469 | preferred_console = selected_console; |
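
register_console() gains a per-driver match() hook: if the driver supplies one and it returns 0 for a console_cmdline entry, the driver has claimed that entry (and is expected to have configured itself), and only otherwise does the old name/index comparison and setup() path run. A loose userspace model of just the matching decision follows; console_like, alias_match() and the "serial0" alias are invented, and the real hook also receives the options string.

    #include <stdio.h>
    #include <string.h>

    struct console_like {
    	const char *name;
    	int index;
    	/* optional: claim command-line entries whose name differs from
    	 * the driver's own; return 0 to claim. */
    	int (*match)(struct console_like *c, const char *name, int idx);
    };

    static int alias_match(struct console_like *c, const char *name, int idx)
    {
    	(void)c; (void)idx;
    	return strcmp(name, "serial0") == 0 ? 0 : -1;
    }

    static int console_matches(struct console_like *c, const char *name, int idx)
    {
    	if (c->match && c->match(c, name, idx) == 0)
    		return 1;
    	/* default matching: exact name, plus index if the driver pinned one */
    	if (strcmp(c->name, name) != 0)
    		return 0;
    	return c->index < 0 || c->index == idx;
    }

    int main(void)
    {
    	struct console_like uart = { "ttyS", 0, alias_match };

    	printf("%d %d %d\n",
    	       console_matches(&uart, "ttyS", 0),
    	       console_matches(&uart, "serial0", 0),
    	       console_matches(&uart, "ttyAMA", 0));
    	return 0;
    }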
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 227fec36b12a..c8e0e050a36a 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
@@ -456,8 +456,6 @@ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p) | |||
456 | 456 | ||
457 | static int ptrace_detach(struct task_struct *child, unsigned int data) | 457 | static int ptrace_detach(struct task_struct *child, unsigned int data) |
458 | { | 458 | { |
459 | bool dead = false; | ||
460 | |||
461 | if (!valid_signal(data)) | 459 | if (!valid_signal(data)) |
462 | return -EIO; | 460 | return -EIO; |
463 | 461 | ||
@@ -467,18 +465,19 @@ static int ptrace_detach(struct task_struct *child, unsigned int data) | |||
467 | 465 | ||
468 | write_lock_irq(&tasklist_lock); | 466 | write_lock_irq(&tasklist_lock); |
469 | /* | 467 | /* |
470 | * This child can be already killed. Make sure de_thread() or | 468 | * We rely on ptrace_freeze_traced(). It can't be killed and |
471 | * our sub-thread doing do_wait() didn't do release_task() yet. | 469 | * untraced by another thread, it can't be a zombie. |
472 | */ | 470 | */ |
473 | if (child->ptrace) { | 471 | WARN_ON(!child->ptrace || child->exit_state); |
474 | child->exit_code = data; | 472 | /* |
475 | dead = __ptrace_detach(current, child); | 473 | * tasklist_lock avoids the race with wait_task_stopped(), see |
476 | } | 474 | * the comment in ptrace_resume(). |
475 | */ | ||
476 | child->exit_code = data; | ||
477 | __ptrace_detach(current, child); | ||
477 | write_unlock_irq(&tasklist_lock); | 478 | write_unlock_irq(&tasklist_lock); |
478 | 479 | ||
479 | proc_ptrace_connector(child, PTRACE_DETACH); | 480 | proc_ptrace_connector(child, PTRACE_DETACH); |
480 | if (unlikely(dead)) | ||
481 | release_task(child); | ||
482 | 481 | ||
483 | return 0; | 482 | return 0; |
484 | } | 483 | } |
@@ -697,6 +696,8 @@ static int ptrace_peek_siginfo(struct task_struct *child, | |||
697 | static int ptrace_resume(struct task_struct *child, long request, | 696 | static int ptrace_resume(struct task_struct *child, long request, |
698 | unsigned long data) | 697 | unsigned long data) |
699 | { | 698 | { |
699 | bool need_siglock; | ||
700 | |||
700 | if (!valid_signal(data)) | 701 | if (!valid_signal(data)) |
701 | return -EIO; | 702 | return -EIO; |
702 | 703 | ||
@@ -724,8 +725,26 @@ static int ptrace_resume(struct task_struct *child, long request, | |||
724 | user_disable_single_step(child); | 725 | user_disable_single_step(child); |
725 | } | 726 | } |
726 | 727 | ||
728 | /* | ||
729 | * Change ->exit_code and ->state under siglock to avoid the race | ||
730 | * with wait_task_stopped() in between; a non-zero ->exit_code will | ||
731 | * wrongly look like another report from tracee. | ||
732 | * | ||
733 | * Note that we need siglock even if ->exit_code == data and/or this | ||
734 | * status was not reported yet, the new status must not be cleared by | ||
735 | * wait_task_stopped() after resume. | ||
736 | * | ||
737 | * If data == 0 we do not care if wait_task_stopped() reports the old | ||
738 | * status and clears the code too; this can't race with the tracee, it | ||
739 | * takes siglock after resume. | ||
740 | */ | ||
741 | need_siglock = data && !thread_group_empty(current); | ||
742 | if (need_siglock) | ||
743 | spin_lock_irq(&child->sighand->siglock); | ||
727 | child->exit_code = data; | 744 | child->exit_code = data; |
728 | wake_up_state(child, __TASK_TRACED); | 745 | wake_up_state(child, __TASK_TRACED); |
746 | if (need_siglock) | ||
747 | spin_unlock_irq(&child->sighand->siglock); | ||
729 | 748 | ||
730 | return 0; | 749 | return 0; |
731 | } | 750 | } |
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 30d42aa55d83..8dbe27611ec3 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c | |||
@@ -853,6 +853,8 @@ rcu_torture_fqs(void *arg) | |||
853 | static int | 853 | static int |
854 | rcu_torture_writer(void *arg) | 854 | rcu_torture_writer(void *arg) |
855 | { | 855 | { |
856 | bool can_expedite = !rcu_gp_is_expedited(); | ||
857 | int expediting = 0; | ||
856 | unsigned long gp_snap; | 858 | unsigned long gp_snap; |
857 | bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal; | 859 | bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal; |
858 | bool gp_sync1 = gp_sync; | 860 | bool gp_sync1 = gp_sync; |
@@ -865,9 +867,15 @@ rcu_torture_writer(void *arg) | |||
865 | int nsynctypes = 0; | 867 | int nsynctypes = 0; |
866 | 868 | ||
867 | VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); | 869 | VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); |
870 | pr_alert("%s" TORTURE_FLAG | ||
871 | " Grace periods expedited from boot/sysfs for %s,\n", | ||
872 | torture_type, cur_ops->name); | ||
873 | pr_alert("%s" TORTURE_FLAG | ||
874 | " Testing of dynamic grace-period expediting diabled.\n", | ||
875 | torture_type); | ||
868 | 876 | ||
869 | /* Initialize synctype[] array. If none set, take default. */ | 877 | /* Initialize synctype[] array. If none set, take default. */ |
870 | if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync) | 878 | if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync1) |
871 | gp_cond1 = gp_exp1 = gp_normal1 = gp_sync1 = true; | 879 | gp_cond1 = gp_exp1 = gp_normal1 = gp_sync1 = true; |
872 | if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync) | 880 | if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync) |
873 | synctype[nsynctypes++] = RTWS_COND_GET; | 881 | synctype[nsynctypes++] = RTWS_COND_GET; |
@@ -949,9 +957,26 @@ rcu_torture_writer(void *arg) | |||
949 | } | 957 | } |
950 | } | 958 | } |
951 | rcutorture_record_progress(++rcu_torture_current_version); | 959 | rcutorture_record_progress(++rcu_torture_current_version); |
960 | /* Cycle through nesting levels of rcu_expedite_gp() calls. */ | ||
961 | if (can_expedite && | ||
962 | !(torture_random(&rand) & 0xff & (!!expediting - 1))) { | ||
963 | WARN_ON_ONCE(expediting == 0 && rcu_gp_is_expedited()); | ||
964 | if (expediting >= 0) | ||
965 | rcu_expedite_gp(); | ||
966 | else | ||
967 | rcu_unexpedite_gp(); | ||
968 | if (++expediting > 3) | ||
969 | expediting = -expediting; | ||
970 | } | ||
952 | rcu_torture_writer_state = RTWS_STUTTER; | 971 | rcu_torture_writer_state = RTWS_STUTTER; |
953 | stutter_wait("rcu_torture_writer"); | 972 | stutter_wait("rcu_torture_writer"); |
954 | } while (!torture_must_stop()); | 973 | } while (!torture_must_stop()); |
974 | /* Reset expediting back to unexpedited. */ | ||
975 | if (expediting > 0) | ||
976 | expediting = -expediting; | ||
977 | while (can_expedite && expediting++ < 0) | ||
978 | rcu_unexpedite_gp(); | ||
979 | WARN_ON_ONCE(can_expedite && rcu_gp_is_expedited()); | ||
955 | rcu_torture_writer_state = RTWS_STOPPING; | 980 | rcu_torture_writer_state = RTWS_STOPPING; |
956 | torture_kthread_stopping("rcu_torture_writer"); | 981 | torture_kthread_stopping("rcu_torture_writer"); |
957 | return 0; | 982 | return 0; |
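
The writer now cycles rcu_expedite_gp()/rcu_unexpedite_gp() nesting, gated by !(torture_random(&rand) & 0xff & (!!expediting - 1)): while expediting is 0 the mask stays 0xff and the gate opens roughly once per 256 passes, but once expediting is nonzero (!!expediting - 1) is 0, the mask collapses, and the gate opens on every pass so the nesting level keeps stepping until it wraps back toward zero. A tiny standalone check of that masking trick; gate() and the counting loop are purely illustrative.

    #include <stdio.h>
    #include <stdlib.h>

    /* 1 when the gate opens: ~1/256 of calls while state == 0,
     * every call once state != 0 (the mask is ANDed down to zero). */
    static int gate(unsigned int rnd, int state)
    {
    	return !(rnd & 0xff & (!!state - 1));
    }

    int main(void)
    {
    	int hits_idle = 0, hits_busy = 0;

    	srand(1);
    	for (int i = 0; i < 100000; i++) {
    		unsigned int r = (unsigned int)rand();

    		hits_idle += gate(r, 0);
    		hits_busy += gate(r, 3);
    	}
    	printf("state==0: %d/100000, state!=0: %d/100000\n",
    	       hits_idle, hits_busy);
    	return 0;
    }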
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index 445bf8ffe3fb..cad76e76b4e7 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c | |||
@@ -402,23 +402,6 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *head, | |||
402 | } | 402 | } |
403 | EXPORT_SYMBOL_GPL(call_srcu); | 403 | EXPORT_SYMBOL_GPL(call_srcu); |
404 | 404 | ||
405 | struct rcu_synchronize { | ||
406 | struct rcu_head head; | ||
407 | struct completion completion; | ||
408 | }; | ||
409 | |||
410 | /* | ||
411 | * Awaken the corresponding synchronize_srcu() instance now that a | ||
412 | * grace period has elapsed. | ||
413 | */ | ||
414 | static void wakeme_after_rcu(struct rcu_head *head) | ||
415 | { | ||
416 | struct rcu_synchronize *rcu; | ||
417 | |||
418 | rcu = container_of(head, struct rcu_synchronize, head); | ||
419 | complete(&rcu->completion); | ||
420 | } | ||
421 | |||
422 | static void srcu_advance_batches(struct srcu_struct *sp, int trycount); | 405 | static void srcu_advance_batches(struct srcu_struct *sp, int trycount); |
423 | static void srcu_reschedule(struct srcu_struct *sp); | 406 | static void srcu_reschedule(struct srcu_struct *sp); |
424 | 407 | ||
@@ -507,7 +490,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) | |||
507 | */ | 490 | */ |
508 | void synchronize_srcu(struct srcu_struct *sp) | 491 | void synchronize_srcu(struct srcu_struct *sp) |
509 | { | 492 | { |
510 | __synchronize_srcu(sp, rcu_expedited | 493 | __synchronize_srcu(sp, rcu_gp_is_expedited() |
511 | ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT | 494 | ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT |
512 | : SYNCHRONIZE_SRCU_TRYCOUNT); | 495 | : SYNCHRONIZE_SRCU_TRYCOUNT); |
513 | } | 496 | } |
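The srcu.c hunk above deletes the file-local struct rcu_synchronize and wakeme_after_rcu() because a common copy now lives in shared RCU code; the underlying pattern is unchanged: queue a callback whose only job is to complete() a completion, then block on it until a grace period has elapsed. Below is a rough user-space rendering of that wait-for-callback idiom, with a pthread mutex/condvar standing in for struct completion and a helper thread standing in for the callback machinery; none of these names are kernel APIs.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

/* Stand-in for struct completion, built from a mutex and a condvar. */
struct completion {
        pthread_mutex_t lock;
        pthread_cond_t cond;
        bool done;
};

static void complete(struct completion *c)
{
        pthread_mutex_lock(&c->lock);
        c->done = true;
        pthread_cond_signal(&c->cond);
        pthread_mutex_unlock(&c->lock);
}

static void wait_for_completion(struct completion *c)
{
        pthread_mutex_lock(&c->lock);
        while (!c->done)
                pthread_cond_wait(&c->cond, &c->lock);
        pthread_mutex_unlock(&c->lock);
}

/* Helper thread standing in for the callback machinery: pretend a grace
 * period elapses, then wake the waiter -- which is all that
 * wakeme_after_rcu() ever did. */
static void *fake_grace_period(void *arg)
{
        struct completion *c = arg;

        usleep(10000);          /* pretend all readers drain here */
        complete(c);            /* the wakeme_after_rcu() equivalent */
        return NULL;
}

/* synchronize_foo(): queue the wake-up "callback", then block on it. */
static void synchronize_foo(void)
{
        struct completion c = { PTHREAD_MUTEX_INITIALIZER,
                                PTHREAD_COND_INITIALIZER, false };
        pthread_t tid;

        pthread_create(&tid, NULL, fake_grace_period, &c);
        wait_for_completion(&c);
        pthread_join(tid, NULL);
}

int main(void)
{
        synchronize_foo();
        printf("grace period complete\n");
        return 0;
}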
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index cc9ceca7bde1..069742d61c68 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c | |||
@@ -103,8 +103,7 @@ EXPORT_SYMBOL(__rcu_is_watching); | |||
103 | static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) | 103 | static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) |
104 | { | 104 | { |
105 | RCU_TRACE(reset_cpu_stall_ticks(rcp)); | 105 | RCU_TRACE(reset_cpu_stall_ticks(rcp)); |
106 | if (rcp->rcucblist != NULL && | 106 | if (rcp->donetail != rcp->curtail) { |
107 | rcp->donetail != rcp->curtail) { | ||
108 | rcp->donetail = rcp->curtail; | 107 | rcp->donetail = rcp->curtail; |
109 | return 1; | 108 | return 1; |
110 | } | 109 | } |
@@ -169,17 +168,6 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) | |||
169 | unsigned long flags; | 168 | unsigned long flags; |
170 | RCU_TRACE(int cb_count = 0); | 169 | RCU_TRACE(int cb_count = 0); |
171 | 170 | ||
172 | /* If no RCU callbacks ready to invoke, just return. */ | ||
173 | if (&rcp->rcucblist == rcp->donetail) { | ||
174 | RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, 0, -1)); | ||
175 | RCU_TRACE(trace_rcu_batch_end(rcp->name, 0, | ||
176 | !!ACCESS_ONCE(rcp->rcucblist), | ||
177 | need_resched(), | ||
178 | is_idle_task(current), | ||
179 | false)); | ||
180 | return; | ||
181 | } | ||
182 | |||
183 | /* Move the ready-to-invoke callbacks to a local list. */ | 171 | /* Move the ready-to-invoke callbacks to a local list. */ |
184 | local_irq_save(flags); | 172 | local_irq_save(flags); |
185 | RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1)); | 173 | RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1)); |
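The tiny.c hunk above works on the two-tail callback list that Tiny RCU keeps: new callbacks are appended at *curtail, a quiescent state promotes everything queued so far by advancing donetail to curtail, and callback processing splices off the segment ending at donetail. The removed early return merely skipped that splice when nothing had been promoted yet. The toy below walks the same pointer manipulation in user space and relies on calling order instead of an in-function early return; struct ctrlblk and the function names here are illustrative only, not the kernel's.

#include <stdio.h>

/* Toy two-tail callback list in the style of Tiny RCU's rcu_ctrlblk.
 * This is a user-space illustration, not the kernel data structure. */
struct cb {
        struct cb *next;
        void (*func)(struct cb *);
};

struct ctrlblk {
        struct cb *cblist;      /* Head of the callback list. */
        struct cb **donetail;   /* Tail of the ready-to-invoke segment. */
        struct cb **curtail;    /* Tail of the whole list. */
};

static struct ctrlblk blk = { NULL, &blk.cblist, &blk.cblist };

static void enqueue(struct cb *head, void (*func)(struct cb *))
{
        head->func = func;
        head->next = NULL;
        *blk.curtail = head;
        blk.curtail = &head->next;
}

/* Counterpart of the simplified rcu_qsctr_help(): promote callbacks. */
static int note_quiescent_state(void)
{
        if (blk.donetail != blk.curtail) {
                blk.donetail = blk.curtail;
                return 1;
        }
        return 0;
}

/* Counterpart of the splice in __rcu_process_callbacks(); the toy calls
 * it only after note_quiescent_state() has promoted something. */
static void process_callbacks(void)
{
        struct cb *list = blk.cblist;

        blk.cblist = *blk.donetail;
        *blk.donetail = NULL;
        if (blk.curtail == blk.donetail)
                blk.curtail = &blk.cblist;
        blk.donetail = &blk.cblist;

        while (list) {
                struct cb *next = list->next;

                list->func(list);
                list = next;
        }
}

static void say_done(struct cb *cbp)
{
        (void)cbp;
        printf("callback invoked\n");
}

int main(void)
{
        struct cb a, b;

        enqueue(&a, say_done);
        enqueue(&b, say_done);
        if (note_quiescent_state())     /* both callbacks become "done" */
                process_callbacks();    /* prints twice */
        return 0;
}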
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 48d640ca1a05..233165da782f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c | |||
@@ -91,8 +91,10 @@ static const char *tp_##sname##_varname __used __tracepoint_string = sname##_var | |||
91 | 91 | ||
92 | #define RCU_STATE_INITIALIZER(sname, sabbr, cr) \ | 92 | #define RCU_STATE_INITIALIZER(sname, sabbr, cr) \ |
93 | DEFINE_RCU_TPS(sname) \ | 93 | DEFINE_RCU_TPS(sname) \ |
94 | DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, sname##_data); \ | ||
94 | struct rcu_state sname##_state = { \ | 95 | struct rcu_state sname##_state = { \ |
95 | .level = { &sname##_state.node[0] }, \ | 96 | .level = { &sname##_state.node[0] }, \ |
97 | .rda = &sname##_data, \ | ||
96 | .call = cr, \ | 98 | .call = cr, \ |
97 | .fqs_state = RCU_GP_IDLE, \ | 99 | .fqs_state = RCU_GP_IDLE, \ |
98 | .gpnum = 0UL - 300UL, \ | 100 | .gpnum = 0UL - 300UL, \ |
@@ -101,11 +103,9 @@ struct rcu_state sname##_state = { \ | |||
101 | .orphan_nxttail = &sname##_state.orphan_nxtlist, \ | 103 | .orphan_nxttail = &sname##_state.orphan_nxtlist, \ |
102 | .orphan_donetail = &sname##_state.orphan_donelist, \ | 104 | .orphan_donetail = &sname##_state.orphan_donelist, \ |
103 | .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ | 105 | .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ |
104 | .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ | ||
105 | .name = RCU_STATE_NAME(sname), \ | 106 | .name = RCU_STATE_NAME(sname), \ |
106 | .abbr = sabbr, \ | 107 | .abbr = sabbr, \ |
107 | }; \ | 108 | } |
108 | DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, sname##_data) | ||
109 | 109 | ||
110 | RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); | 110 | RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); |
111 | RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); | 111 | RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); |
@@ -152,6 +152,8 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active); | |||
152 | */ | 152 | */ |
153 | static int rcu_scheduler_fully_active __read_mostly; | 153 | static int rcu_scheduler_fully_active __read_mostly; |
154 | 154 | ||
155 | static void rcu_init_new_rnp(struct rcu_node *rnp_leaf); | ||
156 | static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf); | ||
155 | static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); | 157 | static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); |
156 | static void invoke_rcu_core(void); | 158 | static void invoke_rcu_core(void); |
157 | static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); | 159 | static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); |
@@ -160,6 +162,12 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); | |||
160 | static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO; | 162 | static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO; |
161 | module_param(kthread_prio, int, 0644); | 163 | module_param(kthread_prio, int, 0644); |
162 | 164 | ||
165 | /* Delay in jiffies for grace-period initialization delays. */ | ||
166 | static int gp_init_delay = IS_ENABLED(CONFIG_RCU_TORTURE_TEST_SLOW_INIT) | ||
167 | ? CONFIG_RCU_TORTURE_TEST_SLOW_INIT_DELAY | ||
168 | : 0; | ||
169 | module_param(gp_init_delay, int, 0644); | ||
170 | |||
163 | /* | 171 | /* |
164 | * Track the rcutorture test sequence number and the update version | 172 | * Track the rcutorture test sequence number and the update version |
165 | * number within a given test. The rcutorture_testseq is incremented | 173 | * number within a given test. The rcutorture_testseq is incremented |
@@ -173,6 +181,17 @@ unsigned long rcutorture_testseq; | |||
173 | unsigned long rcutorture_vernum; | 181 | unsigned long rcutorture_vernum; |
174 | 182 | ||
175 | /* | 183 | /* |
184 | * Compute the mask of online CPUs for the specified rcu_node structure. | ||
185 | * This will not be stable unless the rcu_node structure's ->lock is | ||
186 | * held, but the bit corresponding to the current CPU will be stable | ||
187 | * in most contexts. | ||
188 | */ | ||
189 | unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp) | ||
190 | { | ||
191 | return ACCESS_ONCE(rnp->qsmaskinitnext); | ||
192 | } | ||
193 | |||
194 | /* | ||
176 | * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s | 195 | * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s |
177 | * permit this function to be invoked without holding the root rcu_node | 196 | * permit this function to be invoked without holding the root rcu_node |
178 | * structure's ->lock, but of course results can be subject to change. | 197 | * structure's ->lock, but of course results can be subject to change. |
@@ -292,10 +311,10 @@ void rcu_note_context_switch(void) | |||
292 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); | 311 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); |
293 | 312 | ||
294 | /* | 313 | /* |
295 | * Register a quiesecent state for all RCU flavors. If there is an | 314 | * Register a quiescent state for all RCU flavors. If there is an |
296 | * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight | 315 | * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight |
297 | * dyntick-idle quiescent state visible to other CPUs (but only for those | 316 | * dyntick-idle quiescent state visible to other CPUs (but only for those |
298 | * RCU flavors in desparate need of a quiescent state, which will normally | 317 | * RCU flavors in desperate need of a quiescent state, which will normally |
299 | * be none of them). Either way, do a lightweight quiescent state for | 318 | * be none of them). Either way, do a lightweight quiescent state for |
300 | * all RCU flavors. | 319 | * all RCU flavors. |
301 | */ | 320 | */ |
@@ -410,6 +429,15 @@ void rcu_bh_force_quiescent_state(void) | |||
410 | EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); | 429 | EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); |
411 | 430 | ||
412 | /* | 431 | /* |
432 | * Force a quiescent state for RCU-sched. | ||
433 | */ | ||
434 | void rcu_sched_force_quiescent_state(void) | ||
435 | { | ||
436 | force_quiescent_state(&rcu_sched_state); | ||
437 | } | ||
438 | EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state); | ||
439 | |||
440 | /* | ||
413 | * Show the state of the grace-period kthreads. | 441 | * Show the state of the grace-period kthreads. |
414 | */ | 442 | */ |
415 | void show_rcu_gp_kthreads(void) | 443 | void show_rcu_gp_kthreads(void) |
@@ -483,15 +511,6 @@ void rcutorture_record_progress(unsigned long vernum) | |||
483 | EXPORT_SYMBOL_GPL(rcutorture_record_progress); | 511 | EXPORT_SYMBOL_GPL(rcutorture_record_progress); |
484 | 512 | ||
485 | /* | 513 | /* |
486 | * Force a quiescent state for RCU-sched. | ||
487 | */ | ||
488 | void rcu_sched_force_quiescent_state(void) | ||
489 | { | ||
490 | force_quiescent_state(&rcu_sched_state); | ||
491 | } | ||
492 | EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state); | ||
493 | |||
494 | /* | ||
495 | * Does the CPU have callbacks ready to be invoked? | 514 | * Does the CPU have callbacks ready to be invoked? |
496 | */ | 515 | */ |
497 | static int | 516 | static int |
@@ -954,7 +973,7 @@ bool rcu_lockdep_current_cpu_online(void) | |||
954 | preempt_disable(); | 973 | preempt_disable(); |
955 | rdp = this_cpu_ptr(&rcu_sched_data); | 974 | rdp = this_cpu_ptr(&rcu_sched_data); |
956 | rnp = rdp->mynode; | 975 | rnp = rdp->mynode; |
957 | ret = (rdp->grpmask & rnp->qsmaskinit) || | 976 | ret = (rdp->grpmask & rcu_rnp_online_cpus(rnp)) || |
958 | !rcu_scheduler_fully_active; | 977 | !rcu_scheduler_fully_active; |
959 | preempt_enable(); | 978 | preempt_enable(); |
960 | return ret; | 979 | return ret; |
@@ -1196,9 +1215,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) | |||
1196 | } else { | 1215 | } else { |
1197 | j = jiffies; | 1216 | j = jiffies; |
1198 | gpa = ACCESS_ONCE(rsp->gp_activity); | 1217 | gpa = ACCESS_ONCE(rsp->gp_activity); |
1199 | pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld\n", | 1218 | pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n", |
1200 | rsp->name, j - gpa, j, gpa, | 1219 | rsp->name, j - gpa, j, gpa, |
1201 | jiffies_till_next_fqs); | 1220 | jiffies_till_next_fqs, |
1221 | rcu_get_root(rsp)->qsmask); | ||
1202 | /* In this case, the current CPU might be at fault. */ | 1222 | /* In this case, the current CPU might be at fault. */ |
1203 | sched_show_task(current); | 1223 | sched_show_task(current); |
1204 | } | 1224 | } |
@@ -1328,20 +1348,30 @@ void rcu_cpu_stall_reset(void) | |||
1328 | } | 1348 | } |
1329 | 1349 | ||
1330 | /* | 1350 | /* |
1331 | * Initialize the specified rcu_data structure's callback list to empty. | 1351 | * Initialize the specified rcu_data structure's default callback list |
1352 | * to empty. The default callback list is the one that is not used by | ||
1353 | * no-callbacks CPUs. | ||
1332 | */ | 1354 | */ |
1333 | static void init_callback_list(struct rcu_data *rdp) | 1355 | static void init_default_callback_list(struct rcu_data *rdp) |
1334 | { | 1356 | { |
1335 | int i; | 1357 | int i; |
1336 | 1358 | ||
1337 | if (init_nocb_callback_list(rdp)) | ||
1338 | return; | ||
1339 | rdp->nxtlist = NULL; | 1359 | rdp->nxtlist = NULL; |
1340 | for (i = 0; i < RCU_NEXT_SIZE; i++) | 1360 | for (i = 0; i < RCU_NEXT_SIZE; i++) |
1341 | rdp->nxttail[i] = &rdp->nxtlist; | 1361 | rdp->nxttail[i] = &rdp->nxtlist; |
1342 | } | 1362 | } |
1343 | 1363 | ||
1344 | /* | 1364 | /* |
1365 | * Initialize the specified rcu_data structure's callback list to empty. | ||
1366 | */ | ||
1367 | static void init_callback_list(struct rcu_data *rdp) | ||
1368 | { | ||
1369 | if (init_nocb_callback_list(rdp)) | ||
1370 | return; | ||
1371 | init_default_callback_list(rdp); | ||
1372 | } | ||
1373 | |||
1374 | /* | ||
1345 | * Determine the value that ->completed will have at the end of the | 1375 | * Determine the value that ->completed will have at the end of the |
1346 | * next subsequent grace period. This is used to tag callbacks so that | 1376 | * next subsequent grace period. This is used to tag callbacks so that |
1347 | * a CPU can invoke callbacks in a timely fashion even if that CPU has | 1377 | * a CPU can invoke callbacks in a timely fashion even if that CPU has |
@@ -1703,11 +1733,11 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1703 | */ | 1733 | */ |
1704 | static int rcu_gp_init(struct rcu_state *rsp) | 1734 | static int rcu_gp_init(struct rcu_state *rsp) |
1705 | { | 1735 | { |
1736 | unsigned long oldmask; | ||
1706 | struct rcu_data *rdp; | 1737 | struct rcu_data *rdp; |
1707 | struct rcu_node *rnp = rcu_get_root(rsp); | 1738 | struct rcu_node *rnp = rcu_get_root(rsp); |
1708 | 1739 | ||
1709 | ACCESS_ONCE(rsp->gp_activity) = jiffies; | 1740 | ACCESS_ONCE(rsp->gp_activity) = jiffies; |
1710 | rcu_bind_gp_kthread(); | ||
1711 | raw_spin_lock_irq(&rnp->lock); | 1741 | raw_spin_lock_irq(&rnp->lock); |
1712 | smp_mb__after_unlock_lock(); | 1742 | smp_mb__after_unlock_lock(); |
1713 | if (!ACCESS_ONCE(rsp->gp_flags)) { | 1743 | if (!ACCESS_ONCE(rsp->gp_flags)) { |
@@ -1733,9 +1763,54 @@ static int rcu_gp_init(struct rcu_state *rsp) | |||
1733 | trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); | 1763 | trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); |
1734 | raw_spin_unlock_irq(&rnp->lock); | 1764 | raw_spin_unlock_irq(&rnp->lock); |
1735 | 1765 | ||
1736 | /* Exclude any concurrent CPU-hotplug operations. */ | 1766 | /* |
1737 | mutex_lock(&rsp->onoff_mutex); | 1767 | * Apply per-leaf buffered online and offline operations to the |
1738 | smp_mb__after_unlock_lock(); /* ->gpnum increment before GP! */ | 1768 | * rcu_node tree. Note that this new grace period need not wait |
1769 | * for subsequent online CPUs, and that quiescent-state forcing | ||
1770 | * will handle subsequent offline CPUs. | ||
1771 | */ | ||
1772 | rcu_for_each_leaf_node(rsp, rnp) { | ||
1773 | raw_spin_lock_irq(&rnp->lock); | ||
1774 | smp_mb__after_unlock_lock(); | ||
1775 | if (rnp->qsmaskinit == rnp->qsmaskinitnext && | ||
1776 | !rnp->wait_blkd_tasks) { | ||
1777 | /* Nothing to do on this leaf rcu_node structure. */ | ||
1778 | raw_spin_unlock_irq(&rnp->lock); | ||
1779 | continue; | ||
1780 | } | ||
1781 | |||
1782 | /* Record old state, apply changes to ->qsmaskinit field. */ | ||
1783 | oldmask = rnp->qsmaskinit; | ||
1784 | rnp->qsmaskinit = rnp->qsmaskinitnext; | ||
1785 | |||
1786 | /* If zero-ness of ->qsmaskinit changed, propagate up tree. */ | ||
1787 | if (!oldmask != !rnp->qsmaskinit) { | ||
1788 | if (!oldmask) /* First online CPU for this rcu_node. */ | ||
1789 | rcu_init_new_rnp(rnp); | ||
1790 | else if (rcu_preempt_has_tasks(rnp)) /* blocked tasks */ | ||
1791 | rnp->wait_blkd_tasks = true; | ||
1792 | else /* Last offline CPU and can propagate. */ | ||
1793 | rcu_cleanup_dead_rnp(rnp); | ||
1794 | } | ||
1795 | |||
1796 | /* | ||
1797 | * If all waited-on tasks from prior grace period are | ||
1798 | * done, and if all this rcu_node structure's CPUs are | ||
1799 | * still offline, propagate up the rcu_node tree and | ||
1800 | * clear ->wait_blkd_tasks. Otherwise, if one of this | ||
1801 | * rcu_node structure's CPUs has since come back online, | ||
1802 | * simply clear ->wait_blkd_tasks (but rcu_cleanup_dead_rnp() | ||
1803 | * checks for this, so just call it unconditionally). | ||
1804 | */ | ||
1805 | if (rnp->wait_blkd_tasks && | ||
1806 | (!rcu_preempt_has_tasks(rnp) || | ||
1807 | rnp->qsmaskinit)) { | ||
1808 | rnp->wait_blkd_tasks = false; | ||
1809 | rcu_cleanup_dead_rnp(rnp); | ||
1810 | } | ||
1811 | |||
1812 | raw_spin_unlock_irq(&rnp->lock); | ||
1813 | } | ||
1739 | 1814 | ||
1740 | /* | 1815 | /* |
1741 | * Set the quiescent-state-needed bits in all the rcu_node | 1816 | * Set the quiescent-state-needed bits in all the rcu_node |
@@ -1757,8 +1832,8 @@ static int rcu_gp_init(struct rcu_state *rsp) | |||
1757 | rcu_preempt_check_blocked_tasks(rnp); | 1832 | rcu_preempt_check_blocked_tasks(rnp); |
1758 | rnp->qsmask = rnp->qsmaskinit; | 1833 | rnp->qsmask = rnp->qsmaskinit; |
1759 | ACCESS_ONCE(rnp->gpnum) = rsp->gpnum; | 1834 | ACCESS_ONCE(rnp->gpnum) = rsp->gpnum; |
1760 | WARN_ON_ONCE(rnp->completed != rsp->completed); | 1835 | if (WARN_ON_ONCE(rnp->completed != rsp->completed)) |
1761 | ACCESS_ONCE(rnp->completed) = rsp->completed; | 1836 | ACCESS_ONCE(rnp->completed) = rsp->completed; |
1762 | if (rnp == rdp->mynode) | 1837 | if (rnp == rdp->mynode) |
1763 | (void)__note_gp_changes(rsp, rnp, rdp); | 1838 | (void)__note_gp_changes(rsp, rnp, rdp); |
1764 | rcu_preempt_boost_start_gp(rnp); | 1839 | rcu_preempt_boost_start_gp(rnp); |
@@ -1768,9 +1843,12 @@ static int rcu_gp_init(struct rcu_state *rsp) | |||
1768 | raw_spin_unlock_irq(&rnp->lock); | 1843 | raw_spin_unlock_irq(&rnp->lock); |
1769 | cond_resched_rcu_qs(); | 1844 | cond_resched_rcu_qs(); |
1770 | ACCESS_ONCE(rsp->gp_activity) = jiffies; | 1845 | ACCESS_ONCE(rsp->gp_activity) = jiffies; |
1846 | if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_SLOW_INIT) && | ||
1847 | gp_init_delay > 0 && | ||
1848 | !(rsp->gpnum % (rcu_num_nodes * 10))) | ||
1849 | schedule_timeout_uninterruptible(gp_init_delay); | ||
1771 | } | 1850 | } |
1772 | 1851 | ||
1773 | mutex_unlock(&rsp->onoff_mutex); | ||
1774 | return 1; | 1852 | return 1; |
1775 | } | 1853 | } |
1776 | 1854 | ||
@@ -1798,7 +1876,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) | |||
1798 | fqs_state = RCU_FORCE_QS; | 1876 | fqs_state = RCU_FORCE_QS; |
1799 | } else { | 1877 | } else { |
1800 | /* Handle dyntick-idle and offline CPUs. */ | 1878 | /* Handle dyntick-idle and offline CPUs. */ |
1801 | isidle = false; | 1879 | isidle = true; |
1802 | force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj); | 1880 | force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj); |
1803 | } | 1881 | } |
1804 | /* Clear flag to prevent immediate re-entry. */ | 1882 | /* Clear flag to prevent immediate re-entry. */ |
@@ -1852,6 +1930,8 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) | |||
1852 | rcu_for_each_node_breadth_first(rsp, rnp) { | 1930 | rcu_for_each_node_breadth_first(rsp, rnp) { |
1853 | raw_spin_lock_irq(&rnp->lock); | 1931 | raw_spin_lock_irq(&rnp->lock); |
1854 | smp_mb__after_unlock_lock(); | 1932 | smp_mb__after_unlock_lock(); |
1933 | WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); | ||
1934 | WARN_ON_ONCE(rnp->qsmask); | ||
1855 | ACCESS_ONCE(rnp->completed) = rsp->gpnum; | 1935 | ACCESS_ONCE(rnp->completed) = rsp->gpnum; |
1856 | rdp = this_cpu_ptr(rsp->rda); | 1936 | rdp = this_cpu_ptr(rsp->rda); |
1857 | if (rnp == rdp->mynode) | 1937 | if (rnp == rdp->mynode) |
@@ -1895,6 +1975,7 @@ static int __noreturn rcu_gp_kthread(void *arg) | |||
1895 | struct rcu_state *rsp = arg; | 1975 | struct rcu_state *rsp = arg; |
1896 | struct rcu_node *rnp = rcu_get_root(rsp); | 1976 | struct rcu_node *rnp = rcu_get_root(rsp); |
1897 | 1977 | ||
1978 | rcu_bind_gp_kthread(); | ||
1898 | for (;;) { | 1979 | for (;;) { |
1899 | 1980 | ||
1900 | /* Handle grace-period start. */ | 1981 | /* Handle grace-period start. */ |
@@ -2062,25 +2143,32 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) | |||
2062 | * Similar to rcu_report_qs_rdp(), for which it is a helper function. | 2143 | * Similar to rcu_report_qs_rdp(), for which it is a helper function. |
2063 | * Allows quiescent states for a group of CPUs to be reported at one go | 2144 | * Allows quiescent states for a group of CPUs to be reported at one go |
2064 | * to the specified rcu_node structure, though all the CPUs in the group | 2145 | * to the specified rcu_node structure, though all the CPUs in the group |
2065 | * must be represented by the same rcu_node structure (which need not be | 2146 | * must be represented by the same rcu_node structure (which need not be a |
2066 | * a leaf rcu_node structure, though it often will be). That structure's | 2147 | * leaf rcu_node structure, though it often will be). The gps parameter |
2067 | * lock must be held upon entry, and it is released before return. | 2148 | * is the grace-period snapshot, which means that the quiescent states |
2149 | * are valid only if rnp->gpnum is equal to gps. That structure's lock | ||
2150 | * must be held upon entry, and it is released before return. | ||
2068 | */ | 2151 | */ |
2069 | static void | 2152 | static void |
2070 | rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, | 2153 | rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, |
2071 | struct rcu_node *rnp, unsigned long flags) | 2154 | struct rcu_node *rnp, unsigned long gps, unsigned long flags) |
2072 | __releases(rnp->lock) | 2155 | __releases(rnp->lock) |
2073 | { | 2156 | { |
2157 | unsigned long oldmask = 0; | ||
2074 | struct rcu_node *rnp_c; | 2158 | struct rcu_node *rnp_c; |
2075 | 2159 | ||
2076 | /* Walk up the rcu_node hierarchy. */ | 2160 | /* Walk up the rcu_node hierarchy. */ |
2077 | for (;;) { | 2161 | for (;;) { |
2078 | if (!(rnp->qsmask & mask)) { | 2162 | if (!(rnp->qsmask & mask) || rnp->gpnum != gps) { |
2079 | 2163 | ||
2080 | /* Our bit has already been cleared, so done. */ | 2164 | /* |
2165 | * Our bit has already been cleared, or the | ||
2166 | * relevant grace period is already over, so done. | ||
2167 | */ | ||
2081 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 2168 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
2082 | return; | 2169 | return; |
2083 | } | 2170 | } |
2171 | WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */ | ||
2084 | rnp->qsmask &= ~mask; | 2172 | rnp->qsmask &= ~mask; |
2085 | trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum, | 2173 | trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum, |
2086 | mask, rnp->qsmask, rnp->level, | 2174 | mask, rnp->qsmask, rnp->level, |
@@ -2104,7 +2192,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, | |||
2104 | rnp = rnp->parent; | 2192 | rnp = rnp->parent; |
2105 | raw_spin_lock_irqsave(&rnp->lock, flags); | 2193 | raw_spin_lock_irqsave(&rnp->lock, flags); |
2106 | smp_mb__after_unlock_lock(); | 2194 | smp_mb__after_unlock_lock(); |
2107 | WARN_ON_ONCE(rnp_c->qsmask); | 2195 | oldmask = rnp_c->qsmask; |
2108 | } | 2196 | } |
2109 | 2197 | ||
2110 | /* | 2198 | /* |
@@ -2116,6 +2204,46 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, | |||
2116 | } | 2204 | } |
2117 | 2205 | ||
2118 | /* | 2206 | /* |
2207 | * Record a quiescent state for all tasks that were previously queued | ||
2208 | * on the specified rcu_node structure and that were blocking the current | ||
2209 | * RCU grace period. The caller must hold the specified rnp->lock with | ||
2210 | * irqs disabled, and this lock is released upon return, but irqs remain | ||
2211 | * disabled. | ||
2212 | */ | ||
2213 | static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, | ||
2214 | struct rcu_node *rnp, unsigned long flags) | ||
2215 | __releases(rnp->lock) | ||
2216 | { | ||
2217 | unsigned long gps; | ||
2218 | unsigned long mask; | ||
2219 | struct rcu_node *rnp_p; | ||
2220 | |||
2221 | if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p || | ||
2222 | rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { | ||
2223 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
2224 | return; /* Still need more quiescent states! */ | ||
2225 | } | ||
2226 | |||
2227 | rnp_p = rnp->parent; | ||
2228 | if (rnp_p == NULL) { | ||
2229 | /* | ||
2230 | * Only one rcu_node structure in the tree, so don't | ||
2231 | * try to report up to its nonexistent parent! | ||
2232 | */ | ||
2233 | rcu_report_qs_rsp(rsp, flags); | ||
2234 | return; | ||
2235 | } | ||
2236 | |||
2237 | /* Report up the rest of the hierarchy, tracking current ->gpnum. */ | ||
2238 | gps = rnp->gpnum; | ||
2239 | mask = rnp->grpmask; | ||
2240 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | ||
2241 | raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */ | ||
2242 | smp_mb__after_unlock_lock(); | ||
2243 | rcu_report_qs_rnp(mask, rsp, rnp_p, gps, flags); | ||
2244 | } | ||
2245 | |||
2246 | /* | ||
2119 | * Record a quiescent state for the specified CPU to that CPU's rcu_data | 2247 | * Record a quiescent state for the specified CPU to that CPU's rcu_data |
2120 | * structure. This must be either called from the specified CPU, or | 2248 | * structure. This must be either called from the specified CPU, or |
2121 | * called when the specified CPU is known to be offline (and when it is | 2249 | * called when the specified CPU is known to be offline (and when it is |
@@ -2163,7 +2291,8 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) | |||
2163 | */ | 2291 | */ |
2164 | needwake = rcu_accelerate_cbs(rsp, rnp, rdp); | 2292 | needwake = rcu_accelerate_cbs(rsp, rnp, rdp); |
2165 | 2293 | ||
2166 | rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */ | 2294 | rcu_report_qs_rnp(mask, rsp, rnp, rnp->gpnum, flags); |
2295 | /* ^^^ Released rnp->lock */ | ||
2167 | if (needwake) | 2296 | if (needwake) |
2168 | rcu_gp_kthread_wake(rsp); | 2297 | rcu_gp_kthread_wake(rsp); |
2169 | } | 2298 | } |
@@ -2256,8 +2385,12 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, | |||
2256 | rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL]; | 2385 | rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL]; |
2257 | } | 2386 | } |
2258 | 2387 | ||
2259 | /* Finally, initialize the rcu_data structure's list to empty. */ | 2388 | /* |
2389 | * Finally, initialize the rcu_data structure's list to empty and | ||
2390 | * disallow further callbacks on this CPU. | ||
2391 | */ | ||
2260 | init_callback_list(rdp); | 2392 | init_callback_list(rdp); |
2393 | rdp->nxttail[RCU_NEXT_TAIL] = NULL; | ||
2261 | } | 2394 | } |
2262 | 2395 | ||
2263 | /* | 2396 | /* |
@@ -2355,6 +2488,7 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) | |||
2355 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ | 2488 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ |
2356 | smp_mb__after_unlock_lock(); /* GP memory ordering. */ | 2489 | smp_mb__after_unlock_lock(); /* GP memory ordering. */ |
2357 | rnp->qsmaskinit &= ~mask; | 2490 | rnp->qsmaskinit &= ~mask; |
2491 | rnp->qsmask &= ~mask; | ||
2358 | if (rnp->qsmaskinit) { | 2492 | if (rnp->qsmaskinit) { |
2359 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 2493 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
2360 | return; | 2494 | return; |
@@ -2364,6 +2498,26 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) | |||
2364 | } | 2498 | } |
2365 | 2499 | ||
2366 | /* | 2500 | /* |
2501 | * The CPU is exiting the idle loop into the arch_cpu_idle_dead() | ||
2502 | * function. We now remove it from the rcu_node tree's ->qsmaskinit | ||
2503 | * bit masks. | ||
2504 | */ | ||
2505 | static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) | ||
2506 | { | ||
2507 | unsigned long flags; | ||
2508 | unsigned long mask; | ||
2509 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); | ||
2510 | struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ | ||
2511 | |||
2512 | /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ | ||
2513 | mask = rdp->grpmask; | ||
2514 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
2515 | smp_mb__after_unlock_lock(); /* Enforce GP memory-order guarantee. */ | ||
2516 | rnp->qsmaskinitnext &= ~mask; | ||
2517 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
2518 | } | ||
2519 | |||
2520 | /* | ||
2367 | * The CPU has been completely removed, and some other CPU is reporting | 2521 | * The CPU has been completely removed, and some other CPU is reporting |
2368 | * this fact from process context. Do the remainder of the cleanup, | 2522 | * this fact from process context. Do the remainder of the cleanup, |
2369 | * including orphaning the outgoing CPU's RCU callbacks, and also | 2523 | * including orphaning the outgoing CPU's RCU callbacks, and also |
@@ -2379,29 +2533,15 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) | |||
2379 | /* Adjust any no-longer-needed kthreads. */ | 2533 | /* Adjust any no-longer-needed kthreads. */ |
2380 | rcu_boost_kthread_setaffinity(rnp, -1); | 2534 | rcu_boost_kthread_setaffinity(rnp, -1); |
2381 | 2535 | ||
2382 | /* Exclude any attempts to start a new grace period. */ | ||
2383 | mutex_lock(&rsp->onoff_mutex); | ||
2384 | raw_spin_lock_irqsave(&rsp->orphan_lock, flags); | ||
2385 | |||
2386 | /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ | 2536 | /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ |
2537 | raw_spin_lock_irqsave(&rsp->orphan_lock, flags); | ||
2387 | rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); | 2538 | rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); |
2388 | rcu_adopt_orphan_cbs(rsp, flags); | 2539 | rcu_adopt_orphan_cbs(rsp, flags); |
2389 | raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags); | 2540 | raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags); |
2390 | 2541 | ||
2391 | /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ | ||
2392 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
2393 | smp_mb__after_unlock_lock(); /* Enforce GP memory-order guarantee. */ | ||
2394 | rnp->qsmaskinit &= ~rdp->grpmask; | ||
2395 | if (rnp->qsmaskinit == 0 && !rcu_preempt_has_tasks(rnp)) | ||
2396 | rcu_cleanup_dead_rnp(rnp); | ||
2397 | rcu_report_qs_rnp(rdp->grpmask, rsp, rnp, flags); /* Rlses rnp->lock. */ | ||
2398 | WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, | 2542 | WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, |
2399 | "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", | 2543 | "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", |
2400 | cpu, rdp->qlen, rdp->nxtlist); | 2544 | cpu, rdp->qlen, rdp->nxtlist); |
2401 | init_callback_list(rdp); | ||
2402 | /* Disallow further callbacks on this CPU. */ | ||
2403 | rdp->nxttail[RCU_NEXT_TAIL] = NULL; | ||
2404 | mutex_unlock(&rsp->onoff_mutex); | ||
2405 | } | 2545 | } |
2406 | 2546 | ||
2407 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ | 2547 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ |
@@ -2414,6 +2554,10 @@ static void __maybe_unused rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) | |||
2414 | { | 2554 | { |
2415 | } | 2555 | } |
2416 | 2556 | ||
2557 | static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) | ||
2558 | { | ||
2559 | } | ||
2560 | |||
2417 | static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) | 2561 | static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) |
2418 | { | 2562 | { |
2419 | } | 2563 | } |
@@ -2589,26 +2733,47 @@ static void force_qs_rnp(struct rcu_state *rsp, | |||
2589 | return; | 2733 | return; |
2590 | } | 2734 | } |
2591 | if (rnp->qsmask == 0) { | 2735 | if (rnp->qsmask == 0) { |
2592 | rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ | 2736 | if (rcu_state_p == &rcu_sched_state || |
2593 | continue; | 2737 | rsp != rcu_state_p || |
2738 | rcu_preempt_blocked_readers_cgp(rnp)) { | ||
2739 | /* | ||
2740 | * No point in scanning bits because they | ||
2741 | * are all zero. But we might need to | ||
2742 | * priority-boost blocked readers. | ||
2743 | */ | ||
2744 | rcu_initiate_boost(rnp, flags); | ||
2745 | /* rcu_initiate_boost() releases rnp->lock */ | ||
2746 | continue; | ||
2747 | } | ||
2748 | if (rnp->parent && | ||
2749 | (rnp->parent->qsmask & rnp->grpmask)) { | ||
2750 | /* | ||
2751 | * Race between grace-period | ||
2752 | * initialization and task exiting RCU | ||
2753 | * read-side critical section: Report. | ||
2754 | */ | ||
2755 | rcu_report_unblock_qs_rnp(rsp, rnp, flags); | ||
2756 | /* rcu_report_unblock_qs_rnp() rlses ->lock */ | ||
2757 | continue; | ||
2758 | } | ||
2594 | } | 2759 | } |
2595 | cpu = rnp->grplo; | 2760 | cpu = rnp->grplo; |
2596 | bit = 1; | 2761 | bit = 1; |
2597 | for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { | 2762 | for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { |
2598 | if ((rnp->qsmask & bit) != 0) { | 2763 | if ((rnp->qsmask & bit) != 0) { |
2599 | if ((rnp->qsmaskinit & bit) != 0) | 2764 | if ((rnp->qsmaskinit & bit) == 0) |
2600 | *isidle = false; | 2765 | *isidle = false; /* Pending hotplug. */ |
2601 | if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) | 2766 | if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) |
2602 | mask |= bit; | 2767 | mask |= bit; |
2603 | } | 2768 | } |
2604 | } | 2769 | } |
2605 | if (mask != 0) { | 2770 | if (mask != 0) { |
2606 | 2771 | /* Idle/offline CPUs, report (releases rnp->lock). */ | |
2607 | /* rcu_report_qs_rnp() releases rnp->lock. */ | 2772 | rcu_report_qs_rnp(mask, rsp, rnp, rnp->gpnum, flags); |
2608 | rcu_report_qs_rnp(mask, rsp, rnp, flags); | 2773 | } else { |
2609 | continue; | 2774 | /* Nothing to do here, so just drop the lock. */ |
2775 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
2610 | } | 2776 | } |
2611 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
2612 | } | 2777 | } |
2613 | } | 2778 | } |
2614 | 2779 | ||
@@ -2741,7 +2906,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, | |||
2741 | * If called from an extended quiescent state, invoke the RCU | 2906 | * If called from an extended quiescent state, invoke the RCU |
2742 | * core in order to force a re-evaluation of RCU's idleness. | 2907 | * core in order to force a re-evaluation of RCU's idleness. |
2743 | */ | 2908 | */ |
2744 | if (!rcu_is_watching() && cpu_online(smp_processor_id())) | 2909 | if (!rcu_is_watching()) |
2745 | invoke_rcu_core(); | 2910 | invoke_rcu_core(); |
2746 | 2911 | ||
2747 | /* If interrupts were disabled or CPU offline, don't invoke RCU core. */ | 2912 | /* If interrupts were disabled or CPU offline, don't invoke RCU core. */ |
@@ -2827,11 +2992,22 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | |||
2827 | 2992 | ||
2828 | if (cpu != -1) | 2993 | if (cpu != -1) |
2829 | rdp = per_cpu_ptr(rsp->rda, cpu); | 2994 | rdp = per_cpu_ptr(rsp->rda, cpu); |
2830 | offline = !__call_rcu_nocb(rdp, head, lazy, flags); | 2995 | if (likely(rdp->mynode)) { |
2831 | WARN_ON_ONCE(offline); | 2996 | /* Post-boot, so this should be for a no-CBs CPU. */ |
2832 | /* _call_rcu() is illegal on offline CPU; leak the callback. */ | 2997 | offline = !__call_rcu_nocb(rdp, head, lazy, flags); |
2833 | local_irq_restore(flags); | 2998 | WARN_ON_ONCE(offline); |
2834 | return; | 2999 | /* Offline CPU, _call_rcu() illegal, leak callback. */ |
3000 | local_irq_restore(flags); | ||
3001 | return; | ||
3002 | } | ||
3003 | /* | ||
3004 | * Very early boot, before rcu_init(). Initialize if needed | ||
3005 | * and then drop through to queue the callback. | ||
3006 | */ | ||
3007 | BUG_ON(cpu != -1); | ||
3008 | WARN_ON_ONCE(!rcu_is_watching()); | ||
3009 | if (!likely(rdp->nxtlist)) | ||
3010 | init_default_callback_list(rdp); | ||
2835 | } | 3011 | } |
2836 | ACCESS_ONCE(rdp->qlen) = rdp->qlen + 1; | 3012 | ACCESS_ONCE(rdp->qlen) = rdp->qlen + 1; |
2837 | if (lazy) | 3013 | if (lazy) |
@@ -2954,7 +3130,7 @@ void synchronize_sched(void) | |||
2954 | "Illegal synchronize_sched() in RCU-sched read-side critical section"); | 3130 | "Illegal synchronize_sched() in RCU-sched read-side critical section"); |
2955 | if (rcu_blocking_is_gp()) | 3131 | if (rcu_blocking_is_gp()) |
2956 | return; | 3132 | return; |
2957 | if (rcu_expedited) | 3133 | if (rcu_gp_is_expedited()) |
2958 | synchronize_sched_expedited(); | 3134 | synchronize_sched_expedited(); |
2959 | else | 3135 | else |
2960 | wait_rcu_gp(call_rcu_sched); | 3136 | wait_rcu_gp(call_rcu_sched); |
@@ -2981,7 +3157,7 @@ void synchronize_rcu_bh(void) | |||
2981 | "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section"); | 3157 | "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section"); |
2982 | if (rcu_blocking_is_gp()) | 3158 | if (rcu_blocking_is_gp()) |
2983 | return; | 3159 | return; |
2984 | if (rcu_expedited) | 3160 | if (rcu_gp_is_expedited()) |
2985 | synchronize_rcu_bh_expedited(); | 3161 | synchronize_rcu_bh_expedited(); |
2986 | else | 3162 | else |
2987 | wait_rcu_gp(call_rcu_bh); | 3163 | wait_rcu_gp(call_rcu_bh); |
@@ -3518,6 +3694,28 @@ void rcu_barrier_sched(void) | |||
3518 | EXPORT_SYMBOL_GPL(rcu_barrier_sched); | 3694 | EXPORT_SYMBOL_GPL(rcu_barrier_sched); |
3519 | 3695 | ||
3520 | /* | 3696 | /* |
3697 | * Propagate ->qsmaskinit bits up the rcu_node tree to account for the | ||
3698 | * first CPU in a given leaf rcu_node structure coming online. The caller | ||
3699 | * must hold the corresponding leaf rcu_node ->lock with interrupts | ||
3700 | * disabled. | ||
3701 | */ | ||
3702 | static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) | ||
3703 | { | ||
3704 | long mask; | ||
3705 | struct rcu_node *rnp = rnp_leaf; | ||
3706 | |||
3707 | for (;;) { | ||
3708 | mask = rnp->grpmask; | ||
3709 | rnp = rnp->parent; | ||
3710 | if (rnp == NULL) | ||
3711 | return; | ||
3712 | raw_spin_lock(&rnp->lock); /* Interrupts already disabled. */ | ||
3713 | rnp->qsmaskinit |= mask; | ||
3714 | raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */ | ||
3715 | } | ||
3716 | } | ||
3717 | |||
3718 | /* | ||
3521 | * Do boot-time initialization of a CPU's per-CPU RCU data. | 3719 | * Do boot-time initialization of a CPU's per-CPU RCU data. |
3522 | */ | 3720 | */ |
3523 | static void __init | 3721 | static void __init |
@@ -3553,49 +3751,37 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) | |||
3553 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); | 3751 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); |
3554 | struct rcu_node *rnp = rcu_get_root(rsp); | 3752 | struct rcu_node *rnp = rcu_get_root(rsp); |
3555 | 3753 | ||
3556 | /* Exclude new grace periods. */ | ||
3557 | mutex_lock(&rsp->onoff_mutex); | ||
3558 | |||
3559 | /* Set up local state, ensuring consistent view of global state. */ | 3754 | /* Set up local state, ensuring consistent view of global state. */ |
3560 | raw_spin_lock_irqsave(&rnp->lock, flags); | 3755 | raw_spin_lock_irqsave(&rnp->lock, flags); |
3561 | rdp->beenonline = 1; /* We have now been online. */ | 3756 | rdp->beenonline = 1; /* We have now been online. */ |
3562 | rdp->qlen_last_fqs_check = 0; | 3757 | rdp->qlen_last_fqs_check = 0; |
3563 | rdp->n_force_qs_snap = rsp->n_force_qs; | 3758 | rdp->n_force_qs_snap = rsp->n_force_qs; |
3564 | rdp->blimit = blimit; | 3759 | rdp->blimit = blimit; |
3565 | init_callback_list(rdp); /* Re-enable callbacks on this CPU. */ | 3760 | if (!rdp->nxtlist) |
3761 | init_callback_list(rdp); /* Re-enable callbacks on this CPU. */ | ||
3566 | rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; | 3762 | rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; |
3567 | rcu_sysidle_init_percpu_data(rdp->dynticks); | 3763 | rcu_sysidle_init_percpu_data(rdp->dynticks); |
3568 | atomic_set(&rdp->dynticks->dynticks, | 3764 | atomic_set(&rdp->dynticks->dynticks, |
3569 | (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); | 3765 | (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); |
3570 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 3766 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
3571 | 3767 | ||
3572 | /* Add CPU to rcu_node bitmasks. */ | 3768 | /* |
3769 | * Add CPU to leaf rcu_node pending-online bitmask. Any needed | ||
3770 | * propagation up the rcu_node tree will happen at the beginning | ||
3771 | * of the next grace period. | ||
3772 | */ | ||
3573 | rnp = rdp->mynode; | 3773 | rnp = rdp->mynode; |
3574 | mask = rdp->grpmask; | 3774 | mask = rdp->grpmask; |
3575 | do { | 3775 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ |
3576 | /* Exclude any attempts to start a new GP on small systems. */ | 3776 | smp_mb__after_unlock_lock(); |
3577 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ | 3777 | rnp->qsmaskinitnext |= mask; |
3578 | rnp->qsmaskinit |= mask; | 3778 | rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */ |
3579 | mask = rnp->grpmask; | 3779 | rdp->completed = rnp->completed; |
3580 | if (rnp == rdp->mynode) { | 3780 | rdp->passed_quiesce = false; |
3581 | /* | 3781 | rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); |
3582 | * If there is a grace period in progress, we will | 3782 | rdp->qs_pending = false; |
3583 | * set up to wait for it next time we run the | 3783 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); |
3584 | * RCU core code. | 3784 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
3585 | */ | ||
3586 | rdp->gpnum = rnp->completed; | ||
3587 | rdp->completed = rnp->completed; | ||
3588 | rdp->passed_quiesce = 0; | ||
3589 | rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); | ||
3590 | rdp->qs_pending = 0; | ||
3591 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); | ||
3592 | } | ||
3593 | raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ | ||
3594 | rnp = rnp->parent; | ||
3595 | } while (rnp != NULL && !(rnp->qsmaskinit & mask)); | ||
3596 | local_irq_restore(flags); | ||
3597 | |||
3598 | mutex_unlock(&rsp->onoff_mutex); | ||
3599 | } | 3785 | } |
3600 | 3786 | ||
3601 | static void rcu_prepare_cpu(int cpu) | 3787 | static void rcu_prepare_cpu(int cpu) |
@@ -3609,15 +3795,14 @@ static void rcu_prepare_cpu(int cpu) | |||
3609 | /* | 3795 | /* |
3610 | * Handle CPU online/offline notification events. | 3796 | * Handle CPU online/offline notification events. |
3611 | */ | 3797 | */ |
3612 | static int rcu_cpu_notify(struct notifier_block *self, | 3798 | int rcu_cpu_notify(struct notifier_block *self, |
3613 | unsigned long action, void *hcpu) | 3799 | unsigned long action, void *hcpu) |
3614 | { | 3800 | { |
3615 | long cpu = (long)hcpu; | 3801 | long cpu = (long)hcpu; |
3616 | struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu); | 3802 | struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu); |
3617 | struct rcu_node *rnp = rdp->mynode; | 3803 | struct rcu_node *rnp = rdp->mynode; |
3618 | struct rcu_state *rsp; | 3804 | struct rcu_state *rsp; |
3619 | 3805 | ||
3620 | trace_rcu_utilization(TPS("Start CPU hotplug")); | ||
3621 | switch (action) { | 3806 | switch (action) { |
3622 | case CPU_UP_PREPARE: | 3807 | case CPU_UP_PREPARE: |
3623 | case CPU_UP_PREPARE_FROZEN: | 3808 | case CPU_UP_PREPARE_FROZEN: |
@@ -3637,6 +3822,11 @@ static int rcu_cpu_notify(struct notifier_block *self, | |||
3637 | for_each_rcu_flavor(rsp) | 3822 | for_each_rcu_flavor(rsp) |
3638 | rcu_cleanup_dying_cpu(rsp); | 3823 | rcu_cleanup_dying_cpu(rsp); |
3639 | break; | 3824 | break; |
3825 | case CPU_DYING_IDLE: | ||
3826 | for_each_rcu_flavor(rsp) { | ||
3827 | rcu_cleanup_dying_idle_cpu(cpu, rsp); | ||
3828 | } | ||
3829 | break; | ||
3640 | case CPU_DEAD: | 3830 | case CPU_DEAD: |
3641 | case CPU_DEAD_FROZEN: | 3831 | case CPU_DEAD_FROZEN: |
3642 | case CPU_UP_CANCELED: | 3832 | case CPU_UP_CANCELED: |
@@ -3649,7 +3839,6 @@ static int rcu_cpu_notify(struct notifier_block *self, | |||
3649 | default: | 3839 | default: |
3650 | break; | 3840 | break; |
3651 | } | 3841 | } |
3652 | trace_rcu_utilization(TPS("End CPU hotplug")); | ||
3653 | return NOTIFY_OK; | 3842 | return NOTIFY_OK; |
3654 | } | 3843 | } |
3655 | 3844 | ||
@@ -3660,11 +3849,12 @@ static int rcu_pm_notify(struct notifier_block *self, | |||
3660 | case PM_HIBERNATION_PREPARE: | 3849 | case PM_HIBERNATION_PREPARE: |
3661 | case PM_SUSPEND_PREPARE: | 3850 | case PM_SUSPEND_PREPARE: |
3662 | if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */ | 3851 | if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */ |
3663 | rcu_expedited = 1; | 3852 | rcu_expedite_gp(); |
3664 | break; | 3853 | break; |
3665 | case PM_POST_HIBERNATION: | 3854 | case PM_POST_HIBERNATION: |
3666 | case PM_POST_SUSPEND: | 3855 | case PM_POST_SUSPEND: |
3667 | rcu_expedited = 0; | 3856 | if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */ |
3857 | rcu_unexpedite_gp(); | ||
3668 | break; | 3858 | break; |
3669 | default: | 3859 | default: |
3670 | break; | 3860 | break; |
@@ -3734,30 +3924,26 @@ void rcu_scheduler_starting(void) | |||
3734 | * Compute the per-level fanout, either using the exact fanout specified | 3924 | * Compute the per-level fanout, either using the exact fanout specified |
3735 | * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT. | 3925 | * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT. |
3736 | */ | 3926 | */ |
3737 | #ifdef CONFIG_RCU_FANOUT_EXACT | ||
3738 | static void __init rcu_init_levelspread(struct rcu_state *rsp) | ||
3739 | { | ||
3740 | int i; | ||
3741 | |||
3742 | rsp->levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf; | ||
3743 | for (i = rcu_num_lvls - 2; i >= 0; i--) | ||
3744 | rsp->levelspread[i] = CONFIG_RCU_FANOUT; | ||
3745 | } | ||
3746 | #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ | ||
3747 | static void __init rcu_init_levelspread(struct rcu_state *rsp) | 3927 | static void __init rcu_init_levelspread(struct rcu_state *rsp) |
3748 | { | 3928 | { |
3749 | int ccur; | ||
3750 | int cprv; | ||
3751 | int i; | 3929 | int i; |
3752 | 3930 | ||
3753 | cprv = nr_cpu_ids; | 3931 | if (IS_ENABLED(CONFIG_RCU_FANOUT_EXACT)) { |
3754 | for (i = rcu_num_lvls - 1; i >= 0; i--) { | 3932 | rsp->levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf; |
3755 | ccur = rsp->levelcnt[i]; | 3933 | for (i = rcu_num_lvls - 2; i >= 0; i--) |
3756 | rsp->levelspread[i] = (cprv + ccur - 1) / ccur; | 3934 | rsp->levelspread[i] = CONFIG_RCU_FANOUT; |
3757 | cprv = ccur; | 3935 | } else { |
3936 | int ccur; | ||
3937 | int cprv; | ||
3938 | |||
3939 | cprv = nr_cpu_ids; | ||
3940 | for (i = rcu_num_lvls - 1; i >= 0; i--) { | ||
3941 | ccur = rsp->levelcnt[i]; | ||
3942 | rsp->levelspread[i] = (cprv + ccur - 1) / ccur; | ||
3943 | cprv = ccur; | ||
3944 | } | ||
3758 | } | 3945 | } |
3759 | } | 3946 | } |
3760 | #endif /* #else #ifdef CONFIG_RCU_FANOUT_EXACT */ | ||
3761 | 3947 | ||
3762 | /* | 3948 | /* |
3763 | * Helper function for rcu_init() that initializes one rcu_state structure. | 3949 | * Helper function for rcu_init() that initializes one rcu_state structure. |
@@ -3833,7 +4019,6 @@ static void __init rcu_init_one(struct rcu_state *rsp, | |||
3833 | } | 4019 | } |
3834 | } | 4020 | } |
3835 | 4021 | ||
3836 | rsp->rda = rda; | ||
3837 | init_waitqueue_head(&rsp->gp_wq); | 4022 | init_waitqueue_head(&rsp->gp_wq); |
3838 | rnp = rsp->level[rcu_num_lvls - 1]; | 4023 | rnp = rsp->level[rcu_num_lvls - 1]; |
3839 | for_each_possible_cpu(i) { | 4024 | for_each_possible_cpu(i) { |
@@ -3926,6 +4111,8 @@ void __init rcu_init(void) | |||
3926 | { | 4111 | { |
3927 | int cpu; | 4112 | int cpu; |
3928 | 4113 | ||
4114 | rcu_early_boot_tests(); | ||
4115 | |||
3929 | rcu_bootup_announce(); | 4116 | rcu_bootup_announce(); |
3930 | rcu_init_geometry(); | 4117 | rcu_init_geometry(); |
3931 | rcu_init_one(&rcu_bh_state, &rcu_bh_data); | 4118 | rcu_init_one(&rcu_bh_state, &rcu_bh_data); |
@@ -3942,8 +4129,6 @@ void __init rcu_init(void) | |||
3942 | pm_notifier(rcu_pm_notify, 0); | 4129 | pm_notifier(rcu_pm_notify, 0); |
3943 | for_each_online_cpu(cpu) | 4130 | for_each_online_cpu(cpu) |
3944 | rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); | 4131 | rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); |
3945 | |||
3946 | rcu_early_boot_tests(); | ||
3947 | } | 4132 | } |
3948 | 4133 | ||
3949 | #include "tree_plugin.h" | 4134 | #include "tree_plugin.h" |
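Several of the tree.c changes above (gp_init_delay and the merged rcu_init_levelspread()) replace #ifdef blocks with IS_ENABLED(), which yields an integer constant expression: the compiler still discards the dead branch, but both branches stay visible to type checking. The fragment below is a simplified, self-contained rendition of that preprocessor trick; IS_ENABLED_SKETCH() and the CONFIG_DEMO_* symbols are made up for the example, and the real macro in include/linux/kconfig.h also accounts for =m options.

#include <stdio.h>

/* Simplified take on the IS_ENABLED() machinery: symbols defined to 1
 * expand to 1, everything else expands to 0, so the surrounding "if"
 * condition folds away at compile time. */
#define __ARG_PLACEHOLDER_1 0,
#define __take_second_arg(ignored, val, ...) val
#define ____is_defined(arg1_or_junk) __take_second_arg(arg1_or_junk 1, 0)
#define ___is_defined(val) ____is_defined(__ARG_PLACEHOLDER_##val)
#define __is_defined(x) ___is_defined(x)
#define IS_ENABLED_SKETCH(option) __is_defined(option)

#define CONFIG_DEMO_ON 1
/* CONFIG_DEMO_OFF is deliberately left undefined. */

int main(void)
{
        if (IS_ENABLED_SKETCH(CONFIG_DEMO_ON))
                printf("DEMO_ON branch kept (condition folds to 1)\n");
        if (IS_ENABLED_SKETCH(CONFIG_DEMO_OFF))
                printf("never printed; branch folds to 0 and is discarded\n");
        return 0;
}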
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 119de399eb2f..a69d3dab2ec4 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h | |||
@@ -141,12 +141,20 @@ struct rcu_node { | |||
141 | /* complete (only for PREEMPT_RCU). */ | 141 | /* complete (only for PREEMPT_RCU). */ |
142 | unsigned long qsmaskinit; | 142 | unsigned long qsmaskinit; |
143 | /* Per-GP initial value for qsmask & expmask. */ | 143 | /* Per-GP initial value for qsmask & expmask. */ |
144 | /* Initialized from ->qsmaskinitnext at the */ | ||
145 | /* beginning of each grace period. */ | ||
146 | unsigned long qsmaskinitnext; | ||
147 | /* Online CPUs for next grace period. */ | ||
144 | unsigned long grpmask; /* Mask to apply to parent qsmask. */ | 148 | unsigned long grpmask; /* Mask to apply to parent qsmask. */ |
145 | /* Only one bit will be set in this mask. */ | 149 | /* Only one bit will be set in this mask. */ |
146 | int grplo; /* lowest-numbered CPU or group here. */ | 150 | int grplo; /* lowest-numbered CPU or group here. */ |
147 | int grphi; /* highest-numbered CPU or group here. */ | 151 | int grphi; /* highest-numbered CPU or group here. */ |
148 | u8 grpnum; /* CPU/group number for next level up. */ | 152 | u8 grpnum; /* CPU/group number for next level up. */ |
149 | u8 level; /* root is at level 0. */ | 153 | u8 level; /* root is at level 0. */ |
154 | bool wait_blkd_tasks;/* Necessary to wait for blocked tasks to */ | ||
155 | /* exit RCU read-side critical sections */ | ||
156 | /* before propagating offline up the */ | ||
157 | /* rcu_node tree? */ | ||
150 | struct rcu_node *parent; | 158 | struct rcu_node *parent; |
151 | struct list_head blkd_tasks; | 159 | struct list_head blkd_tasks; |
152 | /* Tasks blocked in RCU read-side critical */ | 160 | /* Tasks blocked in RCU read-side critical */ |
@@ -448,8 +456,6 @@ struct rcu_state { | |||
448 | long qlen; /* Total number of callbacks. */ | 456 | long qlen; /* Total number of callbacks. */ |
449 | /* End of fields guarded by orphan_lock. */ | 457 | /* End of fields guarded by orphan_lock. */ |
450 | 458 | ||
451 | struct mutex onoff_mutex; /* Coordinate hotplug & GPs. */ | ||
452 | |||
453 | struct mutex barrier_mutex; /* Guards barrier fields. */ | 459 | struct mutex barrier_mutex; /* Guards barrier fields. */ |
454 | atomic_t barrier_cpu_count; /* # CPUs waiting on. */ | 460 | atomic_t barrier_cpu_count; /* # CPUs waiting on. */ |
455 | struct completion barrier_completion; /* Wake at barrier end. */ | 461 | struct completion barrier_completion; /* Wake at barrier end. */ |
@@ -559,6 +565,7 @@ static void rcu_prepare_kthreads(int cpu); | |||
559 | static void rcu_cleanup_after_idle(void); | 565 | static void rcu_cleanup_after_idle(void); |
560 | static void rcu_prepare_for_idle(void); | 566 | static void rcu_prepare_for_idle(void); |
561 | static void rcu_idle_count_callbacks_posted(void); | 567 | static void rcu_idle_count_callbacks_posted(void); |
568 | static bool rcu_preempt_has_tasks(struct rcu_node *rnp); | ||
562 | static void print_cpu_stall_info_begin(void); | 569 | static void print_cpu_stall_info_begin(void); |
563 | static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); | 570 | static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); |
564 | static void print_cpu_stall_info_end(void); | 571 | static void print_cpu_stall_info_end(void); |
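The tree.h hunk above now carries both ->qsmaskinit (the per-grace-period snapshot) and ->qsmaskinitnext (the live online mask), while tree.c gained rcu_init_new_rnp() and a wider role for rcu_cleanup_dead_rnp() to push those bits up the combining tree. The toy below shows only that upward walk on a two-level tree of made-up struct node objects; the field names echo the kernel's, but locking, blocked-task checks, and the qsmaskinitnext buffering are all omitted.

#include <stdio.h>

/* Two-level toy combining tree: one root, two leaves.  Field names echo
 * rcu_node, but this only illustrates how per-leaf bits propagate. */
struct node {
        unsigned long qsmaskinit;       /* Which children/CPUs still count. */
        unsigned long grpmask;          /* This node's bit in its parent. */
        struct node *parent;
};

static struct node root;
static struct node leaf[2] = {
        { 0, 1UL << 0, &root },
        { 0, 1UL << 1, &root },
};

/* First CPU of a leaf comes online: set bits upward, in the spirit of
 * rcu_init_new_rnp() in the tree.c hunk. */
static void init_new_node(struct node *n)
{
        unsigned long mask;

        for (;;) {
                mask = n->grpmask;
                n = n->parent;
                if (!n)
                        return;
                n->qsmaskinit |= mask;
        }
}

/* Last CPU of a leaf goes offline: clear bits upward until an ancestor
 * still has other children, in the spirit of rcu_cleanup_dead_rnp(). */
static void cleanup_dead_node(struct node *n)
{
        unsigned long mask;

        if (n->qsmaskinit)
                return;                 /* Leaf still has online CPUs. */
        for (;;) {
                mask = n->grpmask;
                n = n->parent;
                if (!n)
                        return;
                n->qsmaskinit &= ~mask;
                if (n->qsmaskinit)
                        return;         /* A sibling subtree is still online. */
        }
}

int main(void)
{
        leaf[0].qsmaskinit = 1;                 /* CPU 0 comes online. */
        init_new_node(&leaf[0]);
        printf("root mask after online:  %#lx\n", root.qsmaskinit);  /* 0x1 */

        leaf[0].qsmaskinit = 0;                 /* Its last CPU goes away. */
        cleanup_dead_node(&leaf[0]);
        printf("root mask after offline: %#lx\n", root.qsmaskinit);  /* 0 */
        return 0;
}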
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 0a571e9a0f1d..8c0ec0f5a027 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h | |||
@@ -58,38 +58,33 @@ static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */ | |||
58 | */ | 58 | */ |
59 | static void __init rcu_bootup_announce_oddness(void) | 59 | static void __init rcu_bootup_announce_oddness(void) |
60 | { | 60 | { |
61 | #ifdef CONFIG_RCU_TRACE | 61 | if (IS_ENABLED(CONFIG_RCU_TRACE)) |
62 | pr_info("\tRCU debugfs-based tracing is enabled.\n"); | 62 | pr_info("\tRCU debugfs-based tracing is enabled.\n"); |
63 | #endif | 63 | if ((IS_ENABLED(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || |
64 | #if (defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || (!defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32) | 64 | (!IS_ENABLED(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32)) |
65 | pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n", | 65 | pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n", |
66 | CONFIG_RCU_FANOUT); | 66 | CONFIG_RCU_FANOUT); |
67 | #endif | 67 | if (IS_ENABLED(CONFIG_RCU_FANOUT_EXACT)) |
68 | #ifdef CONFIG_RCU_FANOUT_EXACT | 68 | pr_info("\tHierarchical RCU autobalancing is disabled.\n"); |
69 | pr_info("\tHierarchical RCU autobalancing is disabled.\n"); | 69 | if (IS_ENABLED(CONFIG_RCU_FAST_NO_HZ)) |
70 | #endif | 70 | pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n"); |
71 | #ifdef CONFIG_RCU_FAST_NO_HZ | 71 | if (IS_ENABLED(CONFIG_PROVE_RCU)) |
72 | pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n"); | 72 | pr_info("\tRCU lockdep checking is enabled.\n"); |
73 | #endif | 73 | if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_RUNNABLE)) |
74 | #ifdef CONFIG_PROVE_RCU | 74 | pr_info("\tRCU torture testing starts during boot.\n"); |
75 | pr_info("\tRCU lockdep checking is enabled.\n"); | 75 | if (IS_ENABLED(CONFIG_RCU_CPU_STALL_INFO)) |
76 | #endif | 76 | pr_info("\tAdditional per-CPU info printed with stalls.\n"); |
77 | #ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE | 77 | if (NUM_RCU_LVL_4 != 0) |
78 | pr_info("\tRCU torture testing starts during boot.\n"); | 78 | pr_info("\tFour-level hierarchy is enabled.\n"); |
79 | #endif | 79 | if (CONFIG_RCU_FANOUT_LEAF != 16) |
80 | #if defined(CONFIG_RCU_CPU_STALL_INFO) | 80 | pr_info("\tBuild-time adjustment of leaf fanout to %d.\n", |
81 | pr_info("\tAdditional per-CPU info printed with stalls.\n"); | 81 | CONFIG_RCU_FANOUT_LEAF); |
82 | #endif | ||
83 | #if NUM_RCU_LVL_4 != 0 | ||
84 | pr_info("\tFour-level hierarchy is enabled.\n"); | ||
85 | #endif | ||
86 | if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF) | 82 | if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF) |
87 | pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); | 83 | pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); |
88 | if (nr_cpu_ids != NR_CPUS) | 84 | if (nr_cpu_ids != NR_CPUS) |
89 | pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); | 85 | pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); |
90 | #ifdef CONFIG_RCU_BOOST | 86 | if (IS_ENABLED(CONFIG_RCU_BOOST)) |
91 | pr_info("\tRCU kthread priority: %d.\n", kthread_prio); | 87 | pr_info("\tRCU kthread priority: %d.\n", kthread_prio); |
92 | #endif | ||
93 | } | 88 | } |
94 | 89 | ||
95 | #ifdef CONFIG_PREEMPT_RCU | 90 | #ifdef CONFIG_PREEMPT_RCU |
@@ -180,7 +175,7 @@ static void rcu_preempt_note_context_switch(void) | |||
180 | * But first, note that the current CPU must still be | 175 | * But first, note that the current CPU must still be |
181 | * on line! | 176 | * on line! |
182 | */ | 177 | */ |
183 | WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0); | 178 | WARN_ON_ONCE((rdp->grpmask & rcu_rnp_online_cpus(rnp)) == 0); |
184 | WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); | 179 | WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); |
185 | if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) { | 180 | if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) { |
186 | list_add(&t->rcu_node_entry, rnp->gp_tasks->prev); | 181 | list_add(&t->rcu_node_entry, rnp->gp_tasks->prev); |
@@ -233,43 +228,6 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) | |||
233 | } | 228 | } |
234 | 229 | ||
235 | /* | 230 | /* |
236 | * Record a quiescent state for all tasks that were previously queued | ||
237 | * on the specified rcu_node structure and that were blocking the current | ||
238 | * RCU grace period. The caller must hold the specified rnp->lock with | ||
239 | * irqs disabled, and this lock is released upon return, but irqs remain | ||
240 | * disabled. | ||
241 | */ | ||
242 | static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) | ||
243 | __releases(rnp->lock) | ||
244 | { | ||
245 | unsigned long mask; | ||
246 | struct rcu_node *rnp_p; | ||
247 | |||
248 | if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { | ||
249 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
250 | return; /* Still need more quiescent states! */ | ||
251 | } | ||
252 | |||
253 | rnp_p = rnp->parent; | ||
254 | if (rnp_p == NULL) { | ||
255 | /* | ||
256 | * Either there is only one rcu_node in the tree, | ||
257 | * or tasks were kicked up to root rcu_node due to | ||
258 | * CPUs going offline. | ||
259 | */ | ||
260 | rcu_report_qs_rsp(&rcu_preempt_state, flags); | ||
261 | return; | ||
262 | } | ||
263 | |||
264 | /* Report up the rest of the hierarchy. */ | ||
265 | mask = rnp->grpmask; | ||
266 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | ||
267 | raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */ | ||
268 | smp_mb__after_unlock_lock(); | ||
269 | rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags); | ||
270 | } | ||
271 | |||
272 | /* | ||
273 | * Advance a ->blkd_tasks-list pointer to the next entry, instead | 231 | * Advance a ->blkd_tasks-list pointer to the next entry, instead |
274 | * returning NULL if at the end of the list. | 232 | * returning NULL if at the end of the list. |
275 | */ | 233 | */ |
@@ -300,7 +258,6 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp) | |||
300 | */ | 258 | */ |
301 | void rcu_read_unlock_special(struct task_struct *t) | 259 | void rcu_read_unlock_special(struct task_struct *t) |
302 | { | 260 | { |
303 | bool empty; | ||
304 | bool empty_exp; | 261 | bool empty_exp; |
305 | bool empty_norm; | 262 | bool empty_norm; |
306 | bool empty_exp_now; | 263 | bool empty_exp_now; |
@@ -334,7 +291,13 @@ void rcu_read_unlock_special(struct task_struct *t) | |||
334 | } | 291 | } |
335 | 292 | ||
336 | /* Hardware IRQ handlers cannot block, complain if they get here. */ | 293 | /* Hardware IRQ handlers cannot block, complain if they get here. */ |
337 | if (WARN_ON_ONCE(in_irq() || in_serving_softirq())) { | 294 | if (in_irq() || in_serving_softirq()) { |
295 | lockdep_rcu_suspicious(__FILE__, __LINE__, | ||
296 | "rcu_read_unlock() from irq or softirq with blocking in critical section!!!\n"); | ||
297 | pr_alert("->rcu_read_unlock_special: %#x (b: %d, nq: %d)\n", | ||
298 | t->rcu_read_unlock_special.s, | ||
299 | t->rcu_read_unlock_special.b.blocked, | ||
300 | t->rcu_read_unlock_special.b.need_qs); | ||
338 | local_irq_restore(flags); | 301 | local_irq_restore(flags); |
339 | return; | 302 | return; |
340 | } | 303 | } |
@@ -356,7 +319,6 @@ void rcu_read_unlock_special(struct task_struct *t) | |||
356 | break; | 319 | break; |
357 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 320 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
358 | } | 321 | } |
359 | empty = !rcu_preempt_has_tasks(rnp); | ||
360 | empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); | 322 | empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); |
361 | empty_exp = !rcu_preempted_readers_exp(rnp); | 323 | empty_exp = !rcu_preempted_readers_exp(rnp); |
362 | smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ | 324 | smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ |
@@ -377,14 +339,6 @@ void rcu_read_unlock_special(struct task_struct *t) | |||
377 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 339 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
378 | 340 | ||
379 | /* | 341 | /* |
380 | * If this was the last task on the list, go see if we | ||
381 | * need to propagate ->qsmaskinit bit clearing up the | ||
382 | * rcu_node tree. | ||
383 | */ | ||
384 | if (!empty && !rcu_preempt_has_tasks(rnp)) | ||
385 | rcu_cleanup_dead_rnp(rnp); | ||
386 | |||
387 | /* | ||
388 | * If this was the last task on the current list, and if | 342 | * If this was the last task on the current list, and if |
389 | * we aren't waiting on any CPUs, report the quiescent state. | 343 | * we aren't waiting on any CPUs, report the quiescent state. |
390 | * Note that rcu_report_unblock_qs_rnp() releases rnp->lock, | 344 | * Note that rcu_report_unblock_qs_rnp() releases rnp->lock, |
@@ -399,7 +353,8 @@ void rcu_read_unlock_special(struct task_struct *t) | |||
399 | rnp->grplo, | 353 | rnp->grplo, |
400 | rnp->grphi, | 354 | rnp->grphi, |
401 | !!rnp->gp_tasks); | 355 | !!rnp->gp_tasks); |
402 | rcu_report_unblock_qs_rnp(rnp, flags); | 356 | rcu_report_unblock_qs_rnp(&rcu_preempt_state, |
357 | rnp, flags); | ||
403 | } else { | 358 | } else { |
404 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 359 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
405 | } | 360 | } |
@@ -520,10 +475,6 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) | |||
520 | WARN_ON_ONCE(rnp->qsmask); | 475 | WARN_ON_ONCE(rnp->qsmask); |
521 | } | 476 | } |
522 | 477 | ||
523 | #ifdef CONFIG_HOTPLUG_CPU | ||
524 | |||
525 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | ||
526 | |||
527 | /* | 478 | /* |
528 | * Check for a quiescent state from the current CPU. When a task blocks, | 479 | * Check for a quiescent state from the current CPU. When a task blocks, |
529 | * the task is recorded in the corresponding CPU's rcu_node structure, | 480 | * the task is recorded in the corresponding CPU's rcu_node structure, |
@@ -585,7 +536,7 @@ void synchronize_rcu(void) | |||
585 | "Illegal synchronize_rcu() in RCU read-side critical section"); | 536 | "Illegal synchronize_rcu() in RCU read-side critical section"); |
586 | if (!rcu_scheduler_active) | 537 | if (!rcu_scheduler_active) |
587 | return; | 538 | return; |
588 | if (rcu_expedited) | 539 | if (rcu_gp_is_expedited()) |
589 | synchronize_rcu_expedited(); | 540 | synchronize_rcu_expedited(); |
590 | else | 541 | else |
591 | wait_rcu_gp(call_rcu); | 542 | wait_rcu_gp(call_rcu); |
@@ -630,9 +581,6 @@ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) | |||
630 | * recursively up the tree. (Calm down, calm down, we do the recursion | 581 | * recursively up the tree. (Calm down, calm down, we do the recursion |
631 | * iteratively!) | 582 | * iteratively!) |
632 | * | 583 | * |
633 | * Most callers will set the "wake" flag, but the task initiating the | ||
634 | * expedited grace period need not wake itself. | ||
635 | * | ||
636 | * Caller must hold sync_rcu_preempt_exp_mutex. | 584 | * Caller must hold sync_rcu_preempt_exp_mutex. |
637 | */ | 585 | */ |
638 | static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, | 586 | static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, |
@@ -667,29 +615,85 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, | |||
667 | 615 | ||
668 | /* | 616 | /* |
669 | * Snapshot the tasks blocking the newly started preemptible-RCU expedited | 617 | * Snapshot the tasks blocking the newly started preemptible-RCU expedited |
670 | * grace period for the specified rcu_node structure. If there are no such | 618 | * grace period for the specified rcu_node structure, phase 1. If there |
671 | * tasks, report it up the rcu_node hierarchy. | 619 | * are such tasks, set the ->expmask bits up the rcu_node tree and also |
620 | * set the ->expmask bits on the leaf rcu_node structures to tell phase 2 | ||
621 | * that work is needed here. | ||
672 | * | 622 | * |
673 | * Caller must hold sync_rcu_preempt_exp_mutex and must exclude | 623 | * Caller must hold sync_rcu_preempt_exp_mutex. |
674 | * CPU hotplug operations. | ||
675 | */ | 624 | */ |
676 | static void | 625 | static void |
677 | sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) | 626 | sync_rcu_preempt_exp_init1(struct rcu_state *rsp, struct rcu_node *rnp) |
678 | { | 627 | { |
679 | unsigned long flags; | 628 | unsigned long flags; |
680 | int must_wait = 0; | 629 | unsigned long mask; |
630 | struct rcu_node *rnp_up; | ||
681 | 631 | ||
682 | raw_spin_lock_irqsave(&rnp->lock, flags); | 632 | raw_spin_lock_irqsave(&rnp->lock, flags); |
683 | smp_mb__after_unlock_lock(); | 633 | smp_mb__after_unlock_lock(); |
634 | WARN_ON_ONCE(rnp->expmask); | ||
635 | WARN_ON_ONCE(rnp->exp_tasks); | ||
684 | if (!rcu_preempt_has_tasks(rnp)) { | 636 | if (!rcu_preempt_has_tasks(rnp)) { |
637 | /* No blocked tasks, nothing to do. */ | ||
685 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 638 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
686 | } else { | 639 | return; |
640 | } | ||
641 | /* Call for Phase 2 and propagate ->expmask bits up the tree. */ | ||
642 | rnp->expmask = 1; | ||
643 | rnp_up = rnp; | ||
644 | while (rnp_up->parent) { | ||
645 | mask = rnp_up->grpmask; | ||
646 | rnp_up = rnp_up->parent; | ||
647 | if (rnp_up->expmask & mask) | ||
648 | break; | ||
649 | raw_spin_lock(&rnp_up->lock); /* irqs already off */ | ||
650 | smp_mb__after_unlock_lock(); | ||
651 | rnp_up->expmask |= mask; | ||
652 | raw_spin_unlock(&rnp_up->lock); /* irqs still off */ | ||
653 | } | ||
654 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
655 | } | ||
656 | |||
657 | /* | ||
658 | * Snapshot the tasks blocking the newly started preemptible-RCU expedited | ||
659 | * grace period for the specified rcu_node structure, phase 2. If the | ||
660 | * leaf rcu_node structure has its ->expmask field set, check for tasks. | ||
661 | * If there are some, clear ->expmask and set ->exp_tasks accordingly, | ||
662 | * then initiate RCU priority boosting. Otherwise, clear ->expmask and | ||
663 | * invoke rcu_report_exp_rnp() to clear out the upper-level ->expmask bits, | ||
664 | * enabling rcu_read_unlock_special() to do the bit-clearing. | ||
665 | * | ||
666 | * Caller must hold sync_rcu_preempt_exp_mutex. | ||
667 | */ | ||
668 | static void | ||
669 | sync_rcu_preempt_exp_init2(struct rcu_state *rsp, struct rcu_node *rnp) | ||
670 | { | ||
671 | unsigned long flags; | ||
672 | |||
673 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
674 | smp_mb__after_unlock_lock(); | ||
675 | if (!rnp->expmask) { | ||
676 | /* Phase 1 didn't do anything, so Phase 2 doesn't either. */ | ||
677 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
678 | return; | ||
679 | } | ||
680 | |||
681 | /* Phase 1 is over. */ | ||
682 | rnp->expmask = 0; | ||
683 | |||
684 | /* | ||
685 | * If there are still blocked tasks, set up ->exp_tasks so that | ||
686 | * rcu_read_unlock_special() will wake us and then boost them. | ||
687 | */ | ||
688 | if (rcu_preempt_has_tasks(rnp)) { | ||
687 | rnp->exp_tasks = rnp->blkd_tasks.next; | 689 | rnp->exp_tasks = rnp->blkd_tasks.next; |
688 | rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ | 690 | rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ |
689 | must_wait = 1; | 691 | return; |
690 | } | 692 | } |
691 | if (!must_wait) | 693 | |
692 | rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */ | 694 | /* No longer any blocked tasks, so undo bit setting. */ |
695 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
696 | rcu_report_exp_rnp(rsp, rnp, false); | ||
693 | } | 697 | } |
694 | 698 | ||
695 | /** | 699 | /** |
@@ -706,7 +710,6 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) | |||
706 | */ | 710 | */ |
707 | void synchronize_rcu_expedited(void) | 711 | void synchronize_rcu_expedited(void) |
708 | { | 712 | { |
709 | unsigned long flags; | ||
710 | struct rcu_node *rnp; | 713 | struct rcu_node *rnp; |
711 | struct rcu_state *rsp = &rcu_preempt_state; | 714 | struct rcu_state *rsp = &rcu_preempt_state; |
712 | unsigned long snap; | 715 | unsigned long snap; |
@@ -757,19 +760,16 @@ void synchronize_rcu_expedited(void) | |||
757 | /* force all RCU readers onto ->blkd_tasks lists. */ | 760 | /* force all RCU readers onto ->blkd_tasks lists. */ |
758 | synchronize_sched_expedited(); | 761 | synchronize_sched_expedited(); |
759 | 762 | ||
760 | /* Initialize ->expmask for all non-leaf rcu_node structures. */ | 763 | /* |
761 | rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) { | 764 | * Snapshot current state of ->blkd_tasks lists into ->expmask. |
762 | raw_spin_lock_irqsave(&rnp->lock, flags); | 765 | * Phase 1 sets bits and phase 2 permits rcu_read_unlock_special() |
763 | smp_mb__after_unlock_lock(); | 766 | * to start clearing them. Doing this in one phase leads to |
764 | rnp->expmask = rnp->qsmaskinit; | 767 | * strange races between setting and clearing bits, so just say "no"! |
765 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 768 | */ |
766 | } | 769 | rcu_for_each_leaf_node(rsp, rnp) |
767 | 770 | sync_rcu_preempt_exp_init1(rsp, rnp); | |
768 | /* Snapshot current state of ->blkd_tasks lists. */ | ||
769 | rcu_for_each_leaf_node(rsp, rnp) | 771 | rcu_for_each_leaf_node(rsp, rnp) |
770 | sync_rcu_preempt_exp_init(rsp, rnp); | 772 | sync_rcu_preempt_exp_init2(rsp, rnp); |
771 | if (NUM_RCU_NODES > 1) | ||
772 | sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp)); | ||
773 | 773 | ||
774 | put_online_cpus(); | 774 | put_online_cpus(); |
775 | 775 | ||
@@ -859,8 +859,6 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) | |||
859 | return 0; | 859 | return 0; |
860 | } | 860 | } |
861 | 861 | ||
862 | #ifdef CONFIG_HOTPLUG_CPU | ||
863 | |||
864 | /* | 862 | /* |
865 | * Because there is no preemptible RCU, there can be no readers blocked. | 863 | * Because there is no preemptible RCU, there can be no readers blocked. |
866 | */ | 864 | */ |
@@ -869,8 +867,6 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp) | |||
869 | return false; | 867 | return false; |
870 | } | 868 | } |
871 | 869 | ||
872 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | ||
873 | |||
874 | /* | 870 | /* |
875 | * Because preemptible RCU does not exist, we never have to check for | 871 | * Because preemptible RCU does not exist, we never have to check for |
876 | * tasks blocked within RCU read-side critical sections. | 872 | * tasks blocked within RCU read-side critical sections. |
@@ -1170,7 +1166,7 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) | |||
1170 | * Returns zero if all is well, a negated errno otherwise. | 1166 | * Returns zero if all is well, a negated errno otherwise. |
1171 | */ | 1167 | */ |
1172 | static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, | 1168 | static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, |
1173 | struct rcu_node *rnp) | 1169 | struct rcu_node *rnp) |
1174 | { | 1170 | { |
1175 | int rnp_index = rnp - &rsp->node[0]; | 1171 | int rnp_index = rnp - &rsp->node[0]; |
1176 | unsigned long flags; | 1172 | unsigned long flags; |
@@ -1180,7 +1176,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, | |||
1180 | if (&rcu_preempt_state != rsp) | 1176 | if (&rcu_preempt_state != rsp) |
1181 | return 0; | 1177 | return 0; |
1182 | 1178 | ||
1183 | if (!rcu_scheduler_fully_active || rnp->qsmaskinit == 0) | 1179 | if (!rcu_scheduler_fully_active || rcu_rnp_online_cpus(rnp) == 0) |
1184 | return 0; | 1180 | return 0; |
1185 | 1181 | ||
1186 | rsp->boost = 1; | 1182 | rsp->boost = 1; |
@@ -1273,7 +1269,7 @@ static void rcu_cpu_kthread(unsigned int cpu) | |||
1273 | static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) | 1269 | static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) |
1274 | { | 1270 | { |
1275 | struct task_struct *t = rnp->boost_kthread_task; | 1271 | struct task_struct *t = rnp->boost_kthread_task; |
1276 | unsigned long mask = rnp->qsmaskinit; | 1272 | unsigned long mask = rcu_rnp_online_cpus(rnp); |
1277 | cpumask_var_t cm; | 1273 | cpumask_var_t cm; |
1278 | int cpu; | 1274 | int cpu; |
1279 | 1275 | ||
@@ -1945,7 +1941,8 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) | |||
1945 | rhp = ACCESS_ONCE(rdp->nocb_follower_head); | 1941 | rhp = ACCESS_ONCE(rdp->nocb_follower_head); |
1946 | 1942 | ||
1947 | /* Having no rcuo kthread but CBs after scheduler starts is bad! */ | 1943 | /* Having no rcuo kthread but CBs after scheduler starts is bad! */ |
1948 | if (!ACCESS_ONCE(rdp->nocb_kthread) && rhp) { | 1944 | if (!ACCESS_ONCE(rdp->nocb_kthread) && rhp && |
1945 | rcu_scheduler_fully_active) { | ||
1949 | /* RCU callback enqueued before CPU first came online??? */ | 1946 | /* RCU callback enqueued before CPU first came online??? */ |
1950 | pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n", | 1947 | pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n", |
1951 | cpu, rhp->func); | 1948 | cpu, rhp->func); |
@@ -2392,18 +2389,8 @@ void __init rcu_init_nohz(void) | |||
2392 | pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); | 2389 | pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); |
2393 | 2390 | ||
2394 | for_each_rcu_flavor(rsp) { | 2391 | for_each_rcu_flavor(rsp) { |
2395 | for_each_cpu(cpu, rcu_nocb_mask) { | 2392 | for_each_cpu(cpu, rcu_nocb_mask) |
2396 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); | 2393 | init_nocb_callback_list(per_cpu_ptr(rsp->rda, cpu)); |
2397 | |||
2398 | /* | ||
2399 | * If there are early callbacks, they will need | ||
2400 | * to be moved to the nocb lists. | ||
2401 | */ | ||
2402 | WARN_ON_ONCE(rdp->nxttail[RCU_NEXT_TAIL] != | ||
2403 | &rdp->nxtlist && | ||
2404 | rdp->nxttail[RCU_NEXT_TAIL] != NULL); | ||
2405 | init_nocb_callback_list(rdp); | ||
2406 | } | ||
2407 | rcu_organize_nocb_kthreads(rsp); | 2394 | rcu_organize_nocb_kthreads(rsp); |
2408 | } | 2395 | } |
2409 | } | 2396 | } |
@@ -2540,6 +2527,16 @@ static bool init_nocb_callback_list(struct rcu_data *rdp) | |||
2540 | if (!rcu_is_nocb_cpu(rdp->cpu)) | 2527 | if (!rcu_is_nocb_cpu(rdp->cpu)) |
2541 | return false; | 2528 | return false; |
2542 | 2529 | ||
2530 | /* If there are early-boot callbacks, move them to nocb lists. */ | ||
2531 | if (rdp->nxtlist) { | ||
2532 | rdp->nocb_head = rdp->nxtlist; | ||
2533 | rdp->nocb_tail = rdp->nxttail[RCU_NEXT_TAIL]; | ||
2534 | atomic_long_set(&rdp->nocb_q_count, rdp->qlen); | ||
2535 | atomic_long_set(&rdp->nocb_q_count_lazy, rdp->qlen_lazy); | ||
2536 | rdp->nxtlist = NULL; | ||
2537 | rdp->qlen = 0; | ||
2538 | rdp->qlen_lazy = 0; | ||
2539 | } | ||
2543 | rdp->nxttail[RCU_NEXT_TAIL] = NULL; | 2540 | rdp->nxttail[RCU_NEXT_TAIL] = NULL; |
2544 | return true; | 2541 | return true; |
2545 | } | 2542 | } |
@@ -2763,7 +2760,8 @@ static void rcu_sysidle_exit(int irq) | |||
2763 | 2760 | ||
2764 | /* | 2761 | /* |
2765 | * Check to see if the current CPU is idle. Note that usermode execution | 2762 | * Check to see if the current CPU is idle. Note that usermode execution |
2766 | * does not count as idle. The caller must have disabled interrupts. | 2763 | * does not count as idle. The caller must have disabled interrupts, |
2764 | * and must be running on tick_do_timer_cpu. | ||
2767 | */ | 2765 | */ |
2768 | static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, | 2766 | static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, |
2769 | unsigned long *maxj) | 2767 | unsigned long *maxj) |
@@ -2784,8 +2782,8 @@ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, | |||
2784 | if (!*isidle || rdp->rsp != rcu_state_p || | 2782 | if (!*isidle || rdp->rsp != rcu_state_p || |
2785 | cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu) | 2783 | cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu) |
2786 | return; | 2784 | return; |
2787 | if (rcu_gp_in_progress(rdp->rsp)) | 2785 | /* Verify affinity of current kthread. */ |
2788 | WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu); | 2786 | WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu); |
2789 | 2787 | ||
2790 | /* Pick up current idle and NMI-nesting counter and check. */ | 2788 | /* Pick up current idle and NMI-nesting counter and check. */ |
2791 | cur = atomic_read(&rdtp->dynticks_idle); | 2789 | cur = atomic_read(&rdtp->dynticks_idle); |
@@ -3068,11 +3066,10 @@ static void rcu_bind_gp_kthread(void) | |||
3068 | return; | 3066 | return; |
3069 | #ifdef CONFIG_NO_HZ_FULL_SYSIDLE | 3067 | #ifdef CONFIG_NO_HZ_FULL_SYSIDLE |
3070 | cpu = tick_do_timer_cpu; | 3068 | cpu = tick_do_timer_cpu; |
3071 | if (cpu >= 0 && cpu < nr_cpu_ids && raw_smp_processor_id() != cpu) | 3069 | if (cpu >= 0 && cpu < nr_cpu_ids) |
3072 | set_cpus_allowed_ptr(current, cpumask_of(cpu)); | 3070 | set_cpus_allowed_ptr(current, cpumask_of(cpu)); |
3073 | #else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ | 3071 | #else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ |
3074 | if (!is_housekeeping_cpu(raw_smp_processor_id())) | 3072 | housekeeping_affine(current); |
3075 | housekeeping_affine(current); | ||
3076 | #endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ | 3073 | #endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ |
3077 | } | 3074 | } |
3078 | 3075 | ||
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index fbb6240509ea..f92361efd0f5 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c | |||
@@ -283,8 +283,8 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) | |||
283 | seq_puts(m, "\n"); | 283 | seq_puts(m, "\n"); |
284 | level = rnp->level; | 284 | level = rnp->level; |
285 | } | 285 | } |
286 | seq_printf(m, "%lx/%lx %c%c>%c %d:%d ^%d ", | 286 | seq_printf(m, "%lx/%lx->%lx %c%c>%c %d:%d ^%d ", |
287 | rnp->qsmask, rnp->qsmaskinit, | 287 | rnp->qsmask, rnp->qsmaskinit, rnp->qsmaskinitnext, |
288 | ".G"[rnp->gp_tasks != NULL], | 288 | ".G"[rnp->gp_tasks != NULL], |
289 | ".E"[rnp->exp_tasks != NULL], | 289 | ".E"[rnp->exp_tasks != NULL], |
290 | ".T"[!list_empty(&rnp->blkd_tasks)], | 290 | ".T"[!list_empty(&rnp->blkd_tasks)], |
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index e0d31a345ee6..1f133350da01 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c | |||
@@ -62,6 +62,63 @@ MODULE_ALIAS("rcupdate"); | |||
62 | 62 | ||
63 | module_param(rcu_expedited, int, 0); | 63 | module_param(rcu_expedited, int, 0); |
64 | 64 | ||
65 | #ifndef CONFIG_TINY_RCU | ||
66 | |||
67 | static atomic_t rcu_expedited_nesting = | ||
68 | ATOMIC_INIT(IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT) ? 1 : 0); | ||
69 | |||
70 | /* | ||
71 | * Should normal grace-period primitives be expedited? Intended for | ||
72 | * use within RCU. Note that this function takes the rcu_expedited | ||
73 | * sysfs/boot variable into account as well as the rcu_expedite_gp() | ||
74 | * nesting. So looping on rcu_unexpedite_gp() until rcu_gp_is_expedited() | ||
75 | * returns false is a -really- bad idea. | ||
76 | */ | ||
77 | bool rcu_gp_is_expedited(void) | ||
78 | { | ||
79 | return rcu_expedited || atomic_read(&rcu_expedited_nesting); | ||
80 | } | ||
81 | EXPORT_SYMBOL_GPL(rcu_gp_is_expedited); | ||
82 | |||
83 | /** | ||
84 | * rcu_expedite_gp - Expedite future RCU grace periods | ||
85 | * | ||
86 | * After a call to this function, future calls to synchronize_rcu() and | ||
87 | * friends act as if the corresponding synchronize_rcu_expedited() function | ||
88 | * had instead been called. | ||
89 | */ | ||
90 | void rcu_expedite_gp(void) | ||
91 | { | ||
92 | atomic_inc(&rcu_expedited_nesting); | ||
93 | } | ||
94 | EXPORT_SYMBOL_GPL(rcu_expedite_gp); | ||
95 | |||
96 | /** | ||
97 | * rcu_unexpedite_gp - Cancel prior rcu_expedite_gp() invocation | ||
98 | * | ||
99 | * Undo a prior call to rcu_expedite_gp(). If all prior calls to | ||
100 | * rcu_expedite_gp() are undone by a subsequent call to rcu_unexpedite_gp(), | ||
101 | * and if the rcu_expedited sysfs/boot parameter is not set, then all | ||
102 | * subsequent calls to synchronize_rcu() and friends will return to | ||
103 | * their normal non-expedited behavior. | ||
104 | */ | ||
105 | void rcu_unexpedite_gp(void) | ||
106 | { | ||
107 | atomic_dec(&rcu_expedited_nesting); | ||
108 | } | ||
109 | EXPORT_SYMBOL_GPL(rcu_unexpedite_gp); | ||
110 | |||
111 | #endif /* #ifndef CONFIG_TINY_RCU */ | ||
112 | |||
113 | /* | ||
114 | * Inform RCU of the end of the in-kernel boot sequence. | ||
115 | */ | ||
116 | void rcu_end_inkernel_boot(void) | ||
117 | { | ||
118 | if (IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT)) | ||
119 | rcu_unexpedite_gp(); | ||
120 | } | ||
121 | |||
65 | #ifdef CONFIG_PREEMPT_RCU | 122 | #ifdef CONFIG_PREEMPT_RCU |
66 | 123 | ||
67 | /* | 124 | /* |
@@ -199,16 +256,13 @@ EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); | |||
199 | 256 | ||
200 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | 257 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ |
201 | 258 | ||
202 | struct rcu_synchronize { | 259 | /** |
203 | struct rcu_head head; | 260 | * wakeme_after_rcu() - Callback function to awaken a task after grace period |
204 | struct completion completion; | 261 | * @head: Pointer to rcu_head member within rcu_synchronize structure |
205 | }; | 262 | * |
206 | 263 | * Awaken the corresponding task now that a grace period has elapsed. | |
207 | /* | ||
208 | * Awaken the corresponding synchronize_rcu() instance now that a | ||
209 | * grace period has elapsed. | ||
210 | */ | 264 | */ |
211 | static void wakeme_after_rcu(struct rcu_head *head) | 265 | void wakeme_after_rcu(struct rcu_head *head) |
212 | { | 266 | { |
213 | struct rcu_synchronize *rcu; | 267 | struct rcu_synchronize *rcu; |
214 | 268 | ||
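The rcu_expedite_gp()/rcu_unexpedite_gp() pair added above is counted rather than boolean, so callers bracket a latency-sensitive window and nested users compose correctly. A minimal usage sketch; the surrounding function is hypothetical, and the declarations are assumed to land in <linux/rcupdate.h> alongside the definitions above:

#include <linux/rcupdate.h>

static void example_low_latency_window(void)
{
	rcu_expedite_gp();	/* bump rcu_expedited_nesting */

	/*
	 * While the count is nonzero, rcu_gp_is_expedited() returns true
	 * and synchronize_rcu() behaves like synchronize_rcu_expedited().
	 */
	synchronize_rcu();

	rcu_unexpedite_gp();	/* drop the count; normal grace periods resume */
}

rcu_end_inkernel_boot() is simply a boot-time caller of the same primitive: when CONFIG_RCU_EXPEDITE_BOOT is set, the nesting count starts at 1 and is dropped once the in-kernel boot sequence ends.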
diff --git a/kernel/reboot.c b/kernel/reboot.c index 5925f5ae8dff..d20c85d9f8c0 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c | |||
@@ -387,8 +387,9 @@ void ctrl_alt_del(void) | |||
387 | } | 387 | } |
388 | 388 | ||
389 | char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; | 389 | char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; |
390 | static const char reboot_cmd[] = "/sbin/reboot"; | ||
390 | 391 | ||
391 | static int __orderly_poweroff(bool force) | 392 | static int run_cmd(const char *cmd) |
392 | { | 393 | { |
393 | char **argv; | 394 | char **argv; |
394 | static char *envp[] = { | 395 | static char *envp[] = { |
@@ -397,8 +398,7 @@ static int __orderly_poweroff(bool force) | |||
397 | NULL | 398 | NULL |
398 | }; | 399 | }; |
399 | int ret; | 400 | int ret; |
400 | 401 | argv = argv_split(GFP_KERNEL, cmd, NULL); | |
401 | argv = argv_split(GFP_KERNEL, poweroff_cmd, NULL); | ||
402 | if (argv) { | 402 | if (argv) { |
403 | ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); | 403 | ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); |
404 | argv_free(argv); | 404 | argv_free(argv); |
@@ -406,8 +406,33 @@ static int __orderly_poweroff(bool force) | |||
406 | ret = -ENOMEM; | 406 | ret = -ENOMEM; |
407 | } | 407 | } |
408 | 408 | ||
409 | return ret; | ||
410 | } | ||
411 | |||
412 | static int __orderly_reboot(void) | ||
413 | { | ||
414 | int ret; | ||
415 | |||
416 | ret = run_cmd(reboot_cmd); | ||
417 | |||
418 | if (ret) { | ||
419 | pr_warn("Failed to start orderly reboot: forcing the issue\n"); | ||
420 | emergency_sync(); | ||
421 | kernel_restart(NULL); | ||
422 | } | ||
423 | |||
424 | return ret; | ||
425 | } | ||
426 | |||
427 | static int __orderly_poweroff(bool force) | ||
428 | { | ||
429 | int ret; | ||
430 | |||
431 | ret = run_cmd(poweroff_cmd); | ||
432 | |||
409 | if (ret && force) { | 433 | if (ret && force) { |
410 | pr_warn("Failed to start orderly shutdown: forcing the issue\n"); | 434 | pr_warn("Failed to start orderly shutdown: forcing the issue\n"); |
435 | |||
411 | /* | 436 | /* |
412 | * I guess this should try to kick off some daemon to sync and | 437 | * I guess this should try to kick off some daemon to sync and |
413 | * poweroff asap. Or not even bother syncing if we're doing an | 438 | * poweroff asap. Or not even bother syncing if we're doing an |
@@ -436,15 +461,33 @@ static DECLARE_WORK(poweroff_work, poweroff_work_func); | |||
436 | * This may be called from any context to trigger a system shutdown. | 461 | * This may be called from any context to trigger a system shutdown. |
437 | * If the orderly shutdown fails, it will force an immediate shutdown. | 462 | * If the orderly shutdown fails, it will force an immediate shutdown. |
438 | */ | 463 | */ |
439 | int orderly_poweroff(bool force) | 464 | void orderly_poweroff(bool force) |
440 | { | 465 | { |
441 | if (force) /* do not override the pending "true" */ | 466 | if (force) /* do not override the pending "true" */ |
442 | poweroff_force = true; | 467 | poweroff_force = true; |
443 | schedule_work(&poweroff_work); | 468 | schedule_work(&poweroff_work); |
444 | return 0; | ||
445 | } | 469 | } |
446 | EXPORT_SYMBOL_GPL(orderly_poweroff); | 470 | EXPORT_SYMBOL_GPL(orderly_poweroff); |
447 | 471 | ||
472 | static void reboot_work_func(struct work_struct *work) | ||
473 | { | ||
474 | __orderly_reboot(); | ||
475 | } | ||
476 | |||
477 | static DECLARE_WORK(reboot_work, reboot_work_func); | ||
478 | |||
479 | /** | ||
480 | * orderly_reboot - Trigger an orderly system reboot | ||
481 | * | ||
482 | * This may be called from any context to trigger a system reboot. | ||
483 | * If the orderly reboot fails, it will force an immediate reboot. | ||
484 | */ | ||
485 | void orderly_reboot(void) | ||
486 | { | ||
487 | schedule_work(&reboot_work); | ||
488 | } | ||
489 | EXPORT_SYMBOL_GPL(orderly_reboot); | ||
490 | |||
448 | static int __init reboot_setup(char *str) | 491 | static int __init reboot_setup(char *str) |
449 | { | 492 | { |
450 | for (;;) { | 493 | for (;;) { |
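Both orderly_poweroff() and the new orderly_reboot() are exported, so any kernel code can hand shutdown policy to userspace and still get a forced fallback. A hedged usage sketch: the fault handler is invented, and it assumes the orderly_reboot() declaration sits next to orderly_poweroff() in <linux/reboot.h>:

#include <linux/reboot.h>

/* Illustrative only: react to a fault reported elsewhere in a driver. */
static void example_handle_fault(bool fatal)
{
	if (fatal)
		orderly_poweroff(true);	/* force poweroff if /sbin/poweroff fails */
	else
		orderly_reboot();	/* schedule /sbin/reboot via reboot_work */
}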
diff --git a/kernel/resource.c b/kernel/resource.c index 19f2357dfda3..90552aab5f2d 100644 --- a/kernel/resource.c +++ b/kernel/resource.c | |||
@@ -1034,8 +1034,6 @@ resource_size_t resource_alignment(struct resource *res) | |||
1034 | * | 1034 | * |
1035 | * request_region creates a new busy region. | 1035 | * request_region creates a new busy region. |
1036 | * | 1036 | * |
1037 | * check_region returns non-zero if the area is already busy. | ||
1038 | * | ||
1039 | * release_region releases a matching busy region. | 1037 | * release_region releases a matching busy region. |
1040 | */ | 1038 | */ |
1041 | 1039 | ||
@@ -1098,36 +1096,6 @@ struct resource * __request_region(struct resource *parent, | |||
1098 | EXPORT_SYMBOL(__request_region); | 1096 | EXPORT_SYMBOL(__request_region); |
1099 | 1097 | ||
1100 | /** | 1098 | /** |
1101 | * __check_region - check if a resource region is busy or free | ||
1102 | * @parent: parent resource descriptor | ||
1103 | * @start: resource start address | ||
1104 | * @n: resource region size | ||
1105 | * | ||
1106 | * Returns 0 if the region is free at the moment it is checked, | ||
1107 | * returns %-EBUSY if the region is busy. | ||
1108 | * | ||
1109 | * NOTE: | ||
1110 | * This function is deprecated because its use is racy. | ||
1111 | * Even if it returns 0, a subsequent call to request_region() | ||
1112 | * may fail because another driver etc. just allocated the region. | ||
1113 | * Do NOT use it. It will be removed from the kernel. | ||
1114 | */ | ||
1115 | int __check_region(struct resource *parent, resource_size_t start, | ||
1116 | resource_size_t n) | ||
1117 | { | ||
1118 | struct resource * res; | ||
1119 | |||
1120 | res = __request_region(parent, start, n, "check-region", 0); | ||
1121 | if (!res) | ||
1122 | return -EBUSY; | ||
1123 | |||
1124 | release_resource(res); | ||
1125 | free_resource(res); | ||
1126 | return 0; | ||
1127 | } | ||
1128 | EXPORT_SYMBOL(__check_region); | ||
1129 | |||
1130 | /** | ||
1131 | * __release_region - release a previously reserved resource region | 1099 | * __release_region - release a previously reserved resource region |
1132 | * @parent: parent resource descriptor | 1100 | * @parent: parent resource descriptor |
1133 | * @start: resource start address | 1101 | * @start: resource start address |
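Dropping __check_region() leaves only the non-racy pattern its own deprecation note recommended: do not check and then request, just request and handle failure. A minimal sketch with an invented driver name and I/O range:

#include <linux/ioport.h>

static int example_claim_ports(void)
{
	/* Reservation is atomic; there is no check-then-request window. */
	if (!request_region(0x300, 8, "example-dev"))
		return -EBUSY;		/* someone else owns the range */

	/* ... program the device through its I/O ports ... */

	release_region(0x300, 8);
	return 0;
}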
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f0f831e8a345..f9123a82cbb6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -306,6 +306,9 @@ __read_mostly int scheduler_running; | |||
306 | */ | 306 | */ |
307 | int sysctl_sched_rt_runtime = 950000; | 307 | int sysctl_sched_rt_runtime = 950000; |
308 | 308 | ||
309 | /* cpus with isolated domains */ | ||
310 | cpumask_var_t cpu_isolated_map; | ||
311 | |||
309 | /* | 312 | /* |
310 | * this_rq_lock - lock this runqueue and disable interrupts. | 313 | * this_rq_lock - lock this runqueue and disable interrupts. |
311 | */ | 314 | */ |
@@ -690,6 +693,23 @@ static inline bool got_nohz_idle_kick(void) | |||
690 | bool sched_can_stop_tick(void) | 693 | bool sched_can_stop_tick(void) |
691 | { | 694 | { |
692 | /* | 695 | /* |
696 | * FIFO realtime policy runs the highest priority task. Other runnable | ||
697 | * tasks are of a lower priority. The scheduler tick does nothing. | ||
698 | */ | ||
699 | if (current->policy == SCHED_FIFO) | ||
700 | return true; | ||
701 | |||
702 | /* | ||
703 | * Round-robin realtime tasks time-slice with other tasks at the same | ||
704 | * realtime priority. Is this task the only one at this priority? | ||
705 | */ | ||
706 | if (current->policy == SCHED_RR) { | ||
707 | struct sched_rt_entity *rt_se = ¤t->rt; | ||
708 | |||
709 | return rt_se->run_list.prev == rt_se->run_list.next; | ||
710 | } | ||
711 | |||
712 | /* | ||
693 | * More than one running task need preemption. | 713 | * More than one running task need preemption. |
694 | * nr_running update is assumed to be visible | 714 | * nr_running update is assumed to be visible |
695 | * after IPI is sent from wakers. | 715 | * after IPI is sent from wakers. |
@@ -996,6 +1016,13 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | |||
996 | rq_clock_skip_update(rq, true); | 1016 | rq_clock_skip_update(rq, true); |
997 | } | 1017 | } |
998 | 1018 | ||
1019 | static ATOMIC_NOTIFIER_HEAD(task_migration_notifier); | ||
1020 | |||
1021 | void register_task_migration_notifier(struct notifier_block *n) | ||
1022 | { | ||
1023 | atomic_notifier_chain_register(&task_migration_notifier, n); | ||
1024 | } | ||
1025 | |||
999 | #ifdef CONFIG_SMP | 1026 | #ifdef CONFIG_SMP |
1000 | void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | 1027 | void set_task_cpu(struct task_struct *p, unsigned int new_cpu) |
1001 | { | 1028 | { |
@@ -1026,10 +1053,18 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | |||
1026 | trace_sched_migrate_task(p, new_cpu); | 1053 | trace_sched_migrate_task(p, new_cpu); |
1027 | 1054 | ||
1028 | if (task_cpu(p) != new_cpu) { | 1055 | if (task_cpu(p) != new_cpu) { |
1056 | struct task_migration_notifier tmn; | ||
1057 | |||
1029 | if (p->sched_class->migrate_task_rq) | 1058 | if (p->sched_class->migrate_task_rq) |
1030 | p->sched_class->migrate_task_rq(p, new_cpu); | 1059 | p->sched_class->migrate_task_rq(p, new_cpu); |
1031 | p->se.nr_migrations++; | 1060 | p->se.nr_migrations++; |
1032 | perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0); | 1061 | perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0); |
1062 | |||
1063 | tmn.task = p; | ||
1064 | tmn.from_cpu = task_cpu(p); | ||
1065 | tmn.to_cpu = new_cpu; | ||
1066 | |||
1067 | atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn); | ||
1033 | } | 1068 | } |
1034 | 1069 | ||
1035 | __set_task_cpu(p, new_cpu); | 1070 | __set_task_cpu(p, new_cpu); |
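The notifier added above fires on every cross-CPU migration in set_task_cpu(), so consumers must keep their callbacks cheap. A hedged sketch of a client: the callback, counter, and registration helper are invented, and the struct and prototype are assumed to be declared in <linux/sched.h>:

#include <linux/atomic.h>
#include <linux/notifier.h>
#include <linux/sched.h>

static atomic_long_t example_nr_migrations = ATOMIC_LONG_INIT(0);

static int example_migration_cb(struct notifier_block *nb,
				unsigned long action, void *data)
{
	struct task_migration_notifier *tmn = data;

	/* Scheduler hot path: just count, do not sleep or take locks. */
	if (tmn->from_cpu != tmn->to_cpu)
		atomic_long_inc(&example_nr_migrations);
	return NOTIFY_OK;
}

static struct notifier_block example_migration_nb = {
	.notifier_call = example_migration_cb,
};

static void example_hook_migrations(void)
{
	register_task_migration_notifier(&example_migration_nb);
}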
@@ -2818,7 +2853,7 @@ asmlinkage __visible void __sched schedule_user(void) | |||
2818 | * we find a better solution. | 2853 | * we find a better solution. |
2819 | * | 2854 | * |
2820 | * NB: There are buggy callers of this function. Ideally we | 2855 | * NB: There are buggy callers of this function. Ideally we |
2821 | * should warn if prev_state != IN_USER, but that will trigger | 2856 | * should warn if prev_state != CONTEXT_USER, but that will trigger |
2822 | * too frequently to make sense yet. | 2857 | * too frequently to make sense yet. |
2823 | */ | 2858 | */ |
2824 | enum ctx_state prev_state = exception_enter(); | 2859 | enum ctx_state prev_state = exception_enter(); |
@@ -3034,6 +3069,8 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
3034 | } else { | 3069 | } else { |
3035 | if (dl_prio(oldprio)) | 3070 | if (dl_prio(oldprio)) |
3036 | p->dl.dl_boosted = 0; | 3071 | p->dl.dl_boosted = 0; |
3072 | if (rt_prio(oldprio)) | ||
3073 | p->rt.timeout = 0; | ||
3037 | p->sched_class = &fair_sched_class; | 3074 | p->sched_class = &fair_sched_class; |
3038 | } | 3075 | } |
3039 | 3076 | ||
@@ -5318,36 +5355,13 @@ static int sched_cpu_active(struct notifier_block *nfb, | |||
5318 | static int sched_cpu_inactive(struct notifier_block *nfb, | 5355 | static int sched_cpu_inactive(struct notifier_block *nfb, |
5319 | unsigned long action, void *hcpu) | 5356 | unsigned long action, void *hcpu) |
5320 | { | 5357 | { |
5321 | unsigned long flags; | ||
5322 | long cpu = (long)hcpu; | ||
5323 | struct dl_bw *dl_b; | ||
5324 | |||
5325 | switch (action & ~CPU_TASKS_FROZEN) { | 5358 | switch (action & ~CPU_TASKS_FROZEN) { |
5326 | case CPU_DOWN_PREPARE: | 5359 | case CPU_DOWN_PREPARE: |
5327 | set_cpu_active(cpu, false); | 5360 | set_cpu_active((long)hcpu, false); |
5328 | |||
5329 | /* explicitly allow suspend */ | ||
5330 | if (!(action & CPU_TASKS_FROZEN)) { | ||
5331 | bool overflow; | ||
5332 | int cpus; | ||
5333 | |||
5334 | rcu_read_lock_sched(); | ||
5335 | dl_b = dl_bw_of(cpu); | ||
5336 | |||
5337 | raw_spin_lock_irqsave(&dl_b->lock, flags); | ||
5338 | cpus = dl_bw_cpus(cpu); | ||
5339 | overflow = __dl_overflow(dl_b, cpus, 0, 0); | ||
5340 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); | ||
5341 | |||
5342 | rcu_read_unlock_sched(); | ||
5343 | |||
5344 | if (overflow) | ||
5345 | return notifier_from_errno(-EBUSY); | ||
5346 | } | ||
5347 | return NOTIFY_OK; | 5361 | return NOTIFY_OK; |
5362 | default: | ||
5363 | return NOTIFY_DONE; | ||
5348 | } | 5364 | } |
5349 | |||
5350 | return NOTIFY_DONE; | ||
5351 | } | 5365 | } |
5352 | 5366 | ||
5353 | static int __init migration_init(void) | 5367 | static int __init migration_init(void) |
@@ -5428,17 +5442,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
5428 | break; | 5442 | break; |
5429 | } | 5443 | } |
5430 | 5444 | ||
5431 | /* | ||
5432 | * Even though we initialize ->capacity to something semi-sane, | ||
5433 | * we leave capacity_orig unset. This allows us to detect if | ||
5434 | * domain iteration is still funny without causing /0 traps. | ||
5435 | */ | ||
5436 | if (!group->sgc->capacity_orig) { | ||
5437 | printk(KERN_CONT "\n"); | ||
5438 | printk(KERN_ERR "ERROR: domain->cpu_capacity not set\n"); | ||
5439 | break; | ||
5440 | } | ||
5441 | |||
5442 | if (!cpumask_weight(sched_group_cpus(group))) { | 5445 | if (!cpumask_weight(sched_group_cpus(group))) { |
5443 | printk(KERN_CONT "\n"); | 5446 | printk(KERN_CONT "\n"); |
5444 | printk(KERN_ERR "ERROR: empty group\n"); | 5447 | printk(KERN_ERR "ERROR: empty group\n"); |
@@ -5811,9 +5814,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | |||
5811 | update_top_cache_domain(cpu); | 5814 | update_top_cache_domain(cpu); |
5812 | } | 5815 | } |
5813 | 5816 | ||
5814 | /* cpus with isolated domains */ | ||
5815 | static cpumask_var_t cpu_isolated_map; | ||
5816 | |||
5817 | /* Setup the mask of cpus configured for isolated domains */ | 5817 | /* Setup the mask of cpus configured for isolated domains */ |
5818 | static int __init isolated_cpu_setup(char *str) | 5818 | static int __init isolated_cpu_setup(char *str) |
5819 | { | 5819 | { |
@@ -5922,7 +5922,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) | |||
5922 | * die on a /0 trap. | 5922 | * die on a /0 trap. |
5923 | */ | 5923 | */ |
5924 | sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); | 5924 | sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); |
5925 | sg->sgc->capacity_orig = sg->sgc->capacity; | ||
5926 | 5925 | ||
5927 | /* | 5926 | /* |
5928 | * Make sure the first group of this domain contains the | 5927 | * Make sure the first group of this domain contains the |
@@ -6233,6 +6232,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu) | |||
6233 | */ | 6232 | */ |
6234 | 6233 | ||
6235 | if (sd->flags & SD_SHARE_CPUCAPACITY) { | 6234 | if (sd->flags & SD_SHARE_CPUCAPACITY) { |
6235 | sd->flags |= SD_PREFER_SIBLING; | ||
6236 | sd->imbalance_pct = 110; | 6236 | sd->imbalance_pct = 110; |
6237 | sd->smt_gain = 1178; /* ~15% */ | 6237 | sd->smt_gain = 1178; /* ~15% */ |
6238 | 6238 | ||
@@ -6998,7 +6998,6 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, | |||
6998 | */ | 6998 | */ |
6999 | 6999 | ||
7000 | case CPU_ONLINE: | 7000 | case CPU_ONLINE: |
7001 | case CPU_DOWN_FAILED: | ||
7002 | cpuset_update_active_cpus(true); | 7001 | cpuset_update_active_cpus(true); |
7003 | break; | 7002 | break; |
7004 | default: | 7003 | default: |
@@ -7010,8 +7009,30 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, | |||
7010 | static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, | 7009 | static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, |
7011 | void *hcpu) | 7010 | void *hcpu) |
7012 | { | 7011 | { |
7013 | switch (action) { | 7012 | unsigned long flags; |
7013 | long cpu = (long)hcpu; | ||
7014 | struct dl_bw *dl_b; | ||
7015 | |||
7016 | switch (action & ~CPU_TASKS_FROZEN) { | ||
7014 | case CPU_DOWN_PREPARE: | 7017 | case CPU_DOWN_PREPARE: |
7018 | /* explicitly allow suspend */ | ||
7019 | if (!(action & CPU_TASKS_FROZEN)) { | ||
7020 | bool overflow; | ||
7021 | int cpus; | ||
7022 | |||
7023 | rcu_read_lock_sched(); | ||
7024 | dl_b = dl_bw_of(cpu); | ||
7025 | |||
7026 | raw_spin_lock_irqsave(&dl_b->lock, flags); | ||
7027 | cpus = dl_bw_cpus(cpu); | ||
7028 | overflow = __dl_overflow(dl_b, cpus, 0, 0); | ||
7029 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); | ||
7030 | |||
7031 | rcu_read_unlock_sched(); | ||
7032 | |||
7033 | if (overflow) | ||
7034 | return notifier_from_errno(-EBUSY); | ||
7035 | } | ||
7015 | cpuset_update_active_cpus(false); | 7036 | cpuset_update_active_cpus(false); |
7016 | break; | 7037 | break; |
7017 | case CPU_DOWN_PREPARE_FROZEN: | 7038 | case CPU_DOWN_PREPARE_FROZEN: |
@@ -7156,8 +7177,8 @@ void __init sched_init(void) | |||
7156 | rq->calc_load_active = 0; | 7177 | rq->calc_load_active = 0; |
7157 | rq->calc_load_update = jiffies + LOAD_FREQ; | 7178 | rq->calc_load_update = jiffies + LOAD_FREQ; |
7158 | init_cfs_rq(&rq->cfs); | 7179 | init_cfs_rq(&rq->cfs); |
7159 | init_rt_rq(&rq->rt, rq); | 7180 | init_rt_rq(&rq->rt); |
7160 | init_dl_rq(&rq->dl, rq); | 7181 | init_dl_rq(&rq->dl); |
7161 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7182 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7162 | root_task_group.shares = ROOT_TASK_GROUP_LOAD; | 7183 | root_task_group.shares = ROOT_TASK_GROUP_LOAD; |
7163 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); | 7184 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); |
@@ -7197,7 +7218,7 @@ void __init sched_init(void) | |||
7197 | #ifdef CONFIG_SMP | 7218 | #ifdef CONFIG_SMP |
7198 | rq->sd = NULL; | 7219 | rq->sd = NULL; |
7199 | rq->rd = NULL; | 7220 | rq->rd = NULL; |
7200 | rq->cpu_capacity = SCHED_CAPACITY_SCALE; | 7221 | rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE; |
7201 | rq->post_schedule = 0; | 7222 | rq->post_schedule = 0; |
7202 | rq->active_balance = 0; | 7223 | rq->active_balance = 0; |
7203 | rq->next_balance = jiffies; | 7224 | rq->next_balance = jiffies; |
@@ -7796,7 +7817,7 @@ static int sched_rt_global_constraints(void) | |||
7796 | } | 7817 | } |
7797 | #endif /* CONFIG_RT_GROUP_SCHED */ | 7818 | #endif /* CONFIG_RT_GROUP_SCHED */ |
7798 | 7819 | ||
7799 | static int sched_dl_global_constraints(void) | 7820 | static int sched_dl_global_validate(void) |
7800 | { | 7821 | { |
7801 | u64 runtime = global_rt_runtime(); | 7822 | u64 runtime = global_rt_runtime(); |
7802 | u64 period = global_rt_period(); | 7823 | u64 period = global_rt_period(); |
@@ -7897,11 +7918,11 @@ int sched_rt_handler(struct ctl_table *table, int write, | |||
7897 | if (ret) | 7918 | if (ret) |
7898 | goto undo; | 7919 | goto undo; |
7899 | 7920 | ||
7900 | ret = sched_rt_global_constraints(); | 7921 | ret = sched_dl_global_validate(); |
7901 | if (ret) | 7922 | if (ret) |
7902 | goto undo; | 7923 | goto undo; |
7903 | 7924 | ||
7904 | ret = sched_dl_global_constraints(); | 7925 | ret = sched_rt_global_constraints(); |
7905 | if (ret) | 7926 | if (ret) |
7906 | goto undo; | 7927 | goto undo; |
7907 | 7928 | ||
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 3fa8fa6d9403..5e95145088fd 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c | |||
@@ -69,7 +69,7 @@ void init_dl_bw(struct dl_bw *dl_b) | |||
69 | dl_b->total_bw = 0; | 69 | dl_b->total_bw = 0; |
70 | } | 70 | } |
71 | 71 | ||
72 | void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq) | 72 | void init_dl_rq(struct dl_rq *dl_rq) |
73 | { | 73 | { |
74 | dl_rq->rb_root = RB_ROOT; | 74 | dl_rq->rb_root = RB_ROOT; |
75 | 75 | ||
@@ -218,6 +218,52 @@ static inline void set_post_schedule(struct rq *rq) | |||
218 | rq->post_schedule = has_pushable_dl_tasks(rq); | 218 | rq->post_schedule = has_pushable_dl_tasks(rq); |
219 | } | 219 | } |
220 | 220 | ||
221 | static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq); | ||
222 | |||
223 | static void dl_task_offline_migration(struct rq *rq, struct task_struct *p) | ||
224 | { | ||
225 | struct rq *later_rq = NULL; | ||
226 | bool fallback = false; | ||
227 | |||
228 | later_rq = find_lock_later_rq(p, rq); | ||
229 | |||
230 | if (!later_rq) { | ||
231 | int cpu; | ||
232 | |||
233 | /* | ||
234 | * If we cannot preempt any rq, fall back to pick any | ||
235 | * online cpu. | ||
236 | */ | ||
237 | fallback = true; | ||
238 | cpu = cpumask_any_and(cpu_active_mask, tsk_cpus_allowed(p)); | ||
239 | if (cpu >= nr_cpu_ids) { | ||
240 | /* | ||
241 | * Fail to find any suitable cpu. | ||
242 | * The task will never come back! | ||
243 | */ | ||
244 | BUG_ON(dl_bandwidth_enabled()); | ||
245 | |||
246 | /* | ||
247 | * If admission control is disabled we | ||
248 | * try a little harder to let the task | ||
249 | * run. | ||
250 | */ | ||
251 | cpu = cpumask_any(cpu_active_mask); | ||
252 | } | ||
253 | later_rq = cpu_rq(cpu); | ||
254 | double_lock_balance(rq, later_rq); | ||
255 | } | ||
256 | |||
257 | deactivate_task(rq, p, 0); | ||
258 | set_task_cpu(p, later_rq->cpu); | ||
259 | activate_task(later_rq, p, ENQUEUE_REPLENISH); | ||
260 | |||
261 | if (!fallback) | ||
262 | resched_curr(later_rq); | ||
263 | |||
264 | double_unlock_balance(rq, later_rq); | ||
265 | } | ||
266 | |||
221 | #else | 267 | #else |
222 | 268 | ||
223 | static inline | 269 | static inline |
@@ -514,7 +560,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) | |||
514 | unsigned long flags; | 560 | unsigned long flags; |
515 | struct rq *rq; | 561 | struct rq *rq; |
516 | 562 | ||
517 | rq = task_rq_lock(current, &flags); | 563 | rq = task_rq_lock(p, &flags); |
518 | 564 | ||
519 | /* | 565 | /* |
520 | * We need to take care of several possible races here: | 566 | * We need to take care of several possible races here: |
@@ -536,6 +582,17 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) | |||
536 | sched_clock_tick(); | 582 | sched_clock_tick(); |
537 | update_rq_clock(rq); | 583 | update_rq_clock(rq); |
538 | 584 | ||
585 | #ifdef CONFIG_SMP | ||
586 | /* | ||
587 | * If we find that the rq the task was on is no longer | ||
588 | * available, we need to select a new rq. | ||
589 | */ | ||
590 | if (unlikely(!rq->online)) { | ||
591 | dl_task_offline_migration(rq, p); | ||
592 | goto unlock; | ||
593 | } | ||
594 | #endif | ||
595 | |||
539 | /* | 596 | /* |
540 | * If the throttle happened during sched-out; like: | 597 | * If the throttle happened during sched-out; like: |
541 | * | 598 | * |
@@ -569,7 +626,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) | |||
569 | push_dl_task(rq); | 626 | push_dl_task(rq); |
570 | #endif | 627 | #endif |
571 | unlock: | 628 | unlock: |
572 | task_rq_unlock(rq, current, &flags); | 629 | task_rq_unlock(rq, p, &flags); |
573 | 630 | ||
574 | return HRTIMER_NORESTART; | 631 | return HRTIMER_NORESTART; |
575 | } | 632 | } |
@@ -914,6 +971,12 @@ static void yield_task_dl(struct rq *rq) | |||
914 | } | 971 | } |
915 | update_rq_clock(rq); | 972 | update_rq_clock(rq); |
916 | update_curr_dl(rq); | 973 | update_curr_dl(rq); |
974 | /* | ||
975 | * Tell update_rq_clock() that we've just updated, | ||
976 | * so we don't do microscopic update in schedule() | ||
977 | * and double the fastpath cost. | ||
978 | */ | ||
979 | rq_clock_skip_update(rq, true); | ||
917 | } | 980 | } |
918 | 981 | ||
919 | #ifdef CONFIG_SMP | 982 | #ifdef CONFIG_SMP |
@@ -1659,14 +1722,6 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) | |||
1659 | { | 1722 | { |
1660 | int check_resched = 1; | 1723 | int check_resched = 1; |
1661 | 1724 | ||
1662 | /* | ||
1663 | * If p is throttled, don't consider the possibility | ||
1664 | * of preempting rq->curr, the check will be done right | ||
1665 | * after its runtime will get replenished. | ||
1666 | */ | ||
1667 | if (unlikely(p->dl.dl_throttled)) | ||
1668 | return; | ||
1669 | |||
1670 | if (task_on_rq_queued(p) && rq->curr != p) { | 1725 | if (task_on_rq_queued(p) && rq->curr != p) { |
1671 | #ifdef CONFIG_SMP | 1726 | #ifdef CONFIG_SMP |
1672 | if (p->nr_cpus_allowed > 1 && rq->dl.overloaded && | 1727 | if (p->nr_cpus_allowed > 1 && rq->dl.overloaded && |
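The fallback in dl_task_offline_migration() above reduces to a two-step CPU choice once no runqueue can be preempted. A stand-alone restatement of just that selection, with an invented helper name:

#include <linux/cpumask.h>
#include <linux/sched.h>

static int example_pick_target_cpu(struct task_struct *p)
{
	/* Prefer a CPU that is both active and allowed for the task. */
	int cpu = cpumask_any_and(cpu_active_mask, tsk_cpus_allowed(p));

	/*
	 * Empty intersection: as in the patch, try harder and accept any
	 * active CPU rather than losing the task entirely.
	 */
	if (cpu >= nr_cpu_ids)
		cpu = cpumask_any(cpu_active_mask);

	return cpu;
}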
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 8baaf858d25c..a245c1fc6f0a 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c | |||
@@ -71,7 +71,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group | |||
71 | if (!se) { | 71 | if (!se) { |
72 | struct sched_avg *avg = &cpu_rq(cpu)->avg; | 72 | struct sched_avg *avg = &cpu_rq(cpu)->avg; |
73 | P(avg->runnable_avg_sum); | 73 | P(avg->runnable_avg_sum); |
74 | P(avg->runnable_avg_period); | 74 | P(avg->avg_period); |
75 | return; | 75 | return; |
76 | } | 76 | } |
77 | 77 | ||
@@ -94,8 +94,10 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group | |||
94 | P(se->load.weight); | 94 | P(se->load.weight); |
95 | #ifdef CONFIG_SMP | 95 | #ifdef CONFIG_SMP |
96 | P(se->avg.runnable_avg_sum); | 96 | P(se->avg.runnable_avg_sum); |
97 | P(se->avg.runnable_avg_period); | 97 | P(se->avg.running_avg_sum); |
98 | P(se->avg.avg_period); | ||
98 | P(se->avg.load_avg_contrib); | 99 | P(se->avg.load_avg_contrib); |
100 | P(se->avg.utilization_avg_contrib); | ||
99 | P(se->avg.decay_count); | 101 | P(se->avg.decay_count); |
100 | #endif | 102 | #endif |
101 | #undef PN | 103 | #undef PN |
@@ -214,6 +216,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | |||
214 | cfs_rq->runnable_load_avg); | 216 | cfs_rq->runnable_load_avg); |
215 | SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg", | 217 | SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg", |
216 | cfs_rq->blocked_load_avg); | 218 | cfs_rq->blocked_load_avg); |
219 | SEQ_printf(m, " .%-30s: %ld\n", "utilization_load_avg", | ||
220 | cfs_rq->utilization_load_avg); | ||
217 | #ifdef CONFIG_FAIR_GROUP_SCHED | 221 | #ifdef CONFIG_FAIR_GROUP_SCHED |
218 | SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib", | 222 | SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib", |
219 | cfs_rq->tg_load_contrib); | 223 | cfs_rq->tg_load_contrib); |
@@ -636,8 +640,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
636 | P(se.load.weight); | 640 | P(se.load.weight); |
637 | #ifdef CONFIG_SMP | 641 | #ifdef CONFIG_SMP |
638 | P(se.avg.runnable_avg_sum); | 642 | P(se.avg.runnable_avg_sum); |
639 | P(se.avg.runnable_avg_period); | 643 | P(se.avg.running_avg_sum); |
644 | P(se.avg.avg_period); | ||
640 | P(se.avg.load_avg_contrib); | 645 | P(se.avg.load_avg_contrib); |
646 | P(se.avg.utilization_avg_contrib); | ||
641 | P(se.avg.decay_count); | 647 | P(se.avg.decay_count); |
642 | #endif | 648 | #endif |
643 | P(policy); | 649 | P(policy); |
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7ce18f3c097a..ffeaa4105e48 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
@@ -670,6 +670,7 @@ static int select_idle_sibling(struct task_struct *p, int cpu); | |||
670 | static unsigned long task_h_load(struct task_struct *p); | 670 | static unsigned long task_h_load(struct task_struct *p); |
671 | 671 | ||
672 | static inline void __update_task_entity_contrib(struct sched_entity *se); | 672 | static inline void __update_task_entity_contrib(struct sched_entity *se); |
673 | static inline void __update_task_entity_utilization(struct sched_entity *se); | ||
673 | 674 | ||
674 | /* Give new task start runnable values to heavy its load in infant time */ | 675 | /* Give new task start runnable values to heavy its load in infant time */ |
675 | void init_task_runnable_average(struct task_struct *p) | 676 | void init_task_runnable_average(struct task_struct *p) |
@@ -677,9 +678,10 @@ void init_task_runnable_average(struct task_struct *p) | |||
677 | u32 slice; | 678 | u32 slice; |
678 | 679 | ||
679 | slice = sched_slice(task_cfs_rq(p), &p->se) >> 10; | 680 | slice = sched_slice(task_cfs_rq(p), &p->se) >> 10; |
680 | p->se.avg.runnable_avg_sum = slice; | 681 | p->se.avg.runnable_avg_sum = p->se.avg.running_avg_sum = slice; |
681 | p->se.avg.runnable_avg_period = slice; | 682 | p->se.avg.avg_period = slice; |
682 | __update_task_entity_contrib(&p->se); | 683 | __update_task_entity_contrib(&p->se); |
684 | __update_task_entity_utilization(&p->se); | ||
683 | } | 685 | } |
684 | #else | 686 | #else |
685 | void init_task_runnable_average(struct task_struct *p) | 687 | void init_task_runnable_average(struct task_struct *p) |
@@ -1196,9 +1198,11 @@ static void task_numa_assign(struct task_numa_env *env, | |||
1196 | static bool load_too_imbalanced(long src_load, long dst_load, | 1198 | static bool load_too_imbalanced(long src_load, long dst_load, |
1197 | struct task_numa_env *env) | 1199 | struct task_numa_env *env) |
1198 | { | 1200 | { |
1199 | long imb, old_imb; | ||
1200 | long orig_src_load, orig_dst_load; | ||
1201 | long src_capacity, dst_capacity; | 1201 | long src_capacity, dst_capacity; |
1202 | long orig_src_load; | ||
1203 | long load_a, load_b; | ||
1204 | long moved_load; | ||
1205 | long imb; | ||
1202 | 1206 | ||
1203 | /* | 1207 | /* |
1204 | * The load is corrected for the CPU capacity available on each node. | 1208 | * The load is corrected for the CPU capacity available on each node. |
@@ -1211,30 +1215,39 @@ static bool load_too_imbalanced(long src_load, long dst_load, | |||
1211 | dst_capacity = env->dst_stats.compute_capacity; | 1215 | dst_capacity = env->dst_stats.compute_capacity; |
1212 | 1216 | ||
1213 | /* We care about the slope of the imbalance, not the direction. */ | 1217 | /* We care about the slope of the imbalance, not the direction. */ |
1214 | if (dst_load < src_load) | 1218 | load_a = dst_load; |
1215 | swap(dst_load, src_load); | 1219 | load_b = src_load; |
1220 | if (load_a < load_b) | ||
1221 | swap(load_a, load_b); | ||
1216 | 1222 | ||
1217 | /* Is the difference below the threshold? */ | 1223 | /* Is the difference below the threshold? */ |
1218 | imb = dst_load * src_capacity * 100 - | 1224 | imb = load_a * src_capacity * 100 - |
1219 | src_load * dst_capacity * env->imbalance_pct; | 1225 | load_b * dst_capacity * env->imbalance_pct; |
1220 | if (imb <= 0) | 1226 | if (imb <= 0) |
1221 | return false; | 1227 | return false; |
1222 | 1228 | ||
1223 | /* | 1229 | /* |
1224 | * The imbalance is above the allowed threshold. | 1230 | * The imbalance is above the allowed threshold. |
1225 | * Compare it with the old imbalance. | 1231 | * Allow a move that brings us closer to a balanced situation, |
1232 | * without moving things past the point of balance. | ||
1226 | */ | 1233 | */ |
1227 | orig_src_load = env->src_stats.load; | 1234 | orig_src_load = env->src_stats.load; |
1228 | orig_dst_load = env->dst_stats.load; | ||
1229 | 1235 | ||
1230 | if (orig_dst_load < orig_src_load) | 1236 | /* |
1231 | swap(orig_dst_load, orig_src_load); | 1237 | * In a task swap, there will be one load moving from src to dst, |
1232 | 1238 | * and another moving back. This is the net sum of both moves. | |
1233 | old_imb = orig_dst_load * src_capacity * 100 - | 1239 | * A simple task move will always have a positive value. |
1234 | orig_src_load * dst_capacity * env->imbalance_pct; | 1240 | * Allow the move if it brings the system closer to a balanced |
1241 | * situation, without crossing over the balance point. | ||
1242 | */ | ||
1243 | moved_load = orig_src_load - src_load; | ||
1235 | 1244 | ||
1236 | /* Would this change make things worse? */ | 1245 | if (moved_load > 0) |
1237 | return (imb > old_imb); | 1246 | /* Moving src -> dst. Did we overshoot balance? */ |
1247 | return src_load * dst_capacity < dst_load * src_capacity; | ||
1248 | else | ||
1249 | /* Moving dst -> src. Did we overshoot balance? */ | ||
1250 | return dst_load * src_capacity < src_load * dst_capacity; | ||
1238 | } | 1251 | } |
1239 | 1252 | ||
1240 | /* | 1253 | /* |
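A worked illustration of the rewritten load_too_imbalanced() check above, with all numbers invented: take src_capacity = dst_capacity = 1024, imbalance_pct = 110 and orig_src_load = 600. A proposed move leaving src_load = 400 and dst_load = 500 is over the threshold (500*100 > 400*110 per unit capacity), has moved_load = 600 - 400 = 200 > 0, and is rejected because 400*1024 < 500*1024: the destination would end up more loaded than the source, so the move overshoots the balance point. A smaller move leaving src_load = 480 and dst_load = 420 is still over the threshold (480*100 > 420*110) but keeps the source at or above the destination, so it is allowed.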
@@ -1609,9 +1622,11 @@ static void update_task_scan_period(struct task_struct *p, | |||
1609 | /* | 1622 | /* |
1610 | * If there were no record hinting faults then either the task is | 1623 | * If there were no record hinting faults then either the task is |
1611 | * completely idle or all activity is areas that are not of interest | 1624 | * completely idle or all activity is areas that are not of interest |
1612 | * to automatic numa balancing. Scan slower | 1625 | * to automatic numa balancing. Related to that, if there were failed |
1626 | * migrations then it implies we are migrating too quickly or the local | ||
1627 | * node is overloaded. In either case, scan slower | ||
1613 | */ | 1628 | */ |
1614 | if (local + shared == 0) { | 1629 | if (local + shared == 0 || p->numa_faults_locality[2]) { |
1615 | p->numa_scan_period = min(p->numa_scan_period_max, | 1630 | p->numa_scan_period = min(p->numa_scan_period_max, |
1616 | p->numa_scan_period << 1); | 1631 | p->numa_scan_period << 1); |
1617 | 1632 | ||
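The effect of the new failed-migration signal can be pictured with a small sketch; the function and its parameters are simplified stand-ins for the task_struct fields used above, not the in-tree code. The scan period is doubled, capped at the maximum, when there is no locality signal at all or when migrations have been failing:

static unsigned int slow_down_scan(unsigned int period, unsigned int period_max,
				   unsigned long faults_local,
				   unsigned long faults_shared,
				   unsigned long faults_migrate_failed)
{
	unsigned int doubled = period * 2;

	/* no locality data, or migrations keep failing: back off */
	if (faults_local + faults_shared == 0 || faults_migrate_failed)
		return doubled < period_max ? doubled : period_max;

	return period;	/* otherwise the real code re-tunes the period */
}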
@@ -1673,7 +1688,7 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) | |||
1673 | *period = now - p->last_task_numa_placement; | 1688 | *period = now - p->last_task_numa_placement; |
1674 | } else { | 1689 | } else { |
1675 | delta = p->se.avg.runnable_avg_sum; | 1690 | delta = p->se.avg.runnable_avg_sum; |
1676 | *period = p->se.avg.runnable_avg_period; | 1691 | *period = p->se.avg.avg_period; |
1677 | } | 1692 | } |
1678 | 1693 | ||
1679 | p->last_sum_exec_runtime = runtime; | 1694 | p->last_sum_exec_runtime = runtime; |
@@ -1763,6 +1778,8 @@ static int preferred_group_nid(struct task_struct *p, int nid) | |||
1763 | } | 1778 | } |
1764 | } | 1779 | } |
1765 | /* Next round, evaluate the nodes within max_group. */ | 1780 | /* Next round, evaluate the nodes within max_group. */ |
1781 | if (!max_faults) | ||
1782 | break; | ||
1766 | nodes = max_group; | 1783 | nodes = max_group; |
1767 | } | 1784 | } |
1768 | return nid; | 1785 | return nid; |
@@ -2080,6 +2097,8 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) | |||
2080 | 2097 | ||
2081 | if (migrated) | 2098 | if (migrated) |
2082 | p->numa_pages_migrated += pages; | 2099 | p->numa_pages_migrated += pages; |
2100 | if (flags & TNF_MIGRATE_FAIL) | ||
2101 | p->numa_faults_locality[2] += pages; | ||
2083 | 2102 | ||
2084 | p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages; | 2103 | p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages; |
2085 | p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages; | 2104 | p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages; |
@@ -2161,8 +2180,10 @@ void task_numa_work(struct callback_head *work) | |||
2161 | vma = mm->mmap; | 2180 | vma = mm->mmap; |
2162 | } | 2181 | } |
2163 | for (; vma; vma = vma->vm_next) { | 2182 | for (; vma; vma = vma->vm_next) { |
2164 | if (!vma_migratable(vma) || !vma_policy_mof(vma)) | 2183 | if (!vma_migratable(vma) || !vma_policy_mof(vma) || |
2184 | is_vm_hugetlb_page(vma)) { | ||
2165 | continue; | 2185 | continue; |
2186 | } | ||
2166 | 2187 | ||
2167 | /* | 2188 | /* |
2168 | * Shared library pages mapped by multiple processes are not | 2189 | * Shared library pages mapped by multiple processes are not |
@@ -2497,13 +2518,15 @@ static u32 __compute_runnable_contrib(u64 n) | |||
2497 | * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) | 2518 | * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) |
2498 | * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] | 2519 | * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] |
2499 | */ | 2520 | */ |
2500 | static __always_inline int __update_entity_runnable_avg(u64 now, | 2521 | static __always_inline int __update_entity_runnable_avg(u64 now, int cpu, |
2501 | struct sched_avg *sa, | 2522 | struct sched_avg *sa, |
2502 | int runnable) | 2523 | int runnable, |
2524 | int running) | ||
2503 | { | 2525 | { |
2504 | u64 delta, periods; | 2526 | u64 delta, periods; |
2505 | u32 runnable_contrib; | 2527 | u32 runnable_contrib; |
2506 | int delta_w, decayed = 0; | 2528 | int delta_w, decayed = 0; |
2529 | unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu); | ||
2507 | 2530 | ||
2508 | delta = now - sa->last_runnable_update; | 2531 | delta = now - sa->last_runnable_update; |
2509 | /* | 2532 | /* |
@@ -2525,7 +2548,7 @@ static __always_inline int __update_entity_runnable_avg(u64 now, | |||
2525 | sa->last_runnable_update = now; | 2548 | sa->last_runnable_update = now; |
2526 | 2549 | ||
2527 | /* delta_w is the amount already accumulated against our next period */ | 2550 | /* delta_w is the amount already accumulated against our next period */ |
2528 | delta_w = sa->runnable_avg_period % 1024; | 2551 | delta_w = sa->avg_period % 1024; |
2529 | if (delta + delta_w >= 1024) { | 2552 | if (delta + delta_w >= 1024) { |
2530 | /* period roll-over */ | 2553 | /* period roll-over */ |
2531 | decayed = 1; | 2554 | decayed = 1; |
@@ -2538,7 +2561,10 @@ static __always_inline int __update_entity_runnable_avg(u64 now, | |||
2538 | delta_w = 1024 - delta_w; | 2561 | delta_w = 1024 - delta_w; |
2539 | if (runnable) | 2562 | if (runnable) |
2540 | sa->runnable_avg_sum += delta_w; | 2563 | sa->runnable_avg_sum += delta_w; |
2541 | sa->runnable_avg_period += delta_w; | 2564 | if (running) |
2565 | sa->running_avg_sum += delta_w * scale_freq | ||
2566 | >> SCHED_CAPACITY_SHIFT; | ||
2567 | sa->avg_period += delta_w; | ||
2542 | 2568 | ||
2543 | delta -= delta_w; | 2569 | delta -= delta_w; |
2544 | 2570 | ||
@@ -2548,20 +2574,28 @@ static __always_inline int __update_entity_runnable_avg(u64 now, | |||
2548 | 2574 | ||
2549 | sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum, | 2575 | sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum, |
2550 | periods + 1); | 2576 | periods + 1); |
2551 | sa->runnable_avg_period = decay_load(sa->runnable_avg_period, | 2577 | sa->running_avg_sum = decay_load(sa->running_avg_sum, |
2578 | periods + 1); | ||
2579 | sa->avg_period = decay_load(sa->avg_period, | ||
2552 | periods + 1); | 2580 | periods + 1); |
2553 | 2581 | ||
2554 | /* Efficiently calculate \sum (1..n_period) 1024*y^i */ | 2582 | /* Efficiently calculate \sum (1..n_period) 1024*y^i */ |
2555 | runnable_contrib = __compute_runnable_contrib(periods); | 2583 | runnable_contrib = __compute_runnable_contrib(periods); |
2556 | if (runnable) | 2584 | if (runnable) |
2557 | sa->runnable_avg_sum += runnable_contrib; | 2585 | sa->runnable_avg_sum += runnable_contrib; |
2558 | sa->runnable_avg_period += runnable_contrib; | 2586 | if (running) |
2587 | sa->running_avg_sum += runnable_contrib * scale_freq | ||
2588 | >> SCHED_CAPACITY_SHIFT; | ||
2589 | sa->avg_period += runnable_contrib; | ||
2559 | } | 2590 | } |
2560 | 2591 | ||
2561 | /* Remainder of delta accrued against u_0` */ | 2592 | /* Remainder of delta accrued against u_0` */ |
2562 | if (runnable) | 2593 | if (runnable) |
2563 | sa->runnable_avg_sum += delta; | 2594 | sa->runnable_avg_sum += delta; |
2564 | sa->runnable_avg_period += delta; | 2595 | if (running) |
2596 | sa->running_avg_sum += delta * scale_freq | ||
2597 | >> SCHED_CAPACITY_SHIFT; | ||
2598 | sa->avg_period += delta; | ||
2565 | 2599 | ||
2566 | return decayed; | 2600 | return decayed; |
2567 | } | 2601 | } |
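To summarize the new bookkeeping in __update_entity_runnable_avg(): runnable time still feeds runnable_avg_sum, but running time now also feeds a separate running_avg_sum that is scaled by arch_scale_freq_capacity(), making utilization frequency-invariant. A reduced C sketch of a single accounting step; SCALE_SHIFT stands in for SCHED_CAPACITY_SHIFT, the struct is a simplified stand-in for sched_avg, and the period decay is omitted:

#define SCALE_SHIFT 10	/* stands in for SCHED_CAPACITY_SHIFT */

struct avg_sketch {
	unsigned long runnable_avg_sum;	/* time spent runnable */
	unsigned long running_avg_sum;	/* time spent running, freq-scaled */
	unsigned long avg_period;	/* total elapsed time */
};

/* Account 'delta' units of time against the averages (no decay shown). */
static void account_delta(struct avg_sketch *sa, unsigned long delta,
			  int runnable, int running, unsigned long scale_freq)
{
	if (runnable)
		sa->runnable_avg_sum += delta;
	if (running)
		sa->running_avg_sum += (delta * scale_freq) >> SCALE_SHIFT;
	sa->avg_period += delta;
}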
@@ -2578,6 +2612,8 @@ static inline u64 __synchronize_entity_decay(struct sched_entity *se) | |||
2578 | return 0; | 2612 | return 0; |
2579 | 2613 | ||
2580 | se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); | 2614 | se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); |
2615 | se->avg.utilization_avg_contrib = | ||
2616 | decay_load(se->avg.utilization_avg_contrib, decays); | ||
2581 | 2617 | ||
2582 | return decays; | 2618 | return decays; |
2583 | } | 2619 | } |
@@ -2613,7 +2649,7 @@ static inline void __update_tg_runnable_avg(struct sched_avg *sa, | |||
2613 | 2649 | ||
2614 | /* The fraction of a cpu used by this cfs_rq */ | 2650 | /* The fraction of a cpu used by this cfs_rq */ |
2615 | contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT, | 2651 | contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT, |
2616 | sa->runnable_avg_period + 1); | 2652 | sa->avg_period + 1); |
2617 | contrib -= cfs_rq->tg_runnable_contrib; | 2653 | contrib -= cfs_rq->tg_runnable_contrib; |
2618 | 2654 | ||
2619 | if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) { | 2655 | if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) { |
@@ -2666,7 +2702,8 @@ static inline void __update_group_entity_contrib(struct sched_entity *se) | |||
2666 | 2702 | ||
2667 | static inline void update_rq_runnable_avg(struct rq *rq, int runnable) | 2703 | static inline void update_rq_runnable_avg(struct rq *rq, int runnable) |
2668 | { | 2704 | { |
2669 | __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable); | 2705 | __update_entity_runnable_avg(rq_clock_task(rq), cpu_of(rq), &rq->avg, |
2706 | runnable, runnable); | ||
2670 | __update_tg_runnable_avg(&rq->avg, &rq->cfs); | 2707 | __update_tg_runnable_avg(&rq->avg, &rq->cfs); |
2671 | } | 2708 | } |
2672 | #else /* CONFIG_FAIR_GROUP_SCHED */ | 2709 | #else /* CONFIG_FAIR_GROUP_SCHED */ |
@@ -2684,7 +2721,7 @@ static inline void __update_task_entity_contrib(struct sched_entity *se) | |||
2684 | 2721 | ||
2685 | /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ | 2722 | /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ |
2686 | contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight); | 2723 | contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight); |
2687 | contrib /= (se->avg.runnable_avg_period + 1); | 2724 | contrib /= (se->avg.avg_period + 1); |
2688 | se->avg.load_avg_contrib = scale_load(contrib); | 2725 | se->avg.load_avg_contrib = scale_load(contrib); |
2689 | } | 2726 | } |
2690 | 2727 | ||
@@ -2703,6 +2740,30 @@ static long __update_entity_load_avg_contrib(struct sched_entity *se) | |||
2703 | return se->avg.load_avg_contrib - old_contrib; | 2740 | return se->avg.load_avg_contrib - old_contrib; |
2704 | } | 2741 | } |
2705 | 2742 | ||
2743 | |||
2744 | static inline void __update_task_entity_utilization(struct sched_entity *se) | ||
2745 | { | ||
2746 | u32 contrib; | ||
2747 | |||
2748 | /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ | ||
2749 | contrib = se->avg.running_avg_sum * scale_load_down(SCHED_LOAD_SCALE); | ||
2750 | contrib /= (se->avg.avg_period + 1); | ||
2751 | se->avg.utilization_avg_contrib = scale_load(contrib); | ||
2752 | } | ||
2753 | |||
2754 | static long __update_entity_utilization_avg_contrib(struct sched_entity *se) | ||
2755 | { | ||
2756 | long old_contrib = se->avg.utilization_avg_contrib; | ||
2757 | |||
2758 | if (entity_is_task(se)) | ||
2759 | __update_task_entity_utilization(se); | ||
2760 | else | ||
2761 | se->avg.utilization_avg_contrib = | ||
2762 | group_cfs_rq(se)->utilization_load_avg; | ||
2763 | |||
2764 | return se->avg.utilization_avg_contrib - old_contrib; | ||
2765 | } | ||
2766 | |||
2706 | static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq, | 2767 | static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq, |
2707 | long load_contrib) | 2768 | long load_contrib) |
2708 | { | 2769 | { |
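The new helpers above boil down to one ratio: a task's utilization_avg_contrib is its frequency-scaled running time divided by the total accumulated period, scaled up to SCHED_LOAD_SCALE. A one-function C sketch, with parameter names mirroring the fields above and the +1 avoiding a divide by zero just like the kernel code:

static unsigned long utilization_contrib(unsigned int running_avg_sum,
					 unsigned int avg_period,
					 unsigned long sched_load_scale)
{
	/* running time as a fraction of the period, in load-scale units */
	return ((unsigned long)running_avg_sum * sched_load_scale) /
	       (avg_period + 1);
}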
@@ -2719,7 +2780,8 @@ static inline void update_entity_load_avg(struct sched_entity *se, | |||
2719 | int update_cfs_rq) | 2780 | int update_cfs_rq) |
2720 | { | 2781 | { |
2721 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 2782 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
2722 | long contrib_delta; | 2783 | long contrib_delta, utilization_delta; |
2784 | int cpu = cpu_of(rq_of(cfs_rq)); | ||
2723 | u64 now; | 2785 | u64 now; |
2724 | 2786 | ||
2725 | /* | 2787 | /* |
@@ -2731,18 +2793,22 @@ static inline void update_entity_load_avg(struct sched_entity *se, | |||
2731 | else | 2793 | else |
2732 | now = cfs_rq_clock_task(group_cfs_rq(se)); | 2794 | now = cfs_rq_clock_task(group_cfs_rq(se)); |
2733 | 2795 | ||
2734 | if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq)) | 2796 | if (!__update_entity_runnable_avg(now, cpu, &se->avg, se->on_rq, |
2797 | cfs_rq->curr == se)) | ||
2735 | return; | 2798 | return; |
2736 | 2799 | ||
2737 | contrib_delta = __update_entity_load_avg_contrib(se); | 2800 | contrib_delta = __update_entity_load_avg_contrib(se); |
2801 | utilization_delta = __update_entity_utilization_avg_contrib(se); | ||
2738 | 2802 | ||
2739 | if (!update_cfs_rq) | 2803 | if (!update_cfs_rq) |
2740 | return; | 2804 | return; |
2741 | 2805 | ||
2742 | if (se->on_rq) | 2806 | if (se->on_rq) { |
2743 | cfs_rq->runnable_load_avg += contrib_delta; | 2807 | cfs_rq->runnable_load_avg += contrib_delta; |
2744 | else | 2808 | cfs_rq->utilization_load_avg += utilization_delta; |
2809 | } else { | ||
2745 | subtract_blocked_load_contrib(cfs_rq, -contrib_delta); | 2810 | subtract_blocked_load_contrib(cfs_rq, -contrib_delta); |
2811 | } | ||
2746 | } | 2812 | } |
2747 | 2813 | ||
2748 | /* | 2814 | /* |
@@ -2817,6 +2883,7 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, | |||
2817 | } | 2883 | } |
2818 | 2884 | ||
2819 | cfs_rq->runnable_load_avg += se->avg.load_avg_contrib; | 2885 | cfs_rq->runnable_load_avg += se->avg.load_avg_contrib; |
2886 | cfs_rq->utilization_load_avg += se->avg.utilization_avg_contrib; | ||
2820 | /* we force update consideration on load-balancer moves */ | 2887 | /* we force update consideration on load-balancer moves */ |
2821 | update_cfs_rq_blocked_load(cfs_rq, !wakeup); | 2888 | update_cfs_rq_blocked_load(cfs_rq, !wakeup); |
2822 | } | 2889 | } |
@@ -2835,6 +2902,7 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, | |||
2835 | update_cfs_rq_blocked_load(cfs_rq, !sleep); | 2902 | update_cfs_rq_blocked_load(cfs_rq, !sleep); |
2836 | 2903 | ||
2837 | cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib; | 2904 | cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib; |
2905 | cfs_rq->utilization_load_avg -= se->avg.utilization_avg_contrib; | ||
2838 | if (sleep) { | 2906 | if (sleep) { |
2839 | cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; | 2907 | cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; |
2840 | se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); | 2908 | se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); |
@@ -3172,6 +3240,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
3172 | */ | 3240 | */ |
3173 | update_stats_wait_end(cfs_rq, se); | 3241 | update_stats_wait_end(cfs_rq, se); |
3174 | __dequeue_entity(cfs_rq, se); | 3242 | __dequeue_entity(cfs_rq, se); |
3243 | update_entity_load_avg(se, 1); | ||
3175 | } | 3244 | } |
3176 | 3245 | ||
3177 | update_stats_curr_start(cfs_rq, se); | 3246 | update_stats_curr_start(cfs_rq, se); |
@@ -4298,6 +4367,11 @@ static unsigned long capacity_of(int cpu) | |||
4298 | return cpu_rq(cpu)->cpu_capacity; | 4367 | return cpu_rq(cpu)->cpu_capacity; |
4299 | } | 4368 | } |
4300 | 4369 | ||
4370 | static unsigned long capacity_orig_of(int cpu) | ||
4371 | { | ||
4372 | return cpu_rq(cpu)->cpu_capacity_orig; | ||
4373 | } | ||
4374 | |||
4301 | static unsigned long cpu_avg_load_per_task(int cpu) | 4375 | static unsigned long cpu_avg_load_per_task(int cpu) |
4302 | { | 4376 | { |
4303 | struct rq *rq = cpu_rq(cpu); | 4377 | struct rq *rq = cpu_rq(cpu); |
@@ -4711,6 +4785,33 @@ next: | |||
4711 | done: | 4785 | done: |
4712 | return target; | 4786 | return target; |
4713 | } | 4787 | } |
4788 | /* | ||
4789 | * get_cpu_usage returns the amount of capacity of a CPU that is used by CFS | ||
4790 | * tasks. The unit of the return value must be the one of capacity so we can | ||
4791 | * compare the usage with the capacity of the CPU that is available for CFS | ||
4792 | * task (ie cpu_capacity). | ||
4793 | * cfs.utilization_load_avg is the sum of running time of runnable tasks on a | ||
4794 | * CPU. It represents the amount of utilization of a CPU in the range | ||
4795 | * [0..SCHED_LOAD_SCALE]. The usage of a CPU can't be higher than the full | ||
4796 | * capacity of the CPU because it's about the running time on this CPU. | ||
4797 | * Nevertheless, cfs.utilization_load_avg can be higher than SCHED_LOAD_SCALE | ||
4798 | * because of unfortunate rounding in avg_period and running_load_avg or just | ||
4799 | * after migrating tasks until the average stabilizes with the new running | ||
4800 | * time. So we need to check that the usage stays into the range | ||
4801 | * time. So we need to check that the usage stays within the range | ||
4802 | * Without capping the usage, a group could be seen as overloaded (CPU0 usage | ||
4803 | * at 121% + CPU1 usage at 80%) whereas CPU1 has 20% of available capacity | ||
4804 | */ | ||
4805 | static int get_cpu_usage(int cpu) | ||
4806 | { | ||
4807 | unsigned long usage = cpu_rq(cpu)->cfs.utilization_load_avg; | ||
4808 | unsigned long capacity = capacity_orig_of(cpu); | ||
4809 | |||
4810 | if (usage >= SCHED_LOAD_SCALE) | ||
4811 | return capacity; | ||
4812 | |||
4813 | return (usage * capacity) >> SCHED_LOAD_SHIFT; | ||
4814 | } | ||
4714 | 4815 | ||
4715 | /* | 4816 | /* |
4716 | * select_task_rq_fair: Select target runqueue for the waking task in domains | 4817 | * select_task_rq_fair: Select target runqueue for the waking task in domains |
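A quick numeric check of the capping described above, assuming SCHED_LOAD_SCALE and SCHED_CAPACITY_SCALE are both 1024: on a CPU with cpu_capacity_orig = 800, a utilization_load_avg of 512 reports a usage of (512 * 800) >> 10 = 400, while a transient utilization_load_avg of 1240 (above SCHED_LOAD_SCALE, e.g. right after migrations) is clamped to the full 800 instead of reporting more capacity used than the CPU actually has.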
@@ -5837,12 +5938,12 @@ struct sg_lb_stats { | |||
5837 | unsigned long sum_weighted_load; /* Weighted load of group's tasks */ | 5938 | unsigned long sum_weighted_load; /* Weighted load of group's tasks */ |
5838 | unsigned long load_per_task; | 5939 | unsigned long load_per_task; |
5839 | unsigned long group_capacity; | 5940 | unsigned long group_capacity; |
5941 | unsigned long group_usage; /* Total usage of the group */ | ||
5840 | unsigned int sum_nr_running; /* Nr tasks running in the group */ | 5942 | unsigned int sum_nr_running; /* Nr tasks running in the group */ |
5841 | unsigned int group_capacity_factor; | ||
5842 | unsigned int idle_cpus; | 5943 | unsigned int idle_cpus; |
5843 | unsigned int group_weight; | 5944 | unsigned int group_weight; |
5844 | enum group_type group_type; | 5945 | enum group_type group_type; |
5845 | int group_has_free_capacity; | 5946 | int group_no_capacity; |
5846 | #ifdef CONFIG_NUMA_BALANCING | 5947 | #ifdef CONFIG_NUMA_BALANCING |
5847 | unsigned int nr_numa_running; | 5948 | unsigned int nr_numa_running; |
5848 | unsigned int nr_preferred_running; | 5949 | unsigned int nr_preferred_running; |
@@ -5913,16 +6014,6 @@ static inline int get_sd_load_idx(struct sched_domain *sd, | |||
5913 | return load_idx; | 6014 | return load_idx; |
5914 | } | 6015 | } |
5915 | 6016 | ||
5916 | static unsigned long default_scale_capacity(struct sched_domain *sd, int cpu) | ||
5917 | { | ||
5918 | return SCHED_CAPACITY_SCALE; | ||
5919 | } | ||
5920 | |||
5921 | unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu) | ||
5922 | { | ||
5923 | return default_scale_capacity(sd, cpu); | ||
5924 | } | ||
5925 | |||
5926 | static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu) | 6017 | static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu) |
5927 | { | 6018 | { |
5928 | if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) | 6019 | if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) |
@@ -5939,7 +6030,7 @@ unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) | |||
5939 | static unsigned long scale_rt_capacity(int cpu) | 6030 | static unsigned long scale_rt_capacity(int cpu) |
5940 | { | 6031 | { |
5941 | struct rq *rq = cpu_rq(cpu); | 6032 | struct rq *rq = cpu_rq(cpu); |
5942 | u64 total, available, age_stamp, avg; | 6033 | u64 total, used, age_stamp, avg; |
5943 | s64 delta; | 6034 | s64 delta; |
5944 | 6035 | ||
5945 | /* | 6036 | /* |
@@ -5955,19 +6046,12 @@ static unsigned long scale_rt_capacity(int cpu) | |||
5955 | 6046 | ||
5956 | total = sched_avg_period() + delta; | 6047 | total = sched_avg_period() + delta; |
5957 | 6048 | ||
5958 | if (unlikely(total < avg)) { | 6049 | used = div_u64(avg, total); |
5959 | /* Ensures that capacity won't end up being negative */ | ||
5960 | available = 0; | ||
5961 | } else { | ||
5962 | available = total - avg; | ||
5963 | } | ||
5964 | |||
5965 | if (unlikely((s64)total < SCHED_CAPACITY_SCALE)) | ||
5966 | total = SCHED_CAPACITY_SCALE; | ||
5967 | 6050 | ||
5968 | total >>= SCHED_CAPACITY_SHIFT; | 6051 | if (likely(used < SCHED_CAPACITY_SCALE)) |
6052 | return SCHED_CAPACITY_SCALE - used; | ||
5969 | 6053 | ||
5970 | return div_u64(available, total); | 6054 | return 1; |
5971 | } | 6055 | } |
5972 | 6056 | ||
5973 | static void update_cpu_capacity(struct sched_domain *sd, int cpu) | 6057 | static void update_cpu_capacity(struct sched_domain *sd, int cpu) |
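With rt_avg now pre-scaled by arch_scale_freq_capacity() (see the sched_rt_avg_update() change in kernel/sched/sched.h further down), avg/total directly yields the capacity consumed by RT and IRQ work in SCHED_CAPACITY_SCALE units. For example, if RT plus IRQ activity consumed about a quarter of the averaging window at full capacity, used is roughly 256 and scale_rt_capacity() returns 1024 - 256 = 768, i.e. about 75% of the CPU is left for CFS; the floor of 1 only guards against a fully consumed CPU producing a zero capacity.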
@@ -5982,14 +6066,7 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) | |||
5982 | 6066 | ||
5983 | capacity >>= SCHED_CAPACITY_SHIFT; | 6067 | capacity >>= SCHED_CAPACITY_SHIFT; |
5984 | 6068 | ||
5985 | sdg->sgc->capacity_orig = capacity; | 6069 | cpu_rq(cpu)->cpu_capacity_orig = capacity; |
5986 | |||
5987 | if (sched_feat(ARCH_CAPACITY)) | ||
5988 | capacity *= arch_scale_freq_capacity(sd, cpu); | ||
5989 | else | ||
5990 | capacity *= default_scale_capacity(sd, cpu); | ||
5991 | |||
5992 | capacity >>= SCHED_CAPACITY_SHIFT; | ||
5993 | 6070 | ||
5994 | capacity *= scale_rt_capacity(cpu); | 6071 | capacity *= scale_rt_capacity(cpu); |
5995 | capacity >>= SCHED_CAPACITY_SHIFT; | 6072 | capacity >>= SCHED_CAPACITY_SHIFT; |
@@ -6005,7 +6082,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) | |||
6005 | { | 6082 | { |
6006 | struct sched_domain *child = sd->child; | 6083 | struct sched_domain *child = sd->child; |
6007 | struct sched_group *group, *sdg = sd->groups; | 6084 | struct sched_group *group, *sdg = sd->groups; |
6008 | unsigned long capacity, capacity_orig; | 6085 | unsigned long capacity; |
6009 | unsigned long interval; | 6086 | unsigned long interval; |
6010 | 6087 | ||
6011 | interval = msecs_to_jiffies(sd->balance_interval); | 6088 | interval = msecs_to_jiffies(sd->balance_interval); |
@@ -6017,7 +6094,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) | |||
6017 | return; | 6094 | return; |
6018 | } | 6095 | } |
6019 | 6096 | ||
6020 | capacity_orig = capacity = 0; | 6097 | capacity = 0; |
6021 | 6098 | ||
6022 | if (child->flags & SD_OVERLAP) { | 6099 | if (child->flags & SD_OVERLAP) { |
6023 | /* | 6100 | /* |
@@ -6037,19 +6114,15 @@ void update_group_capacity(struct sched_domain *sd, int cpu) | |||
6037 | * Use capacity_of(), which is set irrespective of domains | 6114 | * Use capacity_of(), which is set irrespective of domains |
6038 | * in update_cpu_capacity(). | 6115 | * in update_cpu_capacity(). |
6039 | * | 6116 | * |
6040 | * This avoids capacity/capacity_orig from being 0 and | 6117 | * This avoids capacity from being 0 and |
6041 | * causing divide-by-zero issues on boot. | 6118 | * causing divide-by-zero issues on boot. |
6042 | * | ||
6043 | * Runtime updates will correct capacity_orig. | ||
6044 | */ | 6119 | */ |
6045 | if (unlikely(!rq->sd)) { | 6120 | if (unlikely(!rq->sd)) { |
6046 | capacity_orig += capacity_of(cpu); | ||
6047 | capacity += capacity_of(cpu); | 6121 | capacity += capacity_of(cpu); |
6048 | continue; | 6122 | continue; |
6049 | } | 6123 | } |
6050 | 6124 | ||
6051 | sgc = rq->sd->groups->sgc; | 6125 | sgc = rq->sd->groups->sgc; |
6052 | capacity_orig += sgc->capacity_orig; | ||
6053 | capacity += sgc->capacity; | 6126 | capacity += sgc->capacity; |
6054 | } | 6127 | } |
6055 | } else { | 6128 | } else { |
@@ -6060,39 +6133,24 @@ void update_group_capacity(struct sched_domain *sd, int cpu) | |||
6060 | 6133 | ||
6061 | group = child->groups; | 6134 | group = child->groups; |
6062 | do { | 6135 | do { |
6063 | capacity_orig += group->sgc->capacity_orig; | ||
6064 | capacity += group->sgc->capacity; | 6136 | capacity += group->sgc->capacity; |
6065 | group = group->next; | 6137 | group = group->next; |
6066 | } while (group != child->groups); | 6138 | } while (group != child->groups); |
6067 | } | 6139 | } |
6068 | 6140 | ||
6069 | sdg->sgc->capacity_orig = capacity_orig; | ||
6070 | sdg->sgc->capacity = capacity; | 6141 | sdg->sgc->capacity = capacity; |
6071 | } | 6142 | } |
6072 | 6143 | ||
6073 | /* | 6144 | /* |
6074 | * Try and fix up capacity for tiny siblings, this is needed when | 6145 | * Check whether the capacity of the rq has been noticeably reduced by side |
6075 | * things like SD_ASYM_PACKING need f_b_g to select another sibling | 6146 | * activity. The imbalance_pct is used for the threshold. |
6076 | * which on its own isn't powerful enough. | 6147 | * Return true if the capacity is reduced |
6077 | * | ||
6078 | * See update_sd_pick_busiest() and check_asym_packing(). | ||
6079 | */ | 6148 | */ |
6080 | static inline int | 6149 | static inline int |
6081 | fix_small_capacity(struct sched_domain *sd, struct sched_group *group) | 6150 | check_cpu_capacity(struct rq *rq, struct sched_domain *sd) |
6082 | { | 6151 | { |
6083 | /* | 6152 | return ((rq->cpu_capacity * sd->imbalance_pct) < |
6084 | * Only siblings can have significantly less than SCHED_CAPACITY_SCALE | 6153 | (rq->cpu_capacity_orig * 100)); |
6085 | */ | ||
6086 | if (!(sd->flags & SD_SHARE_CPUCAPACITY)) | ||
6087 | return 0; | ||
6088 | |||
6089 | /* | ||
6090 | * If ~90% of the cpu_capacity is still there, we're good. | ||
6091 | */ | ||
6092 | if (group->sgc->capacity * 32 > group->sgc->capacity_orig * 29) | ||
6093 | return 1; | ||
6094 | |||
6095 | return 0; | ||
6096 | } | 6154 | } |
6097 | 6155 | ||
6098 | /* | 6156 | /* |
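As a worked example of check_cpu_capacity(): with a typical imbalance_pct of 125, the condition cpu_capacity * 125 < cpu_capacity_orig * 100 becomes true once less than 80% of the original capacity is left for CFS, so a CPU with cpu_capacity_orig = 1024 is flagged as capacity-reduced when RT/IRQ pressure pushes cpu_capacity below roughly 819.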
@@ -6130,37 +6188,56 @@ static inline int sg_imbalanced(struct sched_group *group) | |||
6130 | } | 6188 | } |
6131 | 6189 | ||
6132 | /* | 6190 | /* |
6133 | * Compute the group capacity factor. | 6191 | * group_has_capacity returns true if the group has spare capacity that could |
6134 | * | 6192 | * be used by some tasks. |
6135 | * Avoid the issue where N*frac(smt_capacity) >= 1 creates 'phantom' cores by | 6193 | * We consider that a group has spare capacity if the number of tasks is |
6136 | * first dividing out the smt factor and computing the actual number of cores | 6194 | * smaller than the number of CPUs or if the usage is lower than the available |
6137 | * and limit unit capacity with that. | 6195 | * capacity for CFS tasks. |
6196 | * For the latter, we use a threshold to stabilize the state, to take into | ||
6197 | * account the variance of the tasks' load and to return true if the available | ||
6198 | * capacity is meaningful for the load balancer. | ||
6199 | * As an example, an available capacity of 1% can show up but it brings no | ||
6200 | * benefit to the load balancer. | ||
6138 | */ | 6201 | */ |
6139 | static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *group) | 6202 | static inline bool |
6203 | group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs) | ||
6140 | { | 6204 | { |
6141 | unsigned int capacity_factor, smt, cpus; | 6205 | if (sgs->sum_nr_running < sgs->group_weight) |
6142 | unsigned int capacity, capacity_orig; | 6206 | return true; |
6143 | 6207 | ||
6144 | capacity = group->sgc->capacity; | 6208 | if ((sgs->group_capacity * 100) > |
6145 | capacity_orig = group->sgc->capacity_orig; | 6209 | (sgs->group_usage * env->sd->imbalance_pct)) |
6146 | cpus = group->group_weight; | 6210 | return true; |
6211 | |||
6212 | return false; | ||
6213 | } | ||
6147 | 6214 | ||
6148 | /* smt := ceil(cpus / capacity), assumes: 1 < smt_capacity < 2 */ | 6215 | /* |
6149 | smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, capacity_orig); | 6216 | * group_is_overloaded returns true if the group has more tasks than it can |
6150 | capacity_factor = cpus / smt; /* cores */ | 6217 | * handle. |
6218 | * group_is_overloaded is not equals to !group_has_capacity because a group | ||
6219 | * with the exact right number of tasks, has no more spare capacity but is not | ||
6220 | * overloaded so both group_has_capacity and group_is_overloaded return | ||
6221 | * false. | ||
6222 | */ | ||
6223 | static inline bool | ||
6224 | group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) | ||
6225 | { | ||
6226 | if (sgs->sum_nr_running <= sgs->group_weight) | ||
6227 | return false; | ||
6151 | 6228 | ||
6152 | capacity_factor = min_t(unsigned, | 6229 | if ((sgs->group_capacity * 100) < |
6153 | capacity_factor, DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE)); | 6230 | (sgs->group_usage * env->sd->imbalance_pct)) |
6154 | if (!capacity_factor) | 6231 | return true; |
6155 | capacity_factor = fix_small_capacity(env->sd, group); | ||
6156 | 6232 | ||
6157 | return capacity_factor; | 6233 | return false; |
6158 | } | 6234 | } |
6159 | 6235 | ||
6160 | static enum group_type | 6236 | static enum group_type group_classify(struct lb_env *env, |
6161 | group_classify(struct sched_group *group, struct sg_lb_stats *sgs) | 6237 | struct sched_group *group, |
6238 | struct sg_lb_stats *sgs) | ||
6162 | { | 6239 | { |
6163 | if (sgs->sum_nr_running > sgs->group_capacity_factor) | 6240 | if (sgs->group_no_capacity) |
6164 | return group_overloaded; | 6241 | return group_overloaded; |
6165 | 6242 | ||
6166 | if (sg_imbalanced(group)) | 6243 | if (sg_imbalanced(group)) |
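The two predicates above can be condensed into a small standalone C sketch; the struct is a simplified stand-in for sg_lb_stats and the imbalance_pct parameter replaces env->sd->imbalance_pct:

#include <stdbool.h>

struct sg_sketch {	/* simplified stand-in for sg_lb_stats */
	unsigned int sum_nr_running;
	unsigned int group_weight;
	unsigned long group_capacity;
	unsigned long group_usage;
};

static bool has_capacity(const struct sg_sketch *s, unsigned int imbalance_pct)
{
	return s->sum_nr_running < s->group_weight ||
	       s->group_capacity * 100 > s->group_usage * imbalance_pct;
}

static bool is_overloaded(const struct sg_sketch *s, unsigned int imbalance_pct)
{
	return s->sum_nr_running > s->group_weight &&
	       s->group_capacity * 100 < s->group_usage * imbalance_pct;
}

A group with exactly as many tasks as CPUs and a usage close to its capacity makes both predicates false, which is the in-between state the comment above calls out.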
@@ -6198,6 +6275,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
6198 | load = source_load(i, load_idx); | 6275 | load = source_load(i, load_idx); |
6199 | 6276 | ||
6200 | sgs->group_load += load; | 6277 | sgs->group_load += load; |
6278 | sgs->group_usage += get_cpu_usage(i); | ||
6201 | sgs->sum_nr_running += rq->cfs.h_nr_running; | 6279 | sgs->sum_nr_running += rq->cfs.h_nr_running; |
6202 | 6280 | ||
6203 | if (rq->nr_running > 1) | 6281 | if (rq->nr_running > 1) |
@@ -6220,11 +6298,9 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
6220 | sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; | 6298 | sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; |
6221 | 6299 | ||
6222 | sgs->group_weight = group->group_weight; | 6300 | sgs->group_weight = group->group_weight; |
6223 | sgs->group_capacity_factor = sg_capacity_factor(env, group); | ||
6224 | sgs->group_type = group_classify(group, sgs); | ||
6225 | 6301 | ||
6226 | if (sgs->group_capacity_factor > sgs->sum_nr_running) | 6302 | sgs->group_no_capacity = group_is_overloaded(env, sgs); |
6227 | sgs->group_has_free_capacity = 1; | 6303 | sgs->group_type = group_classify(env, group, sgs); |
6228 | } | 6304 | } |
6229 | 6305 | ||
6230 | /** | 6306 | /** |
@@ -6346,18 +6422,19 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd | |||
6346 | 6422 | ||
6347 | /* | 6423 | /* |
6348 | * In case the child domain prefers tasks go to siblings | 6424 | * In case the child domain prefers tasks go to siblings |
6349 | * first, lower the sg capacity factor to one so that we'll try | 6425 | * first, lower the sg capacity so that we'll try |
6350 | * and move all the excess tasks away. We lower the capacity | 6426 | * and move all the excess tasks away. We lower the capacity |
6351 | * of a group only if the local group has the capacity to fit | 6427 | * of a group only if the local group has the capacity to fit |
6352 | * these excess tasks, i.e. nr_running < group_capacity_factor. The | 6428 | * these excess tasks. The extra check prevents the case where |
6353 | * extra check prevents the case where you always pull from the | 6429 | * you always pull from the heaviest group when it is already |
6354 | * heaviest group when it is already under-utilized (possible | 6430 | * under-utilized (possible with a large weight task outweighs |
6355 | * with a large weight task outweighs the tasks on the system). | 6431 | * the tasks on the system). |
6356 | */ | 6432 | */ |
6357 | if (prefer_sibling && sds->local && | 6433 | if (prefer_sibling && sds->local && |
6358 | sds->local_stat.group_has_free_capacity) { | 6434 | group_has_capacity(env, &sds->local_stat) && |
6359 | sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U); | 6435 | (sgs->sum_nr_running > 1)) { |
6360 | sgs->group_type = group_classify(sg, sgs); | 6436 | sgs->group_no_capacity = 1; |
6437 | sgs->group_type = group_overloaded; | ||
6361 | } | 6438 | } |
6362 | 6439 | ||
6363 | if (update_sd_pick_busiest(env, sds, sg, sgs)) { | 6440 | if (update_sd_pick_busiest(env, sds, sg, sgs)) { |
@@ -6537,11 +6614,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
6537 | */ | 6614 | */ |
6538 | if (busiest->group_type == group_overloaded && | 6615 | if (busiest->group_type == group_overloaded && |
6539 | local->group_type == group_overloaded) { | 6616 | local->group_type == group_overloaded) { |
6540 | load_above_capacity = | 6617 | load_above_capacity = busiest->sum_nr_running * |
6541 | (busiest->sum_nr_running - busiest->group_capacity_factor); | 6618 | SCHED_LOAD_SCALE; |
6542 | 6619 | if (load_above_capacity > busiest->group_capacity) | |
6543 | load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_CAPACITY_SCALE); | 6620 | load_above_capacity -= busiest->group_capacity; |
6544 | load_above_capacity /= busiest->group_capacity; | 6621 | else |
6622 | load_above_capacity = ~0UL; | ||
6545 | } | 6623 | } |
6546 | 6624 | ||
6547 | /* | 6625 | /* |
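The new load_above_capacity computation can be checked with simple numbers, assuming a SCHED_LOAD_SCALE of 1024: a busiest group with sum_nr_running = 6 and group_capacity = 4096 (four CPUs' worth) yields 6 * 1024 - 4096 = 2048, roughly two tasks' worth of load above what the group can absorb. If the product does not exceed group_capacity, the value saturates to ~0UL so that the later min() against the average-load difference effectively ignores it.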
@@ -6604,6 +6682,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) | |||
6604 | local = &sds.local_stat; | 6682 | local = &sds.local_stat; |
6605 | busiest = &sds.busiest_stat; | 6683 | busiest = &sds.busiest_stat; |
6606 | 6684 | ||
6685 | /* ASYM feature bypasses nice load balance check */ | ||
6607 | if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && | 6686 | if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && |
6608 | check_asym_packing(env, &sds)) | 6687 | check_asym_packing(env, &sds)) |
6609 | return sds.busiest; | 6688 | return sds.busiest; |
@@ -6624,8 +6703,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env) | |||
6624 | goto force_balance; | 6703 | goto force_balance; |
6625 | 6704 | ||
6626 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ | 6705 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ |
6627 | if (env->idle == CPU_NEWLY_IDLE && local->group_has_free_capacity && | 6706 | if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) && |
6628 | !busiest->group_has_free_capacity) | 6707 | busiest->group_no_capacity) |
6629 | goto force_balance; | 6708 | goto force_balance; |
6630 | 6709 | ||
6631 | /* | 6710 | /* |
@@ -6684,7 +6763,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
6684 | int i; | 6763 | int i; |
6685 | 6764 | ||
6686 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { | 6765 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { |
6687 | unsigned long capacity, capacity_factor, wl; | 6766 | unsigned long capacity, wl; |
6688 | enum fbq_type rt; | 6767 | enum fbq_type rt; |
6689 | 6768 | ||
6690 | rq = cpu_rq(i); | 6769 | rq = cpu_rq(i); |
@@ -6713,9 +6792,6 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
6713 | continue; | 6792 | continue; |
6714 | 6793 | ||
6715 | capacity = capacity_of(i); | 6794 | capacity = capacity_of(i); |
6716 | capacity_factor = DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE); | ||
6717 | if (!capacity_factor) | ||
6718 | capacity_factor = fix_small_capacity(env->sd, group); | ||
6719 | 6795 | ||
6720 | wl = weighted_cpuload(i); | 6796 | wl = weighted_cpuload(i); |
6721 | 6797 | ||
@@ -6723,7 +6799,9 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
6723 | * When comparing with imbalance, use weighted_cpuload() | 6799 | * When comparing with imbalance, use weighted_cpuload() |
6724 | * which is not scaled with the cpu capacity. | 6800 | * which is not scaled with the cpu capacity. |
6725 | */ | 6801 | */ |
6726 | if (capacity_factor && rq->nr_running == 1 && wl > env->imbalance) | 6802 | |
6803 | if (rq->nr_running == 1 && wl > env->imbalance && | ||
6804 | !check_cpu_capacity(rq, env->sd)) | ||
6727 | continue; | 6805 | continue; |
6728 | 6806 | ||
6729 | /* | 6807 | /* |
@@ -6771,6 +6849,19 @@ static int need_active_balance(struct lb_env *env) | |||
6771 | return 1; | 6849 | return 1; |
6772 | } | 6850 | } |
6773 | 6851 | ||
6852 | /* | ||
6853 | * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task. | ||
6854 | * It's worth migrating the task if the src_cpu's capacity is reduced | ||
6855 | * because of other sched_class or IRQs if more capacity stays | ||
6856 | * available on dst_cpu. | ||
6857 | */ | ||
6858 | if ((env->idle != CPU_NOT_IDLE) && | ||
6859 | (env->src_rq->cfs.h_nr_running == 1)) { | ||
6860 | if ((check_cpu_capacity(env->src_rq, sd)) && | ||
6861 | (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100)) | ||
6862 | return 1; | ||
6863 | } | ||
6864 | |||
6774 | return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); | 6865 | return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); |
6775 | } | 6866 | } |
6776 | 6867 | ||
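A quick numeric reading of the new need_active_balance() condition: with imbalance_pct at a typical 125, an idle dst_cpu and a src_cpu running a single CFS task whose capacity has dropped to 600 because of RT or IRQ load will trigger an active balance towards a dst_cpu with capacity 1000, since 600 * 125 = 75000 < 1000 * 100 = 100000; if both CPUs are equally squeezed, the inequality fails and the task stays where it is.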
@@ -6870,6 +6961,9 @@ redo: | |||
6870 | 6961 | ||
6871 | schedstat_add(sd, lb_imbalance[idle], env.imbalance); | 6962 | schedstat_add(sd, lb_imbalance[idle], env.imbalance); |
6872 | 6963 | ||
6964 | env.src_cpu = busiest->cpu; | ||
6965 | env.src_rq = busiest; | ||
6966 | |||
6873 | ld_moved = 0; | 6967 | ld_moved = 0; |
6874 | if (busiest->nr_running > 1) { | 6968 | if (busiest->nr_running > 1) { |
6875 | /* | 6969 | /* |
@@ -6879,8 +6973,6 @@ redo: | |||
6879 | * correctly treated as an imbalance. | 6973 | * correctly treated as an imbalance. |
6880 | */ | 6974 | */ |
6881 | env.flags |= LBF_ALL_PINNED; | 6975 | env.flags |= LBF_ALL_PINNED; |
6882 | env.src_cpu = busiest->cpu; | ||
6883 | env.src_rq = busiest; | ||
6884 | env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); | 6976 | env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); |
6885 | 6977 | ||
6886 | more_balance: | 6978 | more_balance: |
@@ -7580,22 +7672,25 @@ end: | |||
7580 | 7672 | ||
7581 | /* | 7673 | /* |
7582 | * Current heuristic for kicking the idle load balancer in the presence | 7674 | * Current heuristic for kicking the idle load balancer in the presence |
7583 | * of an idle cpu is the system. | 7675 | * of an idle cpu in the system. |
7584 | * - This rq has more than one task. | 7676 | * - This rq has more than one task. |
7585 | * - At any scheduler domain level, this cpu's scheduler group has multiple | 7677 | * - This rq has at least one CFS task and the capacity of the CPU is |
7586 | * busy cpu's exceeding the group's capacity. | 7678 | * significantly reduced because of RT tasks or IRQs. |
7679 | * - At the parent of the LLC scheduler domain level, this cpu's scheduler | ||
7680 | * group has multiple busy cpus. | ||
7587 | * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler | 7681 | * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler |
7588 | * domain span are idle. | 7682 | * domain span are idle. |
7589 | */ | 7683 | */ |
7590 | static inline int nohz_kick_needed(struct rq *rq) | 7684 | static inline bool nohz_kick_needed(struct rq *rq) |
7591 | { | 7685 | { |
7592 | unsigned long now = jiffies; | 7686 | unsigned long now = jiffies; |
7593 | struct sched_domain *sd; | 7687 | struct sched_domain *sd; |
7594 | struct sched_group_capacity *sgc; | 7688 | struct sched_group_capacity *sgc; |
7595 | int nr_busy, cpu = rq->cpu; | 7689 | int nr_busy, cpu = rq->cpu; |
7690 | bool kick = false; | ||
7596 | 7691 | ||
7597 | if (unlikely(rq->idle_balance)) | 7692 | if (unlikely(rq->idle_balance)) |
7598 | return 0; | 7693 | return false; |
7599 | 7694 | ||
7600 | /* | 7695 | /* |
7601 | * We may be recently in ticked or tickless idle mode. At the first | 7696 | * We may be recently in ticked or tickless idle mode. At the first |
@@ -7609,38 +7704,46 @@ static inline int nohz_kick_needed(struct rq *rq) | |||
7609 | * balancing. | 7704 | * balancing. |
7610 | */ | 7705 | */ |
7611 | if (likely(!atomic_read(&nohz.nr_cpus))) | 7706 | if (likely(!atomic_read(&nohz.nr_cpus))) |
7612 | return 0; | 7707 | return false; |
7613 | 7708 | ||
7614 | if (time_before(now, nohz.next_balance)) | 7709 | if (time_before(now, nohz.next_balance)) |
7615 | return 0; | 7710 | return false; |
7616 | 7711 | ||
7617 | if (rq->nr_running >= 2) | 7712 | if (rq->nr_running >= 2) |
7618 | goto need_kick; | 7713 | return true; |
7619 | 7714 | ||
7620 | rcu_read_lock(); | 7715 | rcu_read_lock(); |
7621 | sd = rcu_dereference(per_cpu(sd_busy, cpu)); | 7716 | sd = rcu_dereference(per_cpu(sd_busy, cpu)); |
7622 | |||
7623 | if (sd) { | 7717 | if (sd) { |
7624 | sgc = sd->groups->sgc; | 7718 | sgc = sd->groups->sgc; |
7625 | nr_busy = atomic_read(&sgc->nr_busy_cpus); | 7719 | nr_busy = atomic_read(&sgc->nr_busy_cpus); |
7626 | 7720 | ||
7627 | if (nr_busy > 1) | 7721 | if (nr_busy > 1) { |
7628 | goto need_kick_unlock; | 7722 | kick = true; |
7723 | goto unlock; | ||
7724 | } | ||
7725 | |||
7629 | } | 7726 | } |
7630 | 7727 | ||
7631 | sd = rcu_dereference(per_cpu(sd_asym, cpu)); | 7728 | sd = rcu_dereference(rq->sd); |
7729 | if (sd) { | ||
7730 | if ((rq->cfs.h_nr_running >= 1) && | ||
7731 | check_cpu_capacity(rq, sd)) { | ||
7732 | kick = true; | ||
7733 | goto unlock; | ||
7734 | } | ||
7735 | } | ||
7632 | 7736 | ||
7737 | sd = rcu_dereference(per_cpu(sd_asym, cpu)); | ||
7633 | if (sd && (cpumask_first_and(nohz.idle_cpus_mask, | 7738 | if (sd && (cpumask_first_and(nohz.idle_cpus_mask, |
7634 | sched_domain_span(sd)) < cpu)) | 7739 | sched_domain_span(sd)) < cpu)) { |
7635 | goto need_kick_unlock; | 7740 | kick = true; |
7636 | 7741 | goto unlock; | |
7637 | rcu_read_unlock(); | 7742 | } |
7638 | return 0; | ||
7639 | 7743 | ||
7640 | need_kick_unlock: | 7744 | unlock: |
7641 | rcu_read_unlock(); | 7745 | rcu_read_unlock(); |
7642 | need_kick: | 7746 | return kick; |
7643 | return 1; | ||
7644 | } | 7747 | } |
7645 | #else | 7748 | #else |
7646 | static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { } | 7749 | static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { } |
@@ -7656,14 +7759,16 @@ static void run_rebalance_domains(struct softirq_action *h) | |||
7656 | enum cpu_idle_type idle = this_rq->idle_balance ? | 7759 | enum cpu_idle_type idle = this_rq->idle_balance ? |
7657 | CPU_IDLE : CPU_NOT_IDLE; | 7760 | CPU_IDLE : CPU_NOT_IDLE; |
7658 | 7761 | ||
7659 | rebalance_domains(this_rq, idle); | ||
7660 | |||
7661 | /* | 7762 | /* |
7662 | * If this cpu has a pending nohz_balance_kick, then do the | 7763 | * If this cpu has a pending nohz_balance_kick, then do the |
7663 | * balancing on behalf of the other idle cpus whose ticks are | 7764 | * balancing on behalf of the other idle cpus whose ticks are |
7664 | * stopped. | 7765 | * stopped. Do nohz_idle_balance *before* rebalance_domains to |
7766 | * give the idle cpus a chance to load balance. Else we may | ||
7767 | * load balance only within the local sched_domain hierarchy | ||
7768 | * and abort nohz_idle_balance altogether if we pull some load. | ||
7665 | */ | 7769 | */ |
7666 | nohz_idle_balance(this_rq, idle); | 7770 | nohz_idle_balance(this_rq, idle); |
7771 | rebalance_domains(this_rq, idle); | ||
7667 | } | 7772 | } |
7668 | 7773 | ||
7669 | /* | 7774 | /* |
diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 90284d117fe6..91e33cd485f6 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h | |||
@@ -56,6 +56,19 @@ SCHED_FEAT(NONTASK_CAPACITY, true) | |||
56 | */ | 56 | */ |
57 | SCHED_FEAT(TTWU_QUEUE, true) | 57 | SCHED_FEAT(TTWU_QUEUE, true) |
58 | 58 | ||
59 | #ifdef HAVE_RT_PUSH_IPI | ||
60 | /* | ||
61 | * When many CPUs lower their priorities at the same time while a | ||
62 | * single CPU has an RT task that can migrate and is waiting to | ||
63 | * run, all of those CPUs would try to take that CPU's rq lock, | ||
64 | * possibly creating heavy contention on it. In that case it can | ||
65 | * be better to send an IPI to that CPU and let it push the RT | ||
66 | * task to where it should go, avoiding the thundering herd on | ||
67 | * the remote rq lock. | ||
68 | */ | ||
69 | SCHED_FEAT(RT_PUSH_IPI, true) | ||
70 | #endif | ||
71 | |||
59 | SCHED_FEAT(FORCE_SD_OVERLAP, false) | 72 | SCHED_FEAT(FORCE_SD_OVERLAP, false) |
60 | SCHED_FEAT(RT_RUNTIME_SHARE, true) | 73 | SCHED_FEAT(RT_RUNTIME_SHARE, true) |
61 | SCHED_FEAT(LB_MIN, false) | 74 | SCHED_FEAT(LB_MIN, false) |
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 80014a178342..deef1caa94c6 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c | |||
@@ -158,8 +158,7 @@ static void cpuidle_idle_call(void) | |||
158 | * is used from another cpu as a broadcast timer, this call may | 158 | * is used from another cpu as a broadcast timer, this call may |
159 | * fail if it is not available | 159 | * fail if it is not available |
160 | */ | 160 | */ |
161 | if (broadcast && | 161 | if (broadcast && tick_broadcast_enter()) |
162 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu)) | ||
163 | goto use_default; | 162 | goto use_default; |
164 | 163 | ||
165 | /* Take note of the planned idle state. */ | 164 | /* Take note of the planned idle state. */ |
@@ -176,7 +175,7 @@ static void cpuidle_idle_call(void) | |||
176 | idle_set_state(this_rq(), NULL); | 175 | idle_set_state(this_rq(), NULL); |
177 | 176 | ||
178 | if (broadcast) | 177 | if (broadcast) |
179 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu); | 178 | tick_broadcast_exit(); |
180 | 179 | ||
181 | /* | 180 | /* |
182 | * Give the governor an opportunity to reflect on the outcome | 181 | * Give the governor an opportunity to reflect on the outcome |
@@ -210,6 +209,8 @@ use_default: | |||
210 | goto exit_idle; | 209 | goto exit_idle; |
211 | } | 210 | } |
212 | 211 | ||
212 | DEFINE_PER_CPU(bool, cpu_dead_idle); | ||
213 | |||
213 | /* | 214 | /* |
214 | * Generic idle loop implementation | 215 | * Generic idle loop implementation |
215 | * | 216 | * |
@@ -234,8 +235,13 @@ static void cpu_idle_loop(void) | |||
234 | check_pgt_cache(); | 235 | check_pgt_cache(); |
235 | rmb(); | 236 | rmb(); |
236 | 237 | ||
237 | if (cpu_is_offline(smp_processor_id())) | 238 | if (cpu_is_offline(smp_processor_id())) { |
239 | rcu_cpu_notify(NULL, CPU_DYING_IDLE, | ||
240 | (void *)(long)smp_processor_id()); | ||
241 | smp_mb(); /* all activity before dead. */ | ||
242 | this_cpu_write(cpu_dead_idle, true); | ||
238 | arch_cpu_idle_dead(); | 243 | arch_cpu_idle_dead(); |
244 | } | ||
239 | 245 | ||
240 | local_irq_disable(); | 246 | local_irq_disable(); |
241 | arch_cpu_idle_enter(); | 247 | arch_cpu_idle_enter(); |
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index f4d4b077eba0..575da76a3874 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
@@ -6,6 +6,7 @@ | |||
6 | #include "sched.h" | 6 | #include "sched.h" |
7 | 7 | ||
8 | #include <linux/slab.h> | 8 | #include <linux/slab.h> |
9 | #include <linux/irq_work.h> | ||
9 | 10 | ||
10 | int sched_rr_timeslice = RR_TIMESLICE; | 11 | int sched_rr_timeslice = RR_TIMESLICE; |
11 | 12 | ||
@@ -59,7 +60,11 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) | |||
59 | raw_spin_unlock(&rt_b->rt_runtime_lock); | 60 | raw_spin_unlock(&rt_b->rt_runtime_lock); |
60 | } | 61 | } |
61 | 62 | ||
62 | void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) | 63 | #ifdef CONFIG_SMP |
64 | static void push_irq_work_func(struct irq_work *work); | ||
65 | #endif | ||
66 | |||
67 | void init_rt_rq(struct rt_rq *rt_rq) | ||
63 | { | 68 | { |
64 | struct rt_prio_array *array; | 69 | struct rt_prio_array *array; |
65 | int i; | 70 | int i; |
@@ -78,7 +83,14 @@ void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) | |||
78 | rt_rq->rt_nr_migratory = 0; | 83 | rt_rq->rt_nr_migratory = 0; |
79 | rt_rq->overloaded = 0; | 84 | rt_rq->overloaded = 0; |
80 | plist_head_init(&rt_rq->pushable_tasks); | 85 | plist_head_init(&rt_rq->pushable_tasks); |
86 | |||
87 | #ifdef HAVE_RT_PUSH_IPI | ||
88 | rt_rq->push_flags = 0; | ||
89 | rt_rq->push_cpu = nr_cpu_ids; | ||
90 | raw_spin_lock_init(&rt_rq->push_lock); | ||
91 | init_irq_work(&rt_rq->push_work, push_irq_work_func); | ||
81 | #endif | 92 | #endif |
93 | #endif /* CONFIG_SMP */ | ||
82 | /* We start is dequeued state, because no RT tasks are queued */ | 94 | /* We start is dequeued state, because no RT tasks are queued */ |
83 | rt_rq->rt_queued = 0; | 95 | rt_rq->rt_queued = 0; |
84 | 96 | ||
@@ -193,7 +205,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | |||
193 | if (!rt_se) | 205 | if (!rt_se) |
194 | goto err_free_rq; | 206 | goto err_free_rq; |
195 | 207 | ||
196 | init_rt_rq(rt_rq, cpu_rq(i)); | 208 | init_rt_rq(rt_rq); |
197 | rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; | 209 | rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; |
198 | init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); | 210 | init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); |
199 | } | 211 | } |
@@ -1778,6 +1790,164 @@ static void push_rt_tasks(struct rq *rq) | |||
1778 | ; | 1790 | ; |
1779 | } | 1791 | } |
1780 | 1792 | ||
1793 | #ifdef HAVE_RT_PUSH_IPI | ||
1794 | /* | ||
1795 | * The search for the next cpu always starts at rq->cpu and ends | ||
1796 | * when we reach rq->cpu again. It will never return rq->cpu. | ||
1797 | * This returns the next cpu to check, or nr_cpu_ids if the loop | ||
1798 | * is complete. | ||
1799 | * | ||
1800 | * rq->rt.push_cpu holds the last cpu returned by this function, | ||
1801 | * or if this is the first instance, it must hold rq->cpu. | ||
1802 | */ | ||
1803 | static int rto_next_cpu(struct rq *rq) | ||
1804 | { | ||
1805 | int prev_cpu = rq->rt.push_cpu; | ||
1806 | int cpu; | ||
1807 | |||
1808 | cpu = cpumask_next(prev_cpu, rq->rd->rto_mask); | ||
1809 | |||
1810 | /* | ||
1811 | * If the previous cpu is less than the rq's CPU, then it already | ||
1812 | * passed the end of the mask, and has started from the beginning. | ||
1813 | * We end if the next CPU is greater or equal to rq's CPU. | ||
1814 | */ | ||
1815 | if (prev_cpu < rq->cpu) { | ||
1816 | if (cpu >= rq->cpu) | ||
1817 | return nr_cpu_ids; | ||
1818 | |||
1819 | } else if (cpu >= nr_cpu_ids) { | ||
1820 | /* | ||
1821 | * We passed the end of the mask, start at the beginning. | ||
1822 | * If the result is greater or equal to the rq's CPU, then | ||
1823 | * the loop is finished. | ||
1824 | */ | ||
1825 | cpu = cpumask_first(rq->rd->rto_mask); | ||
1826 | if (cpu >= rq->cpu) | ||
1827 | return nr_cpu_ids; | ||
1828 | } | ||
1829 | rq->rt.push_cpu = cpu; | ||
1830 | |||
1831 | /* Return cpu to let the caller know if the loop is finished or not */ | ||
1832 | return cpu; | ||
1833 | } | ||
1834 | |||
1835 | static int find_next_push_cpu(struct rq *rq) | ||
1836 | { | ||
1837 | struct rq *next_rq; | ||
1838 | int cpu; | ||
1839 | |||
1840 | while (1) { | ||
1841 | cpu = rto_next_cpu(rq); | ||
1842 | if (cpu >= nr_cpu_ids) | ||
1843 | break; | ||
1844 | next_rq = cpu_rq(cpu); | ||
1845 | |||
1846 | /* Make sure the next rq can push to this rq */ | ||
1847 | if (next_rq->rt.highest_prio.next < rq->rt.highest_prio.curr) | ||
1848 | break; | ||
1849 | } | ||
1850 | |||
1851 | return cpu; | ||
1852 | } | ||
1853 | |||
1854 | #define RT_PUSH_IPI_EXECUTING 1 | ||
1855 | #define RT_PUSH_IPI_RESTART 2 | ||
1856 | |||
1857 | static void tell_cpu_to_push(struct rq *rq) | ||
1858 | { | ||
1859 | int cpu; | ||
1860 | |||
1861 | if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) { | ||
1862 | raw_spin_lock(&rq->rt.push_lock); | ||
1863 | /* Make sure it's still executing */ | ||
1864 | if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) { | ||
1865 | /* | ||
1866 | * Tell the IPI to restart the loop as things have | ||
1867 | * changed since it started. | ||
1868 | */ | ||
1869 | rq->rt.push_flags |= RT_PUSH_IPI_RESTART; | ||
1870 | raw_spin_unlock(&rq->rt.push_lock); | ||
1871 | return; | ||
1872 | } | ||
1873 | raw_spin_unlock(&rq->rt.push_lock); | ||
1874 | } | ||
1875 | |||
1876 | /* When here, there's no IPI going around */ | ||
1877 | |||
1878 | rq->rt.push_cpu = rq->cpu; | ||
1879 | cpu = find_next_push_cpu(rq); | ||
1880 | if (cpu >= nr_cpu_ids) | ||
1881 | return; | ||
1882 | |||
1883 | rq->rt.push_flags = RT_PUSH_IPI_EXECUTING; | ||
1884 | |||
1885 | irq_work_queue_on(&rq->rt.push_work, cpu); | ||
1886 | } | ||
1887 | |||
1888 | /* Called from hardirq context */ | ||
1889 | static void try_to_push_tasks(void *arg) | ||
1890 | { | ||
1891 | struct rt_rq *rt_rq = arg; | ||
1892 | struct rq *rq, *src_rq; | ||
1893 | int this_cpu; | ||
1894 | int cpu; | ||
1895 | |||
1896 | this_cpu = rt_rq->push_cpu; | ||
1897 | |||
1898 | /* Paranoid check */ | ||
1899 | BUG_ON(this_cpu != smp_processor_id()); | ||
1900 | |||
1901 | rq = cpu_rq(this_cpu); | ||
1902 | src_rq = rq_of_rt_rq(rt_rq); | ||
1903 | |||
1904 | again: | ||
1905 | if (has_pushable_tasks(rq)) { | ||
1906 | raw_spin_lock(&rq->lock); | ||
1907 | push_rt_task(rq); | ||
1908 | raw_spin_unlock(&rq->lock); | ||
1909 | } | ||
1910 | |||
1911 | /* Pass the IPI to the next rt overloaded queue */ | ||
1912 | raw_spin_lock(&rt_rq->push_lock); | ||
1913 | /* | ||
1914 | * If the source queue changed since the IPI went out, | ||
1915 | * we need to restart the search from that CPU again. | ||
1916 | */ | ||
1917 | if (rt_rq->push_flags & RT_PUSH_IPI_RESTART) { | ||
1918 | rt_rq->push_flags &= ~RT_PUSH_IPI_RESTART; | ||
1919 | rt_rq->push_cpu = src_rq->cpu; | ||
1920 | } | ||
1921 | |||
1922 | cpu = find_next_push_cpu(src_rq); | ||
1923 | |||
1924 | if (cpu >= nr_cpu_ids) | ||
1925 | rt_rq->push_flags &= ~RT_PUSH_IPI_EXECUTING; | ||
1926 | raw_spin_unlock(&rt_rq->push_lock); | ||
1927 | |||
1928 | if (cpu >= nr_cpu_ids) | ||
1929 | return; | ||
1930 | |||
1931 | /* | ||
1932 | * It is possible that a restart caused this CPU to be | ||
1933 | * chosen again. Don't bother with an IPI, just see if we | ||
1934 | * have more to push. | ||
1935 | */ | ||
1936 | if (unlikely(cpu == rq->cpu)) | ||
1937 | goto again; | ||
1938 | |||
1939 | /* Try the next RT overloaded CPU */ | ||
1940 | irq_work_queue_on(&rt_rq->push_work, cpu); | ||
1941 | } | ||
1942 | |||
1943 | static void push_irq_work_func(struct irq_work *work) | ||
1944 | { | ||
1945 | struct rt_rq *rt_rq = container_of(work, struct rt_rq, push_work); | ||
1946 | |||
1947 | try_to_push_tasks(rt_rq); | ||
1948 | } | ||
1949 | #endif /* HAVE_RT_PUSH_IPI */ | ||
1950 | |||
1781 | static int pull_rt_task(struct rq *this_rq) | 1951 | static int pull_rt_task(struct rq *this_rq) |
1782 | { | 1952 | { |
1783 | int this_cpu = this_rq->cpu, ret = 0, cpu; | 1953 | int this_cpu = this_rq->cpu, ret = 0, cpu; |
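The heart of the RT_PUSH_IPI machinery added above is the circular walk over rd->rto_mask performed by rto_next_cpu()/find_next_push_cpu(). A reduced, standalone C sketch of that walk follows; a plain bool array replaces the cpumask, and the priority check and IPI-restart handling are intentionally left out:

#include <stdbool.h>

/*
 * Walk the RT-overloaded CPUs in a ring, starting just after 'prev' and
 * stopping once the walk gets back around to 'self' (the CPU that
 * originated the push).  Returns nr_cpus when the loop is complete.
 */
static int next_overloaded_cpu(int prev, int self, int nr_cpus,
			       const bool *rto_mask)
{
	int i;

	for (i = 1; i <= nr_cpus; i++) {
		int cpu = (prev + i) % nr_cpus;

		if (cpu == self)
			return nr_cpus;	/* wrapped around: done */
		if (rto_mask[cpu])
			return cpu;	/* next CPU to send the irq_work to */
	}
	return nr_cpus;
}

Each CPU that receives the irq_work pushes its own RT tasks under its own rq lock and then forwards the IPI to the next overloaded CPU, so no CPU ever has to spin on a remote rq lock.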
@@ -1793,6 +1963,13 @@ static int pull_rt_task(struct rq *this_rq) | |||
1793 | */ | 1963 | */ |
1794 | smp_rmb(); | 1964 | smp_rmb(); |
1795 | 1965 | ||
1966 | #ifdef HAVE_RT_PUSH_IPI | ||
1967 | if (sched_feat(RT_PUSH_IPI)) { | ||
1968 | tell_cpu_to_push(this_rq); | ||
1969 | return 0; | ||
1970 | } | ||
1971 | #endif | ||
1972 | |||
1796 | for_each_cpu(cpu, this_rq->rd->rto_mask) { | 1973 | for_each_cpu(cpu, this_rq->rd->rto_mask) { |
1797 | if (this_cpu == cpu) | 1974 | if (this_cpu == cpu) |
1798 | continue; | 1975 | continue; |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index dc0f435a2779..e0e129993958 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -6,6 +6,7 @@ | |||
6 | #include <linux/mutex.h> | 6 | #include <linux/mutex.h> |
7 | #include <linux/spinlock.h> | 7 | #include <linux/spinlock.h> |
8 | #include <linux/stop_machine.h> | 8 | #include <linux/stop_machine.h> |
9 | #include <linux/irq_work.h> | ||
9 | #include <linux/tick.h> | 10 | #include <linux/tick.h> |
10 | #include <linux/slab.h> | 11 | #include <linux/slab.h> |
11 | 12 | ||
@@ -362,8 +363,14 @@ struct cfs_rq { | |||
362 | * Under CFS, load is tracked on a per-entity basis and aggregated up. | 363 | * Under CFS, load is tracked on a per-entity basis and aggregated up. |
363 | * This allows for the description of both thread and group usage (in | 364 | * This allows for the description of both thread and group usage (in |
364 | * the FAIR_GROUP_SCHED case). | 365 | * the FAIR_GROUP_SCHED case). |
366 | * runnable_load_avg is the sum of the load_avg_contrib of the | ||
367 | * sched_entities on the rq. | ||
368 | * blocked_load_avg is similar to runnable_load_avg except that it | ||
369 | * sums the blocked sched_entities on the rq. | ||
370 | * utilization_load_avg is the sum of the average running time of the | ||
371 | * sched_entities on the rq. | ||
365 | */ | 372 | */ |
366 | unsigned long runnable_load_avg, blocked_load_avg; | 373 | unsigned long runnable_load_avg, blocked_load_avg, utilization_load_avg; |
367 | atomic64_t decay_counter; | 374 | atomic64_t decay_counter; |
368 | u64 last_decay; | 375 | u64 last_decay; |
369 | atomic_long_t removed_load; | 376 | atomic_long_t removed_load; |
@@ -418,6 +425,11 @@ static inline int rt_bandwidth_enabled(void) | |||
418 | return sysctl_sched_rt_runtime >= 0; | 425 | return sysctl_sched_rt_runtime >= 0; |
419 | } | 426 | } |
420 | 427 | ||
428 | /* RT IPI pull logic requires IRQ_WORK */ | ||
429 | #ifdef CONFIG_IRQ_WORK | ||
430 | # define HAVE_RT_PUSH_IPI | ||
431 | #endif | ||
432 | |||
421 | /* Real-Time classes' related field in a runqueue: */ | 433 | /* Real-Time classes' related field in a runqueue: */ |
422 | struct rt_rq { | 434 | struct rt_rq { |
423 | struct rt_prio_array active; | 435 | struct rt_prio_array active; |
@@ -435,7 +447,13 @@ struct rt_rq { | |||
435 | unsigned long rt_nr_total; | 447 | unsigned long rt_nr_total; |
436 | int overloaded; | 448 | int overloaded; |
437 | struct plist_head pushable_tasks; | 449 | struct plist_head pushable_tasks; |
450 | #ifdef HAVE_RT_PUSH_IPI | ||
451 | int push_flags; | ||
452 | int push_cpu; | ||
453 | struct irq_work push_work; | ||
454 | raw_spinlock_t push_lock; | ||
438 | #endif | 455 | #endif |
456 | #endif /* CONFIG_SMP */ | ||
439 | int rt_queued; | 457 | int rt_queued; |
440 | 458 | ||
441 | int rt_throttled; | 459 | int rt_throttled; |
@@ -597,6 +615,7 @@ struct rq { | |||
597 | struct sched_domain *sd; | 615 | struct sched_domain *sd; |
598 | 616 | ||
599 | unsigned long cpu_capacity; | 617 | unsigned long cpu_capacity; |
618 | unsigned long cpu_capacity_orig; | ||
600 | 619 | ||
601 | unsigned char idle_balance; | 620 | unsigned char idle_balance; |
602 | /* For active balancing */ | 621 | /* For active balancing */ |
@@ -807,7 +826,7 @@ struct sched_group_capacity { | |||
807 | * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity | 826 | * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity |
808 | * for a single CPU. | 827 | * for a single CPU. |
809 | */ | 828 | */ |
810 | unsigned int capacity, capacity_orig; | 829 | unsigned int capacity; |
811 | unsigned long next_update; | 830 | unsigned long next_update; |
812 | int imbalance; /* XXX unrelated to capacity but shared group state */ | 831 | int imbalance; /* XXX unrelated to capacity but shared group state */ |
813 | /* | 832 | /* |
@@ -1368,9 +1387,18 @@ static inline int hrtick_enabled(struct rq *rq) | |||
1368 | 1387 | ||
1369 | #ifdef CONFIG_SMP | 1388 | #ifdef CONFIG_SMP |
1370 | extern void sched_avg_update(struct rq *rq); | 1389 | extern void sched_avg_update(struct rq *rq); |
1390 | |||
1391 | #ifndef arch_scale_freq_capacity | ||
1392 | static __always_inline | ||
1393 | unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu) | ||
1394 | { | ||
1395 | return SCHED_CAPACITY_SCALE; | ||
1396 | } | ||
1397 | #endif | ||
1398 | |||
1371 | static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) | 1399 | static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) |
1372 | { | 1400 | { |
1373 | rq->rt_avg += rt_delta; | 1401 | rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq)); |
1374 | sched_avg_update(rq); | 1402 | sched_avg_update(rq); |
1375 | } | 1403 | } |
1376 | #else | 1404 | #else |
@@ -1643,8 +1671,8 @@ extern void print_rt_stats(struct seq_file *m, int cpu); | |||
1643 | extern void print_dl_stats(struct seq_file *m, int cpu); | 1671 | extern void print_dl_stats(struct seq_file *m, int cpu); |
1644 | 1672 | ||
1645 | extern void init_cfs_rq(struct cfs_rq *cfs_rq); | 1673 | extern void init_cfs_rq(struct cfs_rq *cfs_rq); |
1646 | extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); | 1674 | extern void init_rt_rq(struct rt_rq *rt_rq); |
1647 | extern void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq); | 1675 | extern void init_dl_rq(struct dl_rq *dl_rq); |
1648 | 1676 | ||
1649 | extern void cfs_bandwidth_usage_inc(void); | 1677 | extern void cfs_bandwidth_usage_inc(void); |
1650 | extern void cfs_bandwidth_usage_dec(void); | 1678 | extern void cfs_bandwidth_usage_dec(void); |
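
Note on the sched.h hunk above: sched_rt_avg_update() now weights rt_delta by arch_scale_freq_capacity(); with the default stub that factor is SCHED_CAPACITY_SCALE, so rt_avg is simply accumulated in capacity units, while an architecture that runs below its maximum frequency can report a proportionally smaller capacity. The snippet below is only a hedged, user-space illustration of that arithmetic; SCHED_CAPACITY_SCALE matches the kernel constant (1024), but the cur_khz/max_khz parameters are invented for the example.

    #include <stdio.h>
    #include <stdint.h>

    #define SCHED_CAPACITY_SCALE 1024UL   /* fixed-point "1.0" capacity */

    /*
     * Example frequency-invariant capacity: a CPU currently clocked at
     * cur_khz out of max_khz reports a proportionally smaller capacity.
     * The kernel's default arch_scale_freq_capacity() just returns
     * SCHED_CAPACITY_SCALE.
     */
    static unsigned long scale_freq_capacity(unsigned long cur_khz,
                                             unsigned long max_khz)
    {
        return SCHED_CAPACITY_SCALE * cur_khz / max_khz;
    }

    int main(void)
    {
        uint64_t rt_delta = 2000000;     /* 2 ms of RT runtime, in ns */
        uint64_t rt_avg;

        /* Default behaviour: full capacity, i.e. rt_delta * 1024. */
        rt_avg = rt_delta * SCHED_CAPACITY_SCALE;
        printf("full speed contribution: %llu\n", (unsigned long long)rt_avg);

        /* A CPU at 1.2 GHz out of 2.4 GHz contributes half as much. */
        rt_avg = rt_delta * scale_freq_capacity(1200000, 2400000);
        printf("half speed contribution: %llu\n", (unsigned long long)rt_avg);
        return 0;
    }
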
diff --git a/kernel/signal.c b/kernel/signal.c index a390499943e4..d51c5ddd855c 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -2992,11 +2992,9 @@ static int do_rt_sigqueueinfo(pid_t pid, int sig, siginfo_t *info) | |||
2992 | * Nor can they impersonate a kill()/tgkill(), which adds source info. | 2992 | * Nor can they impersonate a kill()/tgkill(), which adds source info. |
2993 | */ | 2993 | */ |
2994 | if ((info->si_code >= 0 || info->si_code == SI_TKILL) && | 2994 | if ((info->si_code >= 0 || info->si_code == SI_TKILL) && |
2995 | (task_pid_vnr(current) != pid)) { | 2995 | (task_pid_vnr(current) != pid)) |
2996 | /* We used to allow any < 0 si_code */ | ||
2997 | WARN_ON_ONCE(info->si_code < 0); | ||
2998 | return -EPERM; | 2996 | return -EPERM; |
2999 | } | 2997 | |
3000 | info->si_signo = sig; | 2998 | info->si_signo = sig; |
3001 | 2999 | ||
3002 | /* POSIX.1b doesn't mention process groups. */ | 3000 | /* POSIX.1b doesn't mention process groups. */ |
@@ -3041,12 +3039,10 @@ static int do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info) | |||
3041 | /* Not even root can pretend to send signals from the kernel. | 3039 | /* Not even root can pretend to send signals from the kernel. |
3042 | * Nor can they impersonate a kill()/tgkill(), which adds source info. | 3040 | * Nor can they impersonate a kill()/tgkill(), which adds source info. |
3043 | */ | 3041 | */ |
3044 | if (((info->si_code >= 0 || info->si_code == SI_TKILL)) && | 3042 | if ((info->si_code >= 0 || info->si_code == SI_TKILL) && |
3045 | (task_pid_vnr(current) != pid)) { | 3043 | (task_pid_vnr(current) != pid)) |
3046 | /* We used to allow any < 0 si_code */ | ||
3047 | WARN_ON_ONCE(info->si_code < 0); | ||
3048 | return -EPERM; | 3044 | return -EPERM; |
3049 | } | 3045 | |
3050 | info->si_signo = sig; | 3046 | info->si_signo = sig; |
3051 | 3047 | ||
3052 | return do_send_specific(tgid, pid, sig, info); | 3048 | return do_send_specific(tgid, pid, sig, info); |
diff --git a/kernel/smp.c b/kernel/smp.c index f38a1e692259..07854477c164 100644 --- a/kernel/smp.c +++ b/kernel/smp.c | |||
@@ -19,7 +19,7 @@ | |||
19 | 19 | ||
20 | enum { | 20 | enum { |
21 | CSD_FLAG_LOCK = 0x01, | 21 | CSD_FLAG_LOCK = 0x01, |
22 | CSD_FLAG_WAIT = 0x02, | 22 | CSD_FLAG_SYNCHRONOUS = 0x02, |
23 | }; | 23 | }; |
24 | 24 | ||
25 | struct call_function_data { | 25 | struct call_function_data { |
@@ -107,7 +107,7 @@ void __init call_function_init(void) | |||
107 | */ | 107 | */ |
108 | static void csd_lock_wait(struct call_single_data *csd) | 108 | static void csd_lock_wait(struct call_single_data *csd) |
109 | { | 109 | { |
110 | while (csd->flags & CSD_FLAG_LOCK) | 110 | while (smp_load_acquire(&csd->flags) & CSD_FLAG_LOCK) |
111 | cpu_relax(); | 111 | cpu_relax(); |
112 | } | 112 | } |
113 | 113 | ||
@@ -121,19 +121,17 @@ static void csd_lock(struct call_single_data *csd) | |||
121 | * to ->flags with any subsequent assignments to other | 121 | * to ->flags with any subsequent assignments to other |
122 | * fields of the specified call_single_data structure: | 122 | * fields of the specified call_single_data structure: |
123 | */ | 123 | */ |
124 | smp_mb(); | 124 | smp_wmb(); |
125 | } | 125 | } |
126 | 126 | ||
127 | static void csd_unlock(struct call_single_data *csd) | 127 | static void csd_unlock(struct call_single_data *csd) |
128 | { | 128 | { |
129 | WARN_ON((csd->flags & CSD_FLAG_WAIT) && !(csd->flags & CSD_FLAG_LOCK)); | 129 | WARN_ON(!(csd->flags & CSD_FLAG_LOCK)); |
130 | 130 | ||
131 | /* | 131 | /* |
132 | * ensure we're all done before releasing data: | 132 | * ensure we're all done before releasing data: |
133 | */ | 133 | */ |
134 | smp_mb(); | 134 | smp_store_release(&csd->flags, 0); |
135 | |||
136 | csd->flags &= ~CSD_FLAG_LOCK; | ||
137 | } | 135 | } |
138 | 136 | ||
139 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data); | 137 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data); |
@@ -144,13 +142,16 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data); | |||
144 | * ->func, ->info, and ->flags set. | 142 | * ->func, ->info, and ->flags set. |
145 | */ | 143 | */ |
146 | static int generic_exec_single(int cpu, struct call_single_data *csd, | 144 | static int generic_exec_single(int cpu, struct call_single_data *csd, |
147 | smp_call_func_t func, void *info, int wait) | 145 | smp_call_func_t func, void *info) |
148 | { | 146 | { |
149 | struct call_single_data csd_stack = { .flags = 0 }; | ||
150 | unsigned long flags; | ||
151 | |||
152 | |||
153 | if (cpu == smp_processor_id()) { | 147 | if (cpu == smp_processor_id()) { |
148 | unsigned long flags; | ||
149 | |||
150 | /* | ||
151 | * We can unlock early even for the synchronous on-stack case, | ||
152 | * since we're doing this from the same CPU.. | ||
153 | */ | ||
154 | csd_unlock(csd); | ||
154 | local_irq_save(flags); | 155 | local_irq_save(flags); |
155 | func(info); | 156 | func(info); |
156 | local_irq_restore(flags); | 157 | local_irq_restore(flags); |
@@ -158,24 +159,14 @@ static int generic_exec_single(int cpu, struct call_single_data *csd, | |||
158 | } | 159 | } |
159 | 160 | ||
160 | 161 | ||
161 | if ((unsigned)cpu >= nr_cpu_ids || !cpu_online(cpu)) | 162 | if ((unsigned)cpu >= nr_cpu_ids || !cpu_online(cpu)) { |
163 | csd_unlock(csd); | ||
162 | return -ENXIO; | 164 | return -ENXIO; |
163 | |||
164 | |||
165 | if (!csd) { | ||
166 | csd = &csd_stack; | ||
167 | if (!wait) | ||
168 | csd = this_cpu_ptr(&csd_data); | ||
169 | } | 165 | } |
170 | 166 | ||
171 | csd_lock(csd); | ||
172 | |||
173 | csd->func = func; | 167 | csd->func = func; |
174 | csd->info = info; | 168 | csd->info = info; |
175 | 169 | ||
176 | if (wait) | ||
177 | csd->flags |= CSD_FLAG_WAIT; | ||
178 | |||
179 | /* | 170 | /* |
180 | * The list addition should be visible before sending the IPI | 171 | * The list addition should be visible before sending the IPI |
181 | * handler locks the list to pull the entry off it because of | 172 | * handler locks the list to pull the entry off it because of |
@@ -190,9 +181,6 @@ static int generic_exec_single(int cpu, struct call_single_data *csd, | |||
190 | if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu))) | 181 | if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu))) |
191 | arch_send_call_function_single_ipi(cpu); | 182 | arch_send_call_function_single_ipi(cpu); |
192 | 183 | ||
193 | if (wait) | ||
194 | csd_lock_wait(csd); | ||
195 | |||
196 | return 0; | 184 | return 0; |
197 | } | 185 | } |
198 | 186 | ||
@@ -250,8 +238,17 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) | |||
250 | } | 238 | } |
251 | 239 | ||
252 | llist_for_each_entry_safe(csd, csd_next, entry, llist) { | 240 | llist_for_each_entry_safe(csd, csd_next, entry, llist) { |
253 | csd->func(csd->info); | 241 | smp_call_func_t func = csd->func; |
254 | csd_unlock(csd); | 242 | void *info = csd->info; |
243 | |||
244 | /* Do we wait until *after* callback? */ | ||
245 | if (csd->flags & CSD_FLAG_SYNCHRONOUS) { | ||
246 | func(info); | ||
247 | csd_unlock(csd); | ||
248 | } else { | ||
249 | csd_unlock(csd); | ||
250 | func(info); | ||
251 | } | ||
255 | } | 252 | } |
256 | 253 | ||
257 | /* | 254 | /* |
@@ -274,6 +271,8 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) | |||
274 | int smp_call_function_single(int cpu, smp_call_func_t func, void *info, | 271 | int smp_call_function_single(int cpu, smp_call_func_t func, void *info, |
275 | int wait) | 272 | int wait) |
276 | { | 273 | { |
274 | struct call_single_data *csd; | ||
275 | struct call_single_data csd_stack = { .flags = CSD_FLAG_LOCK | CSD_FLAG_SYNCHRONOUS }; | ||
277 | int this_cpu; | 276 | int this_cpu; |
278 | int err; | 277 | int err; |
279 | 278 | ||
@@ -292,7 +291,16 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, | |||
292 | WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() | 291 | WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() |
293 | && !oops_in_progress); | 292 | && !oops_in_progress); |
294 | 293 | ||
295 | err = generic_exec_single(cpu, NULL, func, info, wait); | 294 | csd = &csd_stack; |
295 | if (!wait) { | ||
296 | csd = this_cpu_ptr(&csd_data); | ||
297 | csd_lock(csd); | ||
298 | } | ||
299 | |||
300 | err = generic_exec_single(cpu, csd, func, info); | ||
301 | |||
302 | if (wait) | ||
303 | csd_lock_wait(csd); | ||
296 | 304 | ||
297 | put_cpu(); | 305 | put_cpu(); |
298 | 306 | ||
@@ -321,7 +329,15 @@ int smp_call_function_single_async(int cpu, struct call_single_data *csd) | |||
321 | int err = 0; | 329 | int err = 0; |
322 | 330 | ||
323 | preempt_disable(); | 331 | preempt_disable(); |
324 | err = generic_exec_single(cpu, csd, csd->func, csd->info, 0); | 332 | |
333 | /* We could deadlock if we have to wait here with interrupts disabled! */ | ||
334 | if (WARN_ON_ONCE(csd->flags & CSD_FLAG_LOCK)) | ||
335 | csd_lock_wait(csd); | ||
336 | |||
337 | csd->flags = CSD_FLAG_LOCK; | ||
338 | smp_wmb(); | ||
339 | |||
340 | err = generic_exec_single(cpu, csd, csd->func, csd->info); | ||
325 | preempt_enable(); | 341 | preempt_enable(); |
326 | 342 | ||
327 | return err; | 343 | return err; |
@@ -433,6 +449,8 @@ void smp_call_function_many(const struct cpumask *mask, | |||
433 | struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu); | 449 | struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu); |
434 | 450 | ||
435 | csd_lock(csd); | 451 | csd_lock(csd); |
452 | if (wait) | ||
453 | csd->flags |= CSD_FLAG_SYNCHRONOUS; | ||
436 | csd->func = func; | 454 | csd->func = func; |
437 | csd->info = info; | 455 | csd->info = info; |
438 | llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)); | 456 | llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)); |
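
Note on the smp.c hunks above: the full barriers around csd->flags are replaced by an acquire load in csd_lock_wait() and a release store in csd_unlock(), so a waiter that observes the flag cleared is also guaranteed to see every write the callback made before clearing it. Below is a minimal user-space analogue of that pattern using C11 atomics; the names are invented for the example and this is not the kernel implementation itself.

    #include <stdatomic.h>
    #include <stdio.h>

    #define CSD_FLAG_LOCK 0x01

    struct fake_csd {
        atomic_uint flags;
        int result;                 /* data the "callback" hands back */
    };

    /* Release store: publishes 'result' to anyone who later sees flags == 0. */
    static void fake_csd_unlock(struct fake_csd *csd)
    {
        atomic_store_explicit(&csd->flags, 0, memory_order_release);
    }

    /* Acquire load in the spin loop: pairs with the release store above. */
    static void fake_csd_lock_wait(struct fake_csd *csd)
    {
        while (atomic_load_explicit(&csd->flags, memory_order_acquire) & CSD_FLAG_LOCK)
            ;                       /* the kernel uses cpu_relax() here */
    }

    int main(void)
    {
        struct fake_csd csd = { .flags = CSD_FLAG_LOCK };

        /* Imagine this part runs on the remote CPU, inside the IPI handler. */
        csd.result = 42;
        fake_csd_unlock(&csd);

        /* And this is the caller of smp_call_function_single(..., wait=1). */
        fake_csd_lock_wait(&csd);
        printf("callback result: %d\n", csd.result);
        return 0;
    }
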
diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 40190f28db35..c697f73d82d6 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c | |||
@@ -4,6 +4,7 @@ | |||
4 | #include <linux/cpu.h> | 4 | #include <linux/cpu.h> |
5 | #include <linux/err.h> | 5 | #include <linux/err.h> |
6 | #include <linux/smp.h> | 6 | #include <linux/smp.h> |
7 | #include <linux/delay.h> | ||
7 | #include <linux/init.h> | 8 | #include <linux/init.h> |
8 | #include <linux/list.h> | 9 | #include <linux/list.h> |
9 | #include <linux/slab.h> | 10 | #include <linux/slab.h> |
@@ -314,3 +315,158 @@ void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread) | |||
314 | put_online_cpus(); | 315 | put_online_cpus(); |
315 | } | 316 | } |
316 | EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread); | 317 | EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread); |
318 | |||
319 | static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD); | ||
320 | |||
321 | /* | ||
322 | * Called to poll specified CPU's state, for example, when waiting for | ||
323 | * a CPU to come online. | ||
324 | */ | ||
325 | int cpu_report_state(int cpu) | ||
326 | { | ||
327 | return atomic_read(&per_cpu(cpu_hotplug_state, cpu)); | ||
328 | } | ||
329 | |||
330 | /* | ||
331 | * If CPU has died properly, set its state to CPU_UP_PREPARE and | ||
332 | * return success. Otherwise, return -EBUSY if the CPU died after | ||
333 | * cpu_wait_death() timed out. And yet otherwise again, return -EAGAIN | ||
334 | * if cpu_wait_death() timed out and the CPU still hasn't gotten around | ||
335 | * to dying. In the latter two cases, the CPU might not be set up | ||
336 | * properly, but it is up to the arch-specific code to decide. | ||
337 | * Finally, -EIO indicates an unanticipated problem. | ||
338 | * | ||
339 | * Note that it is permissible to omit this call entirely, as is | ||
340 | * done in architectures that do no CPU-hotplug error checking. | ||
341 | */ | ||
342 | int cpu_check_up_prepare(int cpu) | ||
343 | { | ||
344 | if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) { | ||
345 | atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_UP_PREPARE); | ||
346 | return 0; | ||
347 | } | ||
348 | |||
349 | switch (atomic_read(&per_cpu(cpu_hotplug_state, cpu))) { | ||
350 | |||
351 | case CPU_POST_DEAD: | ||
352 | |||
353 | /* The CPU died properly, so just start it up again. */ | ||
354 | atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_UP_PREPARE); | ||
355 | return 0; | ||
356 | |||
357 | case CPU_DEAD_FROZEN: | ||
358 | |||
359 | /* | ||
360 | * Timeout during CPU death, so let caller know. | ||
361 | * The outgoing CPU completed its processing, but after | ||
362 | * cpu_wait_death() timed out and reported the error. The | ||
363 | * caller is free to proceed, in which case the state | ||
364 | * will be reset properly by cpu_set_state_online(). | ||
365 | * Proceeding despite this -EBUSY return makes sense | ||
366 | * for systems where the outgoing CPUs take themselves | ||
367 | * offline, with no post-death manipulation required from | ||
368 | * a surviving CPU. | ||
369 | */ | ||
370 | return -EBUSY; | ||
371 | |||
372 | case CPU_BROKEN: | ||
373 | |||
374 | /* | ||
375 | * The most likely reason we got here is that there was | ||
376 | * a timeout during CPU death, and the outgoing CPU never | ||
377 | * did complete its processing. This could happen on | ||
378 | * a virtualized system if the outgoing VCPU gets preempted | ||
379 | * for more than five seconds, and the user attempts to | ||
380 | * immediately online that same CPU. Trying again later | ||
381 | * might return -EBUSY above, hence -EAGAIN. | ||
382 | */ | ||
383 | return -EAGAIN; | ||
384 | |||
385 | default: | ||
386 | |||
387 | /* Should not happen. Famous last words. */ | ||
388 | return -EIO; | ||
389 | } | ||
390 | } | ||
391 | |||
392 | /* | ||
393 | * Mark the specified CPU online. | ||
394 | * | ||
395 | * Note that it is permissible to omit this call entirely, as is | ||
396 | * done in architectures that do no CPU-hotplug error checking. | ||
397 | */ | ||
398 | void cpu_set_state_online(int cpu) | ||
399 | { | ||
400 | (void)atomic_xchg(&per_cpu(cpu_hotplug_state, cpu), CPU_ONLINE); | ||
401 | } | ||
402 | |||
403 | #ifdef CONFIG_HOTPLUG_CPU | ||
404 | |||
405 | /* | ||
406 | * Wait for the specified CPU to exit the idle loop and die. | ||
407 | */ | ||
408 | bool cpu_wait_death(unsigned int cpu, int seconds) | ||
409 | { | ||
410 | int jf_left = seconds * HZ; | ||
411 | int oldstate; | ||
412 | bool ret = true; | ||
413 | int sleep_jf = 1; | ||
414 | |||
415 | might_sleep(); | ||
416 | |||
417 | /* The outgoing CPU will normally get done quite quickly. */ | ||
418 | if (atomic_read(&per_cpu(cpu_hotplug_state, cpu)) == CPU_DEAD) | ||
419 | goto update_state; | ||
420 | udelay(5); | ||
421 | |||
422 | /* But if the outgoing CPU dawdles, wait increasingly long times. */ | ||
423 | while (atomic_read(&per_cpu(cpu_hotplug_state, cpu)) != CPU_DEAD) { | ||
424 | schedule_timeout_uninterruptible(sleep_jf); | ||
425 | jf_left -= sleep_jf; | ||
426 | if (jf_left <= 0) | ||
427 | break; | ||
428 | sleep_jf = DIV_ROUND_UP(sleep_jf * 11, 10); | ||
429 | } | ||
430 | update_state: | ||
431 | oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu)); | ||
432 | if (oldstate == CPU_DEAD) { | ||
433 | /* Outgoing CPU died normally, update state. */ | ||
434 | smp_mb(); /* atomic_read() before update. */ | ||
435 | atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_POST_DEAD); | ||
436 | } else { | ||
437 | /* Outgoing CPU still hasn't died, set state accordingly. */ | ||
438 | if (atomic_cmpxchg(&per_cpu(cpu_hotplug_state, cpu), | ||
439 | oldstate, CPU_BROKEN) != oldstate) | ||
440 | goto update_state; | ||
441 | ret = false; | ||
442 | } | ||
443 | return ret; | ||
444 | } | ||
445 | |||
446 | /* | ||
447 | * Called by the outgoing CPU to report its successful death. Return | ||
448 | * false if this report follows the surviving CPU's timing out. | ||
449 | * | ||
450 | * A separate "CPU_DEAD_FROZEN" is used when the surviving CPU | ||
451 | * timed out. This approach allows architectures to omit calls to | ||
452 | * cpu_check_up_prepare() and cpu_set_state_online() without defeating | ||
453 | * the next cpu_wait_death()'s polling loop. | ||
454 | */ | ||
455 | bool cpu_report_death(void) | ||
456 | { | ||
457 | int oldstate; | ||
458 | int newstate; | ||
459 | int cpu = smp_processor_id(); | ||
460 | |||
461 | do { | ||
462 | oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu)); | ||
463 | if (oldstate != CPU_BROKEN) | ||
464 | newstate = CPU_DEAD; | ||
465 | else | ||
466 | newstate = CPU_DEAD_FROZEN; | ||
467 | } while (atomic_cmpxchg(&per_cpu(cpu_hotplug_state, cpu), | ||
468 | oldstate, newstate) != oldstate); | ||
469 | return newstate == CPU_DEAD; | ||
470 | } | ||
471 | |||
472 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | ||
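
Note on the smpboot.c code above: cpu_wait_death() polls the dying CPU's state with an increasing sleep, starting at one jiffy and growing by 11/10 (rounded up) each round until the seconds budget is spent. The short program below just reproduces that backoff arithmetic so the growth of the polling interval is easy to see; DIV_ROUND_UP is re-expressed in plain C and HZ=1000 is an assumption made for the example.

    #include <stdio.h>

    #define HZ 1000                 /* assumed jiffy rate for the example */

    /* Same rounding as the kernel's DIV_ROUND_UP(n, d). */
    static int div_round_up(int n, int d)
    {
        return (n + d - 1) / d;
    }

    int main(void)
    {
        int seconds = 5;
        int jf_left = seconds * HZ;
        int sleep_jf = 1;
        int rounds = 0;

        /* Mirror the loop in cpu_wait_death(): sleep, then grow by ~10%. */
        while (jf_left > 0) {
            rounds++;
            printf("round %2d: sleep %4d jiffies (%d left)\n",
                   rounds, sleep_jf, jf_left);
            jf_left -= sleep_jf;
            sleep_jf = div_round_up(sleep_jf * 11, 10);
        }
        printf("gave up after %d polling rounds\n", rounds);
        return 0;
    }
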
diff --git a/kernel/sys.c b/kernel/sys.c index a03d9cd23ed7..a4e372b798a5 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -325,6 +325,7 @@ out_unlock: | |||
325 | * SMP: There are not races, the GIDs are checked only by filesystem | 325 | * SMP: There are not races, the GIDs are checked only by filesystem |
326 | * operations (as far as semantic preservation is concerned). | 326 | * operations (as far as semantic preservation is concerned). |
327 | */ | 327 | */ |
328 | #ifdef CONFIG_MULTIUSER | ||
328 | SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) | 329 | SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) |
329 | { | 330 | { |
330 | struct user_namespace *ns = current_user_ns(); | 331 | struct user_namespace *ns = current_user_ns(); |
@@ -815,6 +816,7 @@ change_okay: | |||
815 | commit_creds(new); | 816 | commit_creds(new); |
816 | return old_fsgid; | 817 | return old_fsgid; |
817 | } | 818 | } |
819 | #endif /* CONFIG_MULTIUSER */ | ||
818 | 820 | ||
819 | /** | 821 | /** |
820 | * sys_getpid - return the thread group id of the current process | 822 | * sys_getpid - return the thread group id of the current process |
@@ -1647,14 +1649,13 @@ SYSCALL_DEFINE1(umask, int, mask) | |||
1647 | return mask; | 1649 | return mask; |
1648 | } | 1650 | } |
1649 | 1651 | ||
1650 | static int prctl_set_mm_exe_file_locked(struct mm_struct *mm, unsigned int fd) | 1652 | static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) |
1651 | { | 1653 | { |
1652 | struct fd exe; | 1654 | struct fd exe; |
1655 | struct file *old_exe, *exe_file; | ||
1653 | struct inode *inode; | 1656 | struct inode *inode; |
1654 | int err; | 1657 | int err; |
1655 | 1658 | ||
1656 | VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm); | ||
1657 | |||
1658 | exe = fdget(fd); | 1659 | exe = fdget(fd); |
1659 | if (!exe.file) | 1660 | if (!exe.file) |
1660 | return -EBADF; | 1661 | return -EBADF; |
@@ -1678,15 +1679,22 @@ static int prctl_set_mm_exe_file_locked(struct mm_struct *mm, unsigned int fd) | |||
1678 | /* | 1679 | /* |
1679 | * Forbid mm->exe_file change if old file still mapped. | 1680 | * Forbid mm->exe_file change if old file still mapped. |
1680 | */ | 1681 | */ |
1682 | exe_file = get_mm_exe_file(mm); | ||
1681 | err = -EBUSY; | 1683 | err = -EBUSY; |
1682 | if (mm->exe_file) { | 1684 | if (exe_file) { |
1683 | struct vm_area_struct *vma; | 1685 | struct vm_area_struct *vma; |
1684 | 1686 | ||
1685 | for (vma = mm->mmap; vma; vma = vma->vm_next) | 1687 | down_read(&mm->mmap_sem); |
1686 | if (vma->vm_file && | 1688 | for (vma = mm->mmap; vma; vma = vma->vm_next) { |
1687 | path_equal(&vma->vm_file->f_path, | 1689 | if (!vma->vm_file) |
1688 | &mm->exe_file->f_path)) | 1690 | continue; |
1689 | goto exit; | 1691 | if (path_equal(&vma->vm_file->f_path, |
1692 | &exe_file->f_path)) | ||
1693 | goto exit_err; | ||
1694 | } | ||
1695 | |||
1696 | up_read(&mm->mmap_sem); | ||
1697 | fput(exe_file); | ||
1690 | } | 1698 | } |
1691 | 1699 | ||
1692 | /* | 1700 | /* |
@@ -1700,10 +1708,18 @@ static int prctl_set_mm_exe_file_locked(struct mm_struct *mm, unsigned int fd) | |||
1700 | goto exit; | 1708 | goto exit; |
1701 | 1709 | ||
1702 | err = 0; | 1710 | err = 0; |
1703 | set_mm_exe_file(mm, exe.file); /* this grabs a reference to exe.file */ | 1711 | /* set the new file, lockless */ |
1712 | get_file(exe.file); | ||
1713 | old_exe = xchg(&mm->exe_file, exe.file); | ||
1714 | if (old_exe) | ||
1715 | fput(old_exe); | ||
1704 | exit: | 1716 | exit: |
1705 | fdput(exe); | 1717 | fdput(exe); |
1706 | return err; | 1718 | return err; |
1719 | exit_err: | ||
1720 | up_read(&mm->mmap_sem); | ||
1721 | fput(exe_file); | ||
1722 | goto exit; | ||
1707 | } | 1723 | } |
1708 | 1724 | ||
1709 | #ifdef CONFIG_CHECKPOINT_RESTORE | 1725 | #ifdef CONFIG_CHECKPOINT_RESTORE |
@@ -1838,10 +1854,9 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data | |||
1838 | user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL; | 1854 | user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL; |
1839 | } | 1855 | } |
1840 | 1856 | ||
1841 | down_write(&mm->mmap_sem); | ||
1842 | if (prctl_map.exe_fd != (u32)-1) | 1857 | if (prctl_map.exe_fd != (u32)-1) |
1843 | error = prctl_set_mm_exe_file_locked(mm, prctl_map.exe_fd); | 1858 | error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd); |
1844 | downgrade_write(&mm->mmap_sem); | 1859 | down_read(&mm->mmap_sem); |
1845 | if (error) | 1860 | if (error) |
1846 | goto out; | 1861 | goto out; |
1847 | 1862 | ||
@@ -1907,12 +1922,8 @@ static int prctl_set_mm(int opt, unsigned long addr, | |||
1907 | if (!capable(CAP_SYS_RESOURCE)) | 1922 | if (!capable(CAP_SYS_RESOURCE)) |
1908 | return -EPERM; | 1923 | return -EPERM; |
1909 | 1924 | ||
1910 | if (opt == PR_SET_MM_EXE_FILE) { | 1925 | if (opt == PR_SET_MM_EXE_FILE) |
1911 | down_write(&mm->mmap_sem); | 1926 | return prctl_set_mm_exe_file(mm, (unsigned int)addr); |
1912 | error = prctl_set_mm_exe_file_locked(mm, (unsigned int)addr); | ||
1913 | up_write(&mm->mmap_sem); | ||
1914 | return error; | ||
1915 | } | ||
1916 | 1927 | ||
1917 | if (addr >= TASK_SIZE || addr < mmap_min_addr) | 1928 | if (addr >= TASK_SIZE || addr < mmap_min_addr) |
1918 | return -EINVAL; | 1929 | return -EINVAL; |
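
Note on the sys.c hunks above: prctl_set_mm_exe_file() no longer needs mmap_sem held for write around the exe_file update; it takes a reference on the new file, swaps the pointer with xchg(), and drops the old reference afterwards, so concurrent readers always see either the old or the new file, never a freed one. The fragment below is only a user-space sketch of that publish-and-drop pattern with C11 atomics; refcounting is reduced to malloc/free, and the struct and helper names are invented for the illustration.

    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct fake_file {
        char path[64];
    };

    static _Atomic(struct fake_file *) exe_file;   /* stands in for mm->exe_file */

    /* Swap in a new file pointer and release whatever was there before. */
    static void set_exe_file(struct fake_file *newf)
    {
        struct fake_file *old = atomic_exchange(&exe_file, newf);

        if (old)
            free(old);              /* the kernel does fput(old_exe) instead */
    }

    static struct fake_file *make_file(const char *path)
    {
        struct fake_file *f = malloc(sizeof(*f));

        snprintf(f->path, sizeof(f->path), "%s", path);
        return f;
    }

    int main(void)
    {
        set_exe_file(make_file("/usr/bin/old"));
        set_exe_file(make_file("/usr/bin/new"));   /* old one released here */

        printf("exe_file now: %s\n", atomic_load(&exe_file)->path);
        free(atomic_load(&exe_file));
        return 0;
    }
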
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 5adcb0ae3a58..7995ef5868d8 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c | |||
@@ -159,6 +159,20 @@ cond_syscall(sys_uselib); | |||
159 | cond_syscall(sys_fadvise64); | 159 | cond_syscall(sys_fadvise64); |
160 | cond_syscall(sys_fadvise64_64); | 160 | cond_syscall(sys_fadvise64_64); |
161 | cond_syscall(sys_madvise); | 161 | cond_syscall(sys_madvise); |
162 | cond_syscall(sys_setuid); | ||
163 | cond_syscall(sys_setregid); | ||
164 | cond_syscall(sys_setgid); | ||
165 | cond_syscall(sys_setreuid); | ||
166 | cond_syscall(sys_setresuid); | ||
167 | cond_syscall(sys_getresuid); | ||
168 | cond_syscall(sys_setresgid); | ||
169 | cond_syscall(sys_getresgid); | ||
170 | cond_syscall(sys_setgroups); | ||
171 | cond_syscall(sys_getgroups); | ||
172 | cond_syscall(sys_setfsuid); | ||
173 | cond_syscall(sys_setfsgid); | ||
174 | cond_syscall(sys_capget); | ||
175 | cond_syscall(sys_capset); | ||
162 | 176 | ||
163 | /* arch-specific weak syscall entries */ | 177 | /* arch-specific weak syscall entries */ |
164 | cond_syscall(sys_pciconfig_read); | 178 | cond_syscall(sys_pciconfig_read); |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 88ea2d6e0031..2082b1a88fb9 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -19,6 +19,7 @@ | |||
19 | */ | 19 | */ |
20 | 20 | ||
21 | #include <linux/module.h> | 21 | #include <linux/module.h> |
22 | #include <linux/aio.h> | ||
22 | #include <linux/mm.h> | 23 | #include <linux/mm.h> |
23 | #include <linux/swap.h> | 24 | #include <linux/swap.h> |
24 | #include <linux/slab.h> | 25 | #include <linux/slab.h> |
@@ -92,11 +93,9 @@ | |||
92 | #include <linux/nmi.h> | 93 | #include <linux/nmi.h> |
93 | #endif | 94 | #endif |
94 | 95 | ||
95 | |||
96 | #if defined(CONFIG_SYSCTL) | 96 | #if defined(CONFIG_SYSCTL) |
97 | 97 | ||
98 | /* External variables not in a header file. */ | 98 | /* External variables not in a header file. */ |
99 | extern int max_threads; | ||
100 | extern int suid_dumpable; | 99 | extern int suid_dumpable; |
101 | #ifdef CONFIG_COREDUMP | 100 | #ifdef CONFIG_COREDUMP |
102 | extern int core_uses_pid; | 101 | extern int core_uses_pid; |
@@ -709,10 +708,10 @@ static struct ctl_table kern_table[] = { | |||
709 | #endif | 708 | #endif |
710 | { | 709 | { |
711 | .procname = "threads-max", | 710 | .procname = "threads-max", |
712 | .data = &max_threads, | 711 | .data = NULL, |
713 | .maxlen = sizeof(int), | 712 | .maxlen = sizeof(int), |
714 | .mode = 0644, | 713 | .mode = 0644, |
715 | .proc_handler = proc_dointvec, | 714 | .proc_handler = sysctl_max_threads, |
716 | }, | 715 | }, |
717 | { | 716 | { |
718 | .procname = "random", | 717 | .procname = "random", |
@@ -846,7 +845,7 @@ static struct ctl_table kern_table[] = { | |||
846 | .data = &watchdog_user_enabled, | 845 | .data = &watchdog_user_enabled, |
847 | .maxlen = sizeof (int), | 846 | .maxlen = sizeof (int), |
848 | .mode = 0644, | 847 | .mode = 0644, |
849 | .proc_handler = proc_dowatchdog, | 848 | .proc_handler = proc_watchdog, |
850 | .extra1 = &zero, | 849 | .extra1 = &zero, |
851 | .extra2 = &one, | 850 | .extra2 = &one, |
852 | }, | 851 | }, |
@@ -855,11 +854,33 @@ static struct ctl_table kern_table[] = { | |||
855 | .data = &watchdog_thresh, | 854 | .data = &watchdog_thresh, |
856 | .maxlen = sizeof(int), | 855 | .maxlen = sizeof(int), |
857 | .mode = 0644, | 856 | .mode = 0644, |
858 | .proc_handler = proc_dowatchdog, | 857 | .proc_handler = proc_watchdog_thresh, |
859 | .extra1 = &zero, | 858 | .extra1 = &zero, |
860 | .extra2 = &sixty, | 859 | .extra2 = &sixty, |
861 | }, | 860 | }, |
862 | { | 861 | { |
862 | .procname = "nmi_watchdog", | ||
863 | .data = &nmi_watchdog_enabled, | ||
864 | .maxlen = sizeof (int), | ||
865 | .mode = 0644, | ||
866 | .proc_handler = proc_nmi_watchdog, | ||
867 | .extra1 = &zero, | ||
868 | #if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR) | ||
869 | .extra2 = &one, | ||
870 | #else | ||
871 | .extra2 = &zero, | ||
872 | #endif | ||
873 | }, | ||
874 | { | ||
875 | .procname = "soft_watchdog", | ||
876 | .data = &soft_watchdog_enabled, | ||
877 | .maxlen = sizeof (int), | ||
878 | .mode = 0644, | ||
879 | .proc_handler = proc_soft_watchdog, | ||
880 | .extra1 = &zero, | ||
881 | .extra2 = &one, | ||
882 | }, | ||
883 | { | ||
863 | .procname = "softlockup_panic", | 884 | .procname = "softlockup_panic", |
864 | .data = &softlockup_panic, | 885 | .data = &softlockup_panic, |
865 | .maxlen = sizeof(int), | 886 | .maxlen = sizeof(int), |
@@ -879,15 +900,6 @@ static struct ctl_table kern_table[] = { | |||
879 | .extra2 = &one, | 900 | .extra2 = &one, |
880 | }, | 901 | }, |
881 | #endif /* CONFIG_SMP */ | 902 | #endif /* CONFIG_SMP */ |
882 | { | ||
883 | .procname = "nmi_watchdog", | ||
884 | .data = &watchdog_user_enabled, | ||
885 | .maxlen = sizeof (int), | ||
886 | .mode = 0644, | ||
887 | .proc_handler = proc_dowatchdog, | ||
888 | .extra1 = &zero, | ||
889 | .extra2 = &one, | ||
890 | }, | ||
891 | #endif | 903 | #endif |
892 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) | 904 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) |
893 | { | 905 | { |
@@ -1228,6 +1240,14 @@ static struct ctl_table vm_table[] = { | |||
1228 | .extra1 = &zero, | 1240 | .extra1 = &zero, |
1229 | }, | 1241 | }, |
1230 | { | 1242 | { |
1243 | .procname = "dirtytime_expire_seconds", | ||
1244 | .data = &dirtytime_expire_interval, | ||
1245 | .maxlen = sizeof(dirty_expire_interval), | ||
1246 | .mode = 0644, | ||
1247 | .proc_handler = dirtytime_interval_handler, | ||
1248 | .extra1 = &zero, | ||
1249 | }, | ||
1250 | { | ||
1231 | .procname = "nr_pdflush_threads", | 1251 | .procname = "nr_pdflush_threads", |
1232 | .mode = 0444 /* read-only */, | 1252 | .mode = 0444 /* read-only */, |
1233 | .proc_handler = pdflush_proc_obsolete, | 1253 | .proc_handler = pdflush_proc_obsolete, |
@@ -1313,6 +1333,15 @@ static struct ctl_table vm_table[] = { | |||
1313 | .extra1 = &min_extfrag_threshold, | 1333 | .extra1 = &min_extfrag_threshold, |
1314 | .extra2 = &max_extfrag_threshold, | 1334 | .extra2 = &max_extfrag_threshold, |
1315 | }, | 1335 | }, |
1336 | { | ||
1337 | .procname = "compact_unevictable_allowed", | ||
1338 | .data = &sysctl_compact_unevictable_allowed, | ||
1339 | .maxlen = sizeof(int), | ||
1340 | .mode = 0644, | ||
1341 | .proc_handler = proc_dointvec, | ||
1342 | .extra1 = &zero, | ||
1343 | .extra2 = &one, | ||
1344 | }, | ||
1316 | 1345 | ||
1317 | #endif /* CONFIG_COMPACTION */ | 1346 | #endif /* CONFIG_COMPACTION */ |
1318 | { | 1347 | { |
@@ -1952,7 +1981,15 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp, | |||
1952 | int write, void *data) | 1981 | int write, void *data) |
1953 | { | 1982 | { |
1954 | if (write) { | 1983 | if (write) { |
1955 | *valp = *negp ? -*lvalp : *lvalp; | 1984 | if (*negp) { |
1985 | if (*lvalp > (unsigned long) INT_MAX + 1) | ||
1986 | return -EINVAL; | ||
1987 | *valp = -*lvalp; | ||
1988 | } else { | ||
1989 | if (*lvalp > (unsigned long) INT_MAX) | ||
1990 | return -EINVAL; | ||
1991 | *valp = *lvalp; | ||
1992 | } | ||
1956 | } else { | 1993 | } else { |
1957 | int val = *valp; | 1994 | int val = *valp; |
1958 | if (val < 0) { | 1995 | if (val < 0) { |
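
Note on the do_proc_dointvec_conv() hunk above: writes that do not fit in an int are now rejected instead of silently truncated; a negated magnitude may be at most INT_MAX + 1 (so INT_MIN stays representable) and a positive one at most INT_MAX. A small self-contained version of that check is sketched below with a couple of boundary cases; the function and variable names are mine, not the kernel's.

    #include <limits.h>
    #include <stdbool.h>
    #include <stdio.h>

    /*
     * Convert a parsed (sign, magnitude) pair to an int, refusing anything
     * outside [INT_MIN, INT_MAX] -- the same bounds the sysctl hunk enforces.
     */
    static bool convert_to_int(bool negative, unsigned long magnitude, int *out)
    {
        if (negative) {
            if (magnitude > (unsigned long)INT_MAX + 1)
                return false;
            /* magnitude <= INT_MAX + 1, so the negation fits in an int */
            *out = (magnitude == (unsigned long)INT_MAX + 1) ? INT_MIN
                                                             : -(int)magnitude;
        } else {
            if (magnitude > (unsigned long)INT_MAX)
                return false;
            *out = (int)magnitude;
        }
        return true;
    }

    int main(void)
    {
        int v;

        printf("-2147483648 accepted: %d\n", convert_to_int(true,  2147483648UL, &v));
        printf("-2147483649 accepted: %d\n", convert_to_int(true,  2147483649UL, &v));
        printf(" 2147483647 accepted: %d\n", convert_to_int(false, 2147483647UL, &v));
        printf(" 2147483648 accepted: %d\n", convert_to_int(false, 2147483648UL, &v));
        return 0;
    }
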
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index d626dc98e8df..579ce1b929af 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig | |||
@@ -33,12 +33,6 @@ config ARCH_USES_GETTIMEOFFSET | |||
33 | config GENERIC_CLOCKEVENTS | 33 | config GENERIC_CLOCKEVENTS |
34 | bool | 34 | bool |
35 | 35 | ||
36 | # Migration helper. Builds, but does not invoke | ||
37 | config GENERIC_CLOCKEVENTS_BUILD | ||
38 | bool | ||
39 | default y | ||
40 | depends on GENERIC_CLOCKEVENTS | ||
41 | |||
42 | # Architecture can handle broadcast in a driver-agnostic way | 36 | # Architecture can handle broadcast in a driver-agnostic way |
43 | config ARCH_HAS_TICK_BROADCAST | 37 | config ARCH_HAS_TICK_BROADCAST |
44 | bool | 38 | bool |
diff --git a/kernel/time/Makefile b/kernel/time/Makefile index c09c07817d7a..01f0312419b3 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile | |||
@@ -2,15 +2,13 @@ obj-y += time.o timer.o hrtimer.o itimer.o posix-timers.o posix-cpu-timers.o | |||
2 | obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o | 2 | obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o |
3 | obj-y += timeconv.o timecounter.o posix-clock.o alarmtimer.o | 3 | obj-y += timeconv.o timecounter.o posix-clock.o alarmtimer.o |
4 | 4 | ||
5 | obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o | 5 | obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o tick-common.o |
6 | obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o | ||
7 | ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y) | 6 | ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y) |
8 | obj-y += tick-broadcast.o | 7 | obj-y += tick-broadcast.o |
9 | obj-$(CONFIG_TICK_ONESHOT) += tick-broadcast-hrtimer.o | 8 | obj-$(CONFIG_TICK_ONESHOT) += tick-broadcast-hrtimer.o |
10 | endif | 9 | endif |
11 | obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o | 10 | obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o |
12 | obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o | 11 | obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o tick-sched.o |
13 | obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o | ||
14 | obj-$(CONFIG_TIMER_STATS) += timer_stats.o | 12 | obj-$(CONFIG_TIMER_STATS) += timer_stats.o |
15 | obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o | 13 | obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o |
16 | obj-$(CONFIG_TEST_UDELAY) += test_udelay.o | 14 | obj-$(CONFIG_TEST_UDELAY) += test_udelay.o |
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 888ecc114ddc..11dc22a6983b 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c | |||
@@ -94,25 +94,76 @@ u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt) | |||
94 | } | 94 | } |
95 | EXPORT_SYMBOL_GPL(clockevent_delta2ns); | 95 | EXPORT_SYMBOL_GPL(clockevent_delta2ns); |
96 | 96 | ||
97 | static int __clockevents_set_state(struct clock_event_device *dev, | ||
98 | enum clock_event_state state) | ||
99 | { | ||
100 | /* Transition with legacy set_mode() callback */ | ||
101 | if (dev->set_mode) { | ||
102 | /* Legacy callback doesn't support new modes */ | ||
103 | if (state > CLOCK_EVT_STATE_ONESHOT) | ||
104 | return -ENOSYS; | ||
105 | /* | ||
106 | * 'clock_event_state' and 'clock_event_mode' have 1-to-1 | ||
107 | * mapping until *_ONESHOT, and so a simple cast will work. | ||
108 | */ | ||
109 | dev->set_mode((enum clock_event_mode)state, dev); | ||
110 | dev->mode = (enum clock_event_mode)state; | ||
111 | return 0; | ||
112 | } | ||
113 | |||
114 | if (dev->features & CLOCK_EVT_FEAT_DUMMY) | ||
115 | return 0; | ||
116 | |||
117 | /* Transition with new state-specific callbacks */ | ||
118 | switch (state) { | ||
119 | case CLOCK_EVT_STATE_DETACHED: | ||
120 | /* | ||
121 | * This is an internal state, which is guaranteed to go from | ||
122 | * SHUTDOWN to DETACHED. No driver interaction required. | ||
123 | */ | ||
124 | return 0; | ||
125 | |||
126 | case CLOCK_EVT_STATE_SHUTDOWN: | ||
127 | return dev->set_state_shutdown(dev); | ||
128 | |||
129 | case CLOCK_EVT_STATE_PERIODIC: | ||
130 | /* Core internal bug */ | ||
131 | if (!(dev->features & CLOCK_EVT_FEAT_PERIODIC)) | ||
132 | return -ENOSYS; | ||
133 | return dev->set_state_periodic(dev); | ||
134 | |||
135 | case CLOCK_EVT_STATE_ONESHOT: | ||
136 | /* Core internal bug */ | ||
137 | if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT)) | ||
138 | return -ENOSYS; | ||
139 | return dev->set_state_oneshot(dev); | ||
140 | |||
141 | default: | ||
142 | return -ENOSYS; | ||
143 | } | ||
144 | } | ||
145 | |||
97 | /** | 146 | /** |
98 | * clockevents_set_mode - set the operating mode of a clock event device | 147 | * clockevents_set_state - set the operating state of a clock event device |
99 | * @dev: device to modify | 148 | * @dev: device to modify |
100 | * @mode: new mode | 149 | * @state: new state |
101 | * | 150 | * |
102 | * Must be called with interrupts disabled ! | 151 | * Must be called with interrupts disabled ! |
103 | */ | 152 | */ |
104 | void clockevents_set_mode(struct clock_event_device *dev, | 153 | void clockevents_set_state(struct clock_event_device *dev, |
105 | enum clock_event_mode mode) | 154 | enum clock_event_state state) |
106 | { | 155 | { |
107 | if (dev->mode != mode) { | 156 | if (dev->state != state) { |
108 | dev->set_mode(mode, dev); | 157 | if (__clockevents_set_state(dev, state)) |
109 | dev->mode = mode; | 158 | return; |
159 | |||
160 | dev->state = state; | ||
110 | 161 | ||
111 | /* | 162 | /* |
112 | * A nsec2cyc multiplicator of 0 is invalid and we'd crash | 163 | * A nsec2cyc multiplicator of 0 is invalid and we'd crash |
113 | * on it, so fix it up and emit a warning: | 164 | * on it, so fix it up and emit a warning: |
114 | */ | 165 | */ |
115 | if (mode == CLOCK_EVT_MODE_ONESHOT) { | 166 | if (state == CLOCK_EVT_STATE_ONESHOT) { |
116 | if (unlikely(!dev->mult)) { | 167 | if (unlikely(!dev->mult)) { |
117 | dev->mult = 1; | 168 | dev->mult = 1; |
118 | WARN_ON(1); | 169 | WARN_ON(1); |
@@ -127,10 +178,28 @@ void clockevents_set_mode(struct clock_event_device *dev, | |||
127 | */ | 178 | */ |
128 | void clockevents_shutdown(struct clock_event_device *dev) | 179 | void clockevents_shutdown(struct clock_event_device *dev) |
129 | { | 180 | { |
130 | clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); | 181 | clockevents_set_state(dev, CLOCK_EVT_STATE_SHUTDOWN); |
131 | dev->next_event.tv64 = KTIME_MAX; | 182 | dev->next_event.tv64 = KTIME_MAX; |
132 | } | 183 | } |
133 | 184 | ||
185 | /** | ||
186 | * clockevents_tick_resume - Resume the tick device before using it again | ||
187 | * @dev: device to resume | ||
188 | */ | ||
189 | int clockevents_tick_resume(struct clock_event_device *dev) | ||
190 | { | ||
191 | int ret = 0; | ||
192 | |||
193 | if (dev->set_mode) { | ||
194 | dev->set_mode(CLOCK_EVT_MODE_RESUME, dev); | ||
195 | dev->mode = CLOCK_EVT_MODE_RESUME; | ||
196 | } else if (dev->tick_resume) { | ||
197 | ret = dev->tick_resume(dev); | ||
198 | } | ||
199 | |||
200 | return ret; | ||
201 | } | ||
202 | |||
134 | #ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST | 203 | #ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST |
135 | 204 | ||
136 | /* Limit min_delta to a jiffie */ | 205 | /* Limit min_delta to a jiffie */ |
@@ -183,7 +252,7 @@ static int clockevents_program_min_delta(struct clock_event_device *dev) | |||
183 | delta = dev->min_delta_ns; | 252 | delta = dev->min_delta_ns; |
184 | dev->next_event = ktime_add_ns(ktime_get(), delta); | 253 | dev->next_event = ktime_add_ns(ktime_get(), delta); |
185 | 254 | ||
186 | if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) | 255 | if (dev->state == CLOCK_EVT_STATE_SHUTDOWN) |
187 | return 0; | 256 | return 0; |
188 | 257 | ||
189 | dev->retries++; | 258 | dev->retries++; |
@@ -220,7 +289,7 @@ static int clockevents_program_min_delta(struct clock_event_device *dev) | |||
220 | delta = dev->min_delta_ns; | 289 | delta = dev->min_delta_ns; |
221 | dev->next_event = ktime_add_ns(ktime_get(), delta); | 290 | dev->next_event = ktime_add_ns(ktime_get(), delta); |
222 | 291 | ||
223 | if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) | 292 | if (dev->state == CLOCK_EVT_STATE_SHUTDOWN) |
224 | return 0; | 293 | return 0; |
225 | 294 | ||
226 | dev->retries++; | 295 | dev->retries++; |
@@ -252,7 +321,7 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, | |||
252 | 321 | ||
253 | dev->next_event = expires; | 322 | dev->next_event = expires; |
254 | 323 | ||
255 | if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) | 324 | if (dev->state == CLOCK_EVT_STATE_SHUTDOWN) |
256 | return 0; | 325 | return 0; |
257 | 326 | ||
258 | /* Shortcut for clockevent devices that can deal with ktime. */ | 327 | /* Shortcut for clockevent devices that can deal with ktime. */ |
@@ -297,7 +366,7 @@ static int clockevents_replace(struct clock_event_device *ced) | |||
297 | struct clock_event_device *dev, *newdev = NULL; | 366 | struct clock_event_device *dev, *newdev = NULL; |
298 | 367 | ||
299 | list_for_each_entry(dev, &clockevent_devices, list) { | 368 | list_for_each_entry(dev, &clockevent_devices, list) { |
300 | if (dev == ced || dev->mode != CLOCK_EVT_MODE_UNUSED) | 369 | if (dev == ced || dev->state != CLOCK_EVT_STATE_DETACHED) |
301 | continue; | 370 | continue; |
302 | 371 | ||
303 | if (!tick_check_replacement(newdev, dev)) | 372 | if (!tick_check_replacement(newdev, dev)) |
@@ -323,7 +392,7 @@ static int clockevents_replace(struct clock_event_device *ced) | |||
323 | static int __clockevents_try_unbind(struct clock_event_device *ced, int cpu) | 392 | static int __clockevents_try_unbind(struct clock_event_device *ced, int cpu) |
324 | { | 393 | { |
325 | /* Fast track. Device is unused */ | 394 | /* Fast track. Device is unused */ |
326 | if (ced->mode == CLOCK_EVT_MODE_UNUSED) { | 395 | if (ced->state == CLOCK_EVT_STATE_DETACHED) { |
327 | list_del_init(&ced->list); | 396 | list_del_init(&ced->list); |
328 | return 0; | 397 | return 0; |
329 | } | 398 | } |
@@ -373,6 +442,37 @@ int clockevents_unbind_device(struct clock_event_device *ced, int cpu) | |||
373 | } | 442 | } |
374 | EXPORT_SYMBOL_GPL(clockevents_unbind_device); | 443 | EXPORT_SYMBOL_GPL(clockevents_unbind_device); |
375 | 444 | ||
445 | /* Sanity check of state transition callbacks */ | ||
446 | static int clockevents_sanity_check(struct clock_event_device *dev) | ||
447 | { | ||
448 | /* Legacy set_mode() callback */ | ||
449 | if (dev->set_mode) { | ||
450 | /* We shouldn't be supporting new modes now */ | ||
451 | WARN_ON(dev->set_state_periodic || dev->set_state_oneshot || | ||
452 | dev->set_state_shutdown || dev->tick_resume); | ||
453 | |||
454 | BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); | ||
455 | return 0; | ||
456 | } | ||
457 | |||
458 | if (dev->features & CLOCK_EVT_FEAT_DUMMY) | ||
459 | return 0; | ||
460 | |||
461 | /* New state-specific callbacks */ | ||
462 | if (!dev->set_state_shutdown) | ||
463 | return -EINVAL; | ||
464 | |||
465 | if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) && | ||
466 | !dev->set_state_periodic) | ||
467 | return -EINVAL; | ||
468 | |||
469 | if ((dev->features & CLOCK_EVT_FEAT_ONESHOT) && | ||
470 | !dev->set_state_oneshot) | ||
471 | return -EINVAL; | ||
472 | |||
473 | return 0; | ||
474 | } | ||
475 | |||
376 | /** | 476 | /** |
377 | * clockevents_register_device - register a clock event device | 477 | * clockevents_register_device - register a clock event device |
378 | * @dev: device to register | 478 | * @dev: device to register |
@@ -381,7 +481,11 @@ void clockevents_register_device(struct clock_event_device *dev) | |||
381 | { | 481 | { |
382 | unsigned long flags; | 482 | unsigned long flags; |
383 | 483 | ||
384 | BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); | 484 | BUG_ON(clockevents_sanity_check(dev)); |
485 | |||
486 | /* Initialize state to DETACHED */ | ||
487 | dev->state = CLOCK_EVT_STATE_DETACHED; | ||
488 | |||
385 | if (!dev->cpumask) { | 489 | if (!dev->cpumask) { |
386 | WARN_ON(num_possible_cpus() > 1); | 490 | WARN_ON(num_possible_cpus() > 1); |
387 | dev->cpumask = cpumask_of(smp_processor_id()); | 491 | dev->cpumask = cpumask_of(smp_processor_id()); |
@@ -445,11 +549,11 @@ int __clockevents_update_freq(struct clock_event_device *dev, u32 freq) | |||
445 | { | 549 | { |
446 | clockevents_config(dev, freq); | 550 | clockevents_config(dev, freq); |
447 | 551 | ||
448 | if (dev->mode == CLOCK_EVT_MODE_ONESHOT) | 552 | if (dev->state == CLOCK_EVT_STATE_ONESHOT) |
449 | return clockevents_program_event(dev, dev->next_event, false); | 553 | return clockevents_program_event(dev, dev->next_event, false); |
450 | 554 | ||
451 | if (dev->mode == CLOCK_EVT_MODE_PERIODIC) | 555 | if (dev->state == CLOCK_EVT_STATE_PERIODIC) |
452 | dev->set_mode(CLOCK_EVT_MODE_PERIODIC, dev); | 556 | return __clockevents_set_state(dev, CLOCK_EVT_STATE_PERIODIC); |
453 | 557 | ||
454 | return 0; | 558 | return 0; |
455 | } | 559 | } |
@@ -491,30 +595,27 @@ void clockevents_handle_noop(struct clock_event_device *dev) | |||
491 | * @old: device to release (can be NULL) | 595 | * @old: device to release (can be NULL) |
492 | * @new: device to request (can be NULL) | 596 | * @new: device to request (can be NULL) |
493 | * | 597 | * |
494 | * Called from the notifier chain. clockevents_lock is held already | 598 | * Called from various tick functions with clockevents_lock held and |
599 | * interrupts disabled. | ||
495 | */ | 600 | */ |
496 | void clockevents_exchange_device(struct clock_event_device *old, | 601 | void clockevents_exchange_device(struct clock_event_device *old, |
497 | struct clock_event_device *new) | 602 | struct clock_event_device *new) |
498 | { | 603 | { |
499 | unsigned long flags; | ||
500 | |||
501 | local_irq_save(flags); | ||
502 | /* | 604 | /* |
503 | * Caller releases a clock event device. We queue it into the | 605 | * Caller releases a clock event device. We queue it into the |
504 | * released list and do a notify add later. | 606 | * released list and do a notify add later. |
505 | */ | 607 | */ |
506 | if (old) { | 608 | if (old) { |
507 | module_put(old->owner); | 609 | module_put(old->owner); |
508 | clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED); | 610 | clockevents_set_state(old, CLOCK_EVT_STATE_DETACHED); |
509 | list_del(&old->list); | 611 | list_del(&old->list); |
510 | list_add(&old->list, &clockevents_released); | 612 | list_add(&old->list, &clockevents_released); |
511 | } | 613 | } |
512 | 614 | ||
513 | if (new) { | 615 | if (new) { |
514 | BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED); | 616 | BUG_ON(new->state != CLOCK_EVT_STATE_DETACHED); |
515 | clockevents_shutdown(new); | 617 | clockevents_shutdown(new); |
516 | } | 618 | } |
517 | local_irq_restore(flags); | ||
518 | } | 619 | } |
519 | 620 | ||
520 | /** | 621 | /** |
@@ -541,74 +642,40 @@ void clockevents_resume(void) | |||
541 | dev->resume(dev); | 642 | dev->resume(dev); |
542 | } | 643 | } |
543 | 644 | ||
544 | #ifdef CONFIG_GENERIC_CLOCKEVENTS | 645 | #ifdef CONFIG_HOTPLUG_CPU |
545 | /** | 646 | /** |
546 | * clockevents_notify - notification about relevant events | 647 | * tick_cleanup_dead_cpu - Cleanup the tick and clockevents of a dead cpu |
547 | * Returns 0 on success, any other value on error | ||
548 | */ | 648 | */ |
549 | int clockevents_notify(unsigned long reason, void *arg) | 649 | void tick_cleanup_dead_cpu(int cpu) |
550 | { | 650 | { |
551 | struct clock_event_device *dev, *tmp; | 651 | struct clock_event_device *dev, *tmp; |
552 | unsigned long flags; | 652 | unsigned long flags; |
553 | int cpu, ret = 0; | ||
554 | 653 | ||
555 | raw_spin_lock_irqsave(&clockevents_lock, flags); | 654 | raw_spin_lock_irqsave(&clockevents_lock, flags); |
556 | 655 | ||
557 | switch (reason) { | 656 | tick_shutdown_broadcast_oneshot(cpu); |
558 | case CLOCK_EVT_NOTIFY_BROADCAST_ON: | 657 | tick_shutdown_broadcast(cpu); |
559 | case CLOCK_EVT_NOTIFY_BROADCAST_OFF: | 658 | tick_shutdown(cpu); |
560 | case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: | 659 | /* |
561 | tick_broadcast_on_off(reason, arg); | 660 | * Unregister the clock event devices which were |
562 | break; | 661 | * released from the users in the notify chain. |
563 | 662 | */ | |
564 | case CLOCK_EVT_NOTIFY_BROADCAST_ENTER: | 663 | list_for_each_entry_safe(dev, tmp, &clockevents_released, list) |
565 | case CLOCK_EVT_NOTIFY_BROADCAST_EXIT: | 664 | list_del(&dev->list); |
566 | ret = tick_broadcast_oneshot_control(reason); | 665 | /* |
567 | break; | 666 | * Now check whether the CPU has left unused per cpu devices |
568 | 667 | */ | |
569 | case CLOCK_EVT_NOTIFY_CPU_DYING: | 668 | list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) { |
570 | tick_handover_do_timer(arg); | 669 | if (cpumask_test_cpu(cpu, dev->cpumask) && |
571 | break; | 670 | cpumask_weight(dev->cpumask) == 1 && |
572 | 671 | !tick_is_broadcast_device(dev)) { | |
573 | case CLOCK_EVT_NOTIFY_SUSPEND: | 672 | BUG_ON(dev->state != CLOCK_EVT_STATE_DETACHED); |
574 | tick_suspend(); | ||
575 | tick_suspend_broadcast(); | ||
576 | break; | ||
577 | |||
578 | case CLOCK_EVT_NOTIFY_RESUME: | ||
579 | tick_resume(); | ||
580 | break; | ||
581 | |||
582 | case CLOCK_EVT_NOTIFY_CPU_DEAD: | ||
583 | tick_shutdown_broadcast_oneshot(arg); | ||
584 | tick_shutdown_broadcast(arg); | ||
585 | tick_shutdown(arg); | ||
586 | /* | ||
587 | * Unregister the clock event devices which were | ||
588 | * released from the users in the notify chain. | ||
589 | */ | ||
590 | list_for_each_entry_safe(dev, tmp, &clockevents_released, list) | ||
591 | list_del(&dev->list); | 673 | list_del(&dev->list); |
592 | /* | ||
593 | * Now check whether the CPU has left unused per cpu devices | ||
594 | */ | ||
595 | cpu = *((int *)arg); | ||
596 | list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) { | ||
597 | if (cpumask_test_cpu(cpu, dev->cpumask) && | ||
598 | cpumask_weight(dev->cpumask) == 1 && | ||
599 | !tick_is_broadcast_device(dev)) { | ||
600 | BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); | ||
601 | list_del(&dev->list); | ||
602 | } | ||
603 | } | 674 | } |
604 | break; | ||
605 | default: | ||
606 | break; | ||
607 | } | 675 | } |
608 | raw_spin_unlock_irqrestore(&clockevents_lock, flags); | 676 | raw_spin_unlock_irqrestore(&clockevents_lock, flags); |
609 | return ret; | ||
610 | } | 677 | } |
611 | EXPORT_SYMBOL_GPL(clockevents_notify); | 678 | #endif |
612 | 679 | ||
613 | #ifdef CONFIG_SYSFS | 680 | #ifdef CONFIG_SYSFS |
614 | struct bus_type clockevents_subsys = { | 681 | struct bus_type clockevents_subsys = { |
@@ -727,5 +794,3 @@ static int __init clockevents_init_sysfs(void) | |||
727 | } | 794 | } |
728 | device_initcall(clockevents_init_sysfs); | 795 | device_initcall(clockevents_init_sysfs); |
729 | #endif /* SYSFS */ | 796 | #endif /* SYSFS */ |
730 | |||
731 | #endif /* GENERIC_CLOCK_EVENTS */ | ||
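
Note on the clockevents.c changes above: the single legacy set_mode(mode) callback gives way to per-state callbacks (set_state_shutdown/periodic/oneshot, tick_resume), with __clockevents_set_state() dispatching on the target state and clockevents_sanity_check() refusing devices that declare a feature but omit the matching callback. The sketch below models that dispatch in stand-alone C; the enum, struct and callbacks are simplified stand-ins, not the kernel's definitions.

    #include <stdio.h>

    enum state { STATE_DETACHED, STATE_SHUTDOWN, STATE_PERIODIC, STATE_ONESHOT };

    struct event_dev {
        const char *name;
        enum state state;
        int (*set_shutdown)(struct event_dev *);
        int (*set_periodic)(struct event_dev *);  /* NULL if unsupported */
        int (*set_oneshot)(struct event_dev *);   /* NULL if unsupported */
    };

    /* Dispatch to the per-state callback, like __clockevents_set_state(). */
    static int set_state(struct event_dev *dev, enum state state)
    {
        switch (state) {
        case STATE_DETACHED:
            return 0;               /* internal state, no driver call */
        case STATE_SHUTDOWN:
            return dev->set_shutdown(dev);
        case STATE_PERIODIC:
            return dev->set_periodic ? dev->set_periodic(dev) : -1;
        case STATE_ONESHOT:
            return dev->set_oneshot ? dev->set_oneshot(dev) : -1;
        }
        return -1;
    }

    static int shut(struct event_dev *d)    { printf("%s: shutdown\n", d->name); return 0; }
    static int oneshot(struct event_dev *d) { printf("%s: oneshot\n", d->name);  return 0; }

    int main(void)
    {
        struct event_dev dev = {
            .name = "demo-timer", .state = STATE_DETACHED,
            .set_shutdown = shut, .set_oneshot = oneshot,  /* no periodic */
        };

        if (set_state(&dev, STATE_ONESHOT) == 0)
            dev.state = STATE_ONESHOT;
        if (set_state(&dev, STATE_PERIODIC) != 0)
            printf("%s: periodic not supported, state unchanged\n", dev.name);
        return 0;
    }
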
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 4892352f0e49..15facb1b9c60 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c | |||
@@ -142,13 +142,6 @@ static void __clocksource_unstable(struct clocksource *cs) | |||
142 | schedule_work(&watchdog_work); | 142 | schedule_work(&watchdog_work); |
143 | } | 143 | } |
144 | 144 | ||
145 | static void clocksource_unstable(struct clocksource *cs, int64_t delta) | ||
146 | { | ||
147 | printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n", | ||
148 | cs->name, delta); | ||
149 | __clocksource_unstable(cs); | ||
150 | } | ||
151 | |||
152 | /** | 145 | /** |
153 | * clocksource_mark_unstable - mark clocksource unstable via watchdog | 146 | * clocksource_mark_unstable - mark clocksource unstable via watchdog |
154 | * @cs: clocksource to be marked unstable | 147 | * @cs: clocksource to be marked unstable |
@@ -174,7 +167,7 @@ void clocksource_mark_unstable(struct clocksource *cs) | |||
174 | static void clocksource_watchdog(unsigned long data) | 167 | static void clocksource_watchdog(unsigned long data) |
175 | { | 168 | { |
176 | struct clocksource *cs; | 169 | struct clocksource *cs; |
177 | cycle_t csnow, wdnow, delta; | 170 | cycle_t csnow, wdnow, cslast, wdlast, delta; |
178 | int64_t wd_nsec, cs_nsec; | 171 | int64_t wd_nsec, cs_nsec; |
179 | int next_cpu, reset_pending; | 172 | int next_cpu, reset_pending; |
180 | 173 | ||
@@ -213,6 +206,8 @@ static void clocksource_watchdog(unsigned long data) | |||
213 | 206 | ||
214 | delta = clocksource_delta(csnow, cs->cs_last, cs->mask); | 207 | delta = clocksource_delta(csnow, cs->cs_last, cs->mask); |
215 | cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift); | 208 | cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift); |
209 | wdlast = cs->wd_last; /* save these in case we print them */ | ||
210 | cslast = cs->cs_last; | ||
216 | cs->cs_last = csnow; | 211 | cs->cs_last = csnow; |
217 | cs->wd_last = wdnow; | 212 | cs->wd_last = wdnow; |
218 | 213 | ||
@@ -221,7 +216,12 @@ static void clocksource_watchdog(unsigned long data) | |||
221 | 216 | ||
222 | /* Check the deviation from the watchdog clocksource. */ | 217 | /* Check the deviation from the watchdog clocksource. */ |
223 | if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) { | 218 | if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) { |
224 | clocksource_unstable(cs, cs_nsec - wd_nsec); | 219 | pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable, because the skew is too large:\n", cs->name); |
220 | pr_warn(" '%s' wd_now: %llx wd_last: %llx mask: %llx\n", | ||
221 | watchdog->name, wdnow, wdlast, watchdog->mask); | ||
222 | pr_warn(" '%s' cs_now: %llx cs_last: %llx mask: %llx\n", | ||
223 | cs->name, csnow, cslast, cs->mask); | ||
224 | __clocksource_unstable(cs); | ||
225 | continue; | 225 | continue; |
226 | } | 226 | } |
227 | 227 | ||
@@ -469,26 +469,25 @@ static u32 clocksource_max_adjustment(struct clocksource *cs) | |||
469 | * @shift: cycle to nanosecond divisor (power of two) | 469 | * @shift: cycle to nanosecond divisor (power of two) |
470 | * @maxadj: maximum adjustment value to mult (~11%) | 470 | * @maxadj: maximum adjustment value to mult (~11%) |
471 | * @mask: bitmask for two's complement subtraction of non 64 bit counters | 471 | * @mask: bitmask for two's complement subtraction of non 64 bit counters |
472 | * @max_cyc: maximum cycle value before potential overflow (does not include | ||
473 | * any safety margin) | ||
474 | * | ||
475 | * NOTE: This function includes a safety margin of 50%, in other words, we | ||
476 | * return half the number of nanoseconds the hardware counter can technically | ||
477 | * cover. This is done so that we can potentially detect problems caused by | ||
478 | * delayed timers or bad hardware, which might result in time intervals that | ||
479 | * are larger than what the math used can handle without overflows. | ||
472 | */ | 480 | */ |
473 | u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask) | 481 | u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask, u64 *max_cyc) |
474 | { | 482 | { |
475 | u64 max_nsecs, max_cycles; | 483 | u64 max_nsecs, max_cycles; |
476 | 484 | ||
477 | /* | 485 | /* |
478 | * Calculate the maximum number of cycles that we can pass to the | 486 | * Calculate the maximum number of cycles that we can pass to the |
479 | * cyc2ns function without overflowing a 64-bit signed result. The | 487 | * cyc2ns() function without overflowing a 64-bit result. |
480 | * maximum number of cycles is equal to ULLONG_MAX/(mult+maxadj) | ||
481 | * which is equivalent to the below. | ||
482 | * max_cycles < (2^63)/(mult + maxadj) | ||
483 | * max_cycles < 2^(log2((2^63)/(mult + maxadj))) | ||
484 | * max_cycles < 2^(log2(2^63) - log2(mult + maxadj)) | ||
485 | * max_cycles < 2^(63 - log2(mult + maxadj)) | ||
486 | * max_cycles < 1 << (63 - log2(mult + maxadj)) | ||
487 | * Please note that we add 1 to the result of the log2 to account for | ||
488 | * any rounding errors, ensure the above inequality is satisfied and | ||
489 | * no overflow will occur. | ||
490 | */ | 488 | */ |
491 | max_cycles = 1ULL << (63 - (ilog2(mult + maxadj) + 1)); | 489 | max_cycles = ULLONG_MAX; |
490 | do_div(max_cycles, mult+maxadj); | ||
492 | 491 | ||
493 | /* | 492 | /* |
494 | * The actual maximum number of cycles we can defer the clocksource is | 493 | * The actual maximum number of cycles we can defer the clocksource is |
@@ -499,27 +498,26 @@ u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask) | |||
499 | max_cycles = min(max_cycles, mask); | 498 | max_cycles = min(max_cycles, mask); |
500 | max_nsecs = clocksource_cyc2ns(max_cycles, mult - maxadj, shift); | 499 | max_nsecs = clocksource_cyc2ns(max_cycles, mult - maxadj, shift); |
501 | 500 | ||
501 | /* return the max_cycles value as well if requested */ | ||
502 | if (max_cyc) | ||
503 | *max_cyc = max_cycles; | ||
504 | |||
505 | /* Return 50% of the actual maximum, so we can detect bad values */ | ||
506 | max_nsecs >>= 1; | ||
507 | |||
502 | return max_nsecs; | 508 | return max_nsecs; |
503 | } | 509 | } |
504 | 510 | ||
505 | /** | 511 | /** |
506 | * clocksource_max_deferment - Returns max time the clocksource can be deferred | 512 | * clocksource_update_max_deferment - Updates the clocksource max_idle_ns & max_cycles |
507 | * @cs: Pointer to clocksource | 513 | * @cs: Pointer to clocksource to be updated |
508 | * | 514 | * |
509 | */ | 515 | */ |
510 | static u64 clocksource_max_deferment(struct clocksource *cs) | 516 | static inline void clocksource_update_max_deferment(struct clocksource *cs) |
511 | { | 517 | { |
512 | u64 max_nsecs; | 518 | cs->max_idle_ns = clocks_calc_max_nsecs(cs->mult, cs->shift, |
513 | 519 | cs->maxadj, cs->mask, | |
514 | max_nsecs = clocks_calc_max_nsecs(cs->mult, cs->shift, cs->maxadj, | 520 | &cs->max_cycles); |
515 | cs->mask); | ||
516 | /* | ||
517 | * To ensure that the clocksource does not wrap whilst we are idle, | ||
518 | * limit the time the clocksource can be deferred by 12.5%. Please | ||
519 | * note a margin of 12.5% is used because this can be computed with | ||
520 | * a shift, versus say 10% which would require division. | ||
521 | */ | ||
522 | return max_nsecs - (max_nsecs >> 3); | ||
523 | } | 521 | } |
524 | 522 | ||
525 | #ifndef CONFIG_ARCH_USES_GETTIMEOFFSET | 523 | #ifndef CONFIG_ARCH_USES_GETTIMEOFFSET |
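The rewritten clocks_calc_max_nsecs() above replaces the power-of-two approximation with an exact division, clamps the result to the counter mask, converts using the smallest multiplier the NTP adjustment can produce, and hands back half of that as the usable limit. The standalone sketch below reproduces that arithmetic for a hypothetical 1 GHz counter (mult = 1 << 24, shift = 24, so one cycle is one nanosecond); the numbers are illustrative only:

    #include <stdio.h>
    #include <stdint.h>

    /* ns = (cycles * mult) >> shift, as in clocksource_cyc2ns() */
    static uint64_t cyc2ns(uint64_t cycles, uint32_t mult, uint32_t shift)
    {
        return (cycles * mult) >> shift;
    }

    static uint64_t max_nsecs(uint32_t mult, uint32_t shift, uint32_t maxadj,
                              uint64_t mask, uint64_t *max_cyc)
    {
        /* Exact limit: largest cycle count whose product with (mult + maxadj)
         * still fits in 64 bits. */
        uint64_t max_cycles = UINT64_MAX / (mult + maxadj);

        /* Never defer beyond the point where the counter itself wraps. */
        if (max_cycles > mask)
            max_cycles = mask;
        if (max_cyc)
            *max_cyc = max_cycles;

        /* Convert with the smallest adjusted multiplier and keep only 50%,
         * so delayed timers and bad hardware can still be detected. */
        return cyc2ns(max_cycles, mult - maxadj, shift) >> 1;
    }

    int main(void)
    {
        uint32_t mult = 1u << 24, shift = 24;        /* 1 GHz: 1 cycle == 1 ns */
        uint32_t maxadj = (uint32_t)((uint64_t)mult * 11 / 100);  /* ~11% NTP headroom */
        uint64_t max_cyc;
        uint64_t ns = max_nsecs(mult, shift, maxadj, (1ULL << 56) - 1, &max_cyc);

        printf("max_cycles=%llu, max_idle_ns=%llu (~%llu s)\n",
               (unsigned long long)max_cyc, (unsigned long long)ns,
               (unsigned long long)(ns / 1000000000ULL));
        return 0;
    }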
@@ -648,7 +646,7 @@ static void clocksource_enqueue(struct clocksource *cs) | |||
648 | } | 646 | } |
649 | 647 | ||
650 | /** | 648 | /** |
651 | * __clocksource_updatefreq_scale - Used to update clocksource with new freq | 649 | * __clocksource_update_freq_scale - Used to update clocksource with new freq |
652 | * @cs: clocksource to be registered | 650 | * @cs: clocksource to be registered |
653 | * @scale: Scale factor multiplied against freq to get clocksource hz | 651 | * @scale: Scale factor multiplied against freq to get clocksource hz |
654 | * @freq: clocksource frequency (cycles per second) divided by scale | 652 | * @freq: clocksource frequency (cycles per second) divided by scale |
@@ -656,48 +654,64 @@ static void clocksource_enqueue(struct clocksource *cs) | |||
656 | * This should only be called from the clocksource->enable() method. | 654 | * This should only be called from the clocksource->enable() method. |
657 | * | 655 | * |
658 | * This *SHOULD NOT* be called directly! Please use the | 656 | * This *SHOULD NOT* be called directly! Please use the |
659 | * clocksource_updatefreq_hz() or clocksource_updatefreq_khz helper functions. | 657 | * __clocksource_update_freq_hz() or __clocksource_update_freq_khz() helper |
658 | * functions. | ||
660 | */ | 659 | */ |
661 | void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) | 660 | void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq) |
662 | { | 661 | { |
663 | u64 sec; | 662 | u64 sec; |
663 | |||
664 | /* | 664 | /* |
665 | * Calc the maximum number of seconds which we can run before | 665 | * Default clocksources are *special* and self-define their mult/shift. |
666 | * wrapping around. For clocksources which have a mask > 32bit | 666 | * But, you're not special, so you should specify a freq value. |
667 | * we need to limit the max sleep time to have a good | ||
668 | * conversion precision. 10 minutes is still a reasonable | ||
669 | * amount. That results in a shift value of 24 for a | ||
670 | * clocksource with mask >= 40bit and f >= 4GHz. That maps to | ||
671 | * ~ 0.06ppm granularity for NTP. We apply the same 12.5% | ||
672 | * margin as we do in clocksource_max_deferment() | ||
673 | */ | 667 | */ |
674 | sec = (cs->mask - (cs->mask >> 3)); | 668 | if (freq) { |
675 | do_div(sec, freq); | 669 | /* |
676 | do_div(sec, scale); | 670 | * Calc the maximum number of seconds which we can run before |
677 | if (!sec) | 671 | * wrapping around. For clocksources which have a mask > 32-bit |
678 | sec = 1; | 672 | * we need to limit the max sleep time to have a good |
679 | else if (sec > 600 && cs->mask > UINT_MAX) | 673 | * conversion precision. 10 minutes is still a reasonable |
680 | sec = 600; | 674 | * amount. That results in a shift value of 24 for a |
681 | 675 | * clocksource with mask >= 40-bit and f >= 4GHz. That maps to | |
682 | clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, | 676 | * ~ 0.06ppm granularity for NTP. |
683 | NSEC_PER_SEC / scale, sec * scale); | 677 | */ |
684 | 678 | sec = cs->mask; | |
679 | do_div(sec, freq); | ||
680 | do_div(sec, scale); | ||
681 | if (!sec) | ||
682 | sec = 1; | ||
683 | else if (sec > 600 && cs->mask > UINT_MAX) | ||
684 | sec = 600; | ||
685 | |||
686 | clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, | ||
687 | NSEC_PER_SEC / scale, sec * scale); | ||
688 | } | ||
685 | /* | 689 | /* |
686 | * for clocksources that have large mults, to avoid overflow. | 690 | * Ensure clocksources that have large 'mult' values don't overflow |
687 | * Since mult may be adjusted by ntp, add an safety extra margin | 691 | * when adjusted. |
688 | * | ||
689 | */ | 692 | */ |
690 | cs->maxadj = clocksource_max_adjustment(cs); | 693 | cs->maxadj = clocksource_max_adjustment(cs); |
691 | while ((cs->mult + cs->maxadj < cs->mult) | 694 | while (freq && ((cs->mult + cs->maxadj < cs->mult) |
692 | || (cs->mult - cs->maxadj > cs->mult)) { | 695 | || (cs->mult - cs->maxadj > cs->mult))) { |
693 | cs->mult >>= 1; | 696 | cs->mult >>= 1; |
694 | cs->shift--; | 697 | cs->shift--; |
695 | cs->maxadj = clocksource_max_adjustment(cs); | 698 | cs->maxadj = clocksource_max_adjustment(cs); |
696 | } | 699 | } |
697 | 700 | ||
698 | cs->max_idle_ns = clocksource_max_deferment(cs); | 701 | /* |
702 | * Only warn for *special* clocksources that self-define | ||
703 | * their mult/shift values and don't specify a freq. | ||
704 | */ | ||
705 | WARN_ONCE(cs->mult + cs->maxadj < cs->mult, | ||
706 | "timekeeping: Clocksource %s might overflow on 11%% adjustment\n", | ||
707 | cs->name); | ||
708 | |||
709 | clocksource_update_max_deferment(cs); | ||
710 | |||
711 | pr_info("clocksource %s: mask: 0x%llx max_cycles: 0x%llx, max_idle_ns: %lld ns\n", | ||
712 | cs->name, cs->mask, cs->max_cycles, cs->max_idle_ns); | ||
699 | } | 713 | } |
700 | EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); | 714 | EXPORT_SYMBOL_GPL(__clocksource_update_freq_scale); |
701 | 715 | ||
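With the freq == 0 path folded in above, a driver whose clock rate can change only has to report the new frequency; the mult/shift recomputation, the maxadj clamping loop and the max_idle_ns/max_cycles update all happen in the helper. A hedged sketch of such an enable() callback (my_cs_enable() and my_get_rate() are made-up names; __clocksource_update_freq_hz() is the scale = 1 wrapper named in the comment above):

    #include <linux/clocksource.h>

    extern unsigned long my_get_rate(void);   /* hypothetical clk-framework query */

    /* Illustrative driver callback, not taken from the patch. */
    static int my_cs_enable(struct clocksource *cs)
    {
        unsigned long rate = my_get_rate();

        /* Recompute mult/shift and the deferment limits for the new rate. */
        __clocksource_update_freq_hz(cs, rate);
        return 0;
    }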
702 | /** | 716 | /** |
703 | * __clocksource_register_scale - Used to install new clocksources | 717 | * __clocksource_register_scale - Used to install new clocksources |
@@ -714,7 +728,7 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) | |||
714 | { | 728 | { |
715 | 729 | ||
716 | /* Initialize mult/shift and max_idle_ns */ | 730 | /* Initialize mult/shift and max_idle_ns */ |
717 | __clocksource_updatefreq_scale(cs, scale, freq); | 731 | __clocksource_update_freq_scale(cs, scale, freq); |
718 | 732 | ||
719 | /* Add clocksource to the clocksource list */ | 733 | /* Add clocksource to the clocksource list */ |
720 | mutex_lock(&clocksource_mutex); | 734 | mutex_lock(&clocksource_mutex); |
@@ -726,33 +740,6 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) | |||
726 | } | 740 | } |
727 | EXPORT_SYMBOL_GPL(__clocksource_register_scale); | 741 | EXPORT_SYMBOL_GPL(__clocksource_register_scale); |
728 | 742 | ||
729 | |||
730 | /** | ||
731 | * clocksource_register - Used to install new clocksources | ||
732 | * @cs: clocksource to be registered | ||
733 | * | ||
734 | * Returns -EBUSY if registration fails, zero otherwise. | ||
735 | */ | ||
736 | int clocksource_register(struct clocksource *cs) | ||
737 | { | ||
738 | /* calculate max adjustment for given mult/shift */ | ||
739 | cs->maxadj = clocksource_max_adjustment(cs); | ||
740 | WARN_ONCE(cs->mult + cs->maxadj < cs->mult, | ||
741 | "Clocksource %s might overflow on 11%% adjustment\n", | ||
742 | cs->name); | ||
743 | |||
744 | /* calculate max idle time permitted for this clocksource */ | ||
745 | cs->max_idle_ns = clocksource_max_deferment(cs); | ||
746 | |||
747 | mutex_lock(&clocksource_mutex); | ||
748 | clocksource_enqueue(cs); | ||
749 | clocksource_enqueue_watchdog(cs); | ||
750 | clocksource_select(); | ||
751 | mutex_unlock(&clocksource_mutex); | ||
752 | return 0; | ||
753 | } | ||
754 | EXPORT_SYMBOL(clocksource_register); | ||
755 | |||
756 | static void __clocksource_change_rating(struct clocksource *cs, int rating) | 743 | static void __clocksource_change_rating(struct clocksource *cs, int rating) |
757 | { | 744 | { |
758 | list_del(&cs->list); | 745 | list_del(&cs->list); |
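With clocksource_register() gone, every clocksource is expected to come in through __clocksource_register_scale(), directly or via the frequency wrappers, so the core always derives maxadj, max_idle_ns and max_cycles itself. A hedged sketch of what a driver-side registration then looks like (the my_timer_* names and the 24 MHz rate are made up; clocksource_register_hz() is assumed to wrap __clocksource_register_scale(cs, 1, hz)):

    #include <linux/clocksource.h>
    #include <linux/init.h>

    /* Hypothetical free-running 32-bit hardware counter. */
    static cycle_t my_timer_read(struct clocksource *cs)
    {
        return 0;   /* read the hardware counter register here */
    }

    static struct clocksource my_cs = {
        .name   = "my_timer",
        .rating = 300,
        .read   = my_timer_read,
        .mask   = CLOCKSOURCE_MASK(32),
        .flags  = CLOCK_SOURCE_IS_CONTINUOUS,
    };

    static int __init my_timer_init(void)
    {
        /* mult/shift, maxadj, max_idle_ns and max_cycles are all derived
         * from the frequency by the core. */
        return clocksource_register_hz(&my_cs, 24000000);
    }
    device_initcall(my_timer_init);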
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index bee0c1f78091..76d4bd962b19 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c | |||
@@ -54,7 +54,7 @@ | |||
54 | 54 | ||
55 | #include <trace/events/timer.h> | 55 | #include <trace/events/timer.h> |
56 | 56 | ||
57 | #include "timekeeping.h" | 57 | #include "tick-internal.h" |
58 | 58 | ||
59 | /* | 59 | /* |
60 | * The timer bases: | 60 | * The timer bases: |
@@ -1707,17 +1707,10 @@ static int hrtimer_cpu_notify(struct notifier_block *self, | |||
1707 | break; | 1707 | break; |
1708 | 1708 | ||
1709 | #ifdef CONFIG_HOTPLUG_CPU | 1709 | #ifdef CONFIG_HOTPLUG_CPU |
1710 | case CPU_DYING: | ||
1711 | case CPU_DYING_FROZEN: | ||
1712 | clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DYING, &scpu); | ||
1713 | break; | ||
1714 | case CPU_DEAD: | 1710 | case CPU_DEAD: |
1715 | case CPU_DEAD_FROZEN: | 1711 | case CPU_DEAD_FROZEN: |
1716 | { | ||
1717 | clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &scpu); | ||
1718 | migrate_hrtimers(scpu); | 1712 | migrate_hrtimers(scpu); |
1719 | break; | 1713 | break; |
1720 | } | ||
1721 | #endif | 1714 | #endif |
1722 | 1715 | ||
1723 | default: | 1716 | default: |
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index a6a5bf53e86d..347fecf86a3f 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c | |||
@@ -25,7 +25,7 @@ | |||
25 | #include <linux/module.h> | 25 | #include <linux/module.h> |
26 | #include <linux/init.h> | 26 | #include <linux/init.h> |
27 | 27 | ||
28 | #include "tick-internal.h" | 28 | #include "timekeeping.h" |
29 | 29 | ||
30 | /* The Jiffies based clocksource is the lowest common | 30 | /* The Jiffies based clocksource is the lowest common |
31 | * denominator clock source which should function on | 31 | * denominator clock source which should function on |
@@ -71,6 +71,7 @@ static struct clocksource clocksource_jiffies = { | |||
71 | .mask = 0xffffffff, /*32bits*/ | 71 | .mask = 0xffffffff, /*32bits*/ |
72 | .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ | 72 | .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ |
73 | .shift = JIFFIES_SHIFT, | 73 | .shift = JIFFIES_SHIFT, |
74 | .max_cycles = 10, | ||
74 | }; | 75 | }; |
75 | 76 | ||
76 | __cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock); | 77 | __cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock); |
@@ -94,7 +95,7 @@ EXPORT_SYMBOL(jiffies); | |||
94 | 95 | ||
95 | static int __init init_jiffies_clocksource(void) | 96 | static int __init init_jiffies_clocksource(void) |
96 | { | 97 | { |
97 | return clocksource_register(&clocksource_jiffies); | 98 | return __clocksource_register(&clocksource_jiffies); |
98 | } | 99 | } |
99 | 100 | ||
100 | core_initcall(init_jiffies_clocksource); | 101 | core_initcall(init_jiffies_clocksource); |
@@ -130,6 +131,6 @@ int register_refined_jiffies(long cycles_per_second) | |||
130 | 131 | ||
131 | refined_jiffies.mult = ((u32)nsec_per_tick) << JIFFIES_SHIFT; | 132 | refined_jiffies.mult = ((u32)nsec_per_tick) << JIFFIES_SHIFT; |
132 | 133 | ||
133 | clocksource_register(&refined_jiffies); | 134 | __clocksource_register(&refined_jiffies); |
134 | return 0; | 135 | return 0; |
135 | } | 136 | } |
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 0f60b08a4f07..7a681003001c 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c | |||
@@ -17,7 +17,6 @@ | |||
17 | #include <linux/module.h> | 17 | #include <linux/module.h> |
18 | #include <linux/rtc.h> | 18 | #include <linux/rtc.h> |
19 | 19 | ||
20 | #include "tick-internal.h" | ||
21 | #include "ntp_internal.h" | 20 | #include "ntp_internal.h" |
22 | 21 | ||
23 | /* | 22 | /* |
@@ -459,6 +458,16 @@ out: | |||
459 | return leap; | 458 | return leap; |
460 | } | 459 | } |
461 | 460 | ||
461 | #ifdef CONFIG_GENERIC_CMOS_UPDATE | ||
462 | int __weak update_persistent_clock64(struct timespec64 now64) | ||
463 | { | ||
464 | struct timespec now; | ||
465 | |||
466 | now = timespec64_to_timespec(now64); | ||
467 | return update_persistent_clock(now); | ||
468 | } | ||
469 | #endif | ||
470 | |||
462 | #if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) | 471 | #if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) |
463 | static void sync_cmos_clock(struct work_struct *work); | 472 | static void sync_cmos_clock(struct work_struct *work); |
464 | 473 | ||
@@ -494,8 +503,9 @@ static void sync_cmos_clock(struct work_struct *work) | |||
494 | if (persistent_clock_is_local) | 503 | if (persistent_clock_is_local) |
495 | adjust.tv_sec -= (sys_tz.tz_minuteswest * 60); | 504 | adjust.tv_sec -= (sys_tz.tz_minuteswest * 60); |
496 | #ifdef CONFIG_GENERIC_CMOS_UPDATE | 505 | #ifdef CONFIG_GENERIC_CMOS_UPDATE |
497 | fail = update_persistent_clock(timespec64_to_timespec(adjust)); | 506 | fail = update_persistent_clock64(adjust); |
498 | #endif | 507 | #endif |
508 | |||
499 | #ifdef CONFIG_RTC_SYSTOHC | 509 | #ifdef CONFIG_RTC_SYSTOHC |
500 | if (fail == -ENODEV) | 510 | if (fail == -ENODEV) |
501 | fail = rtc_set_ntp_time(adjust); | 511 | fail = rtc_set_ntp_time(adjust); |
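The weak update_persistent_clock64() added above keeps old architectures working by converting back to a 32-bit timespec and calling the legacy update_persistent_clock(). An architecture with a year-2038-safe RTC path can provide its own strong definition so the conversion disappears; a hedged sketch (my_rtc_write_secs() is a stand-in for whatever the platform actually uses):

    #include <linux/time64.h>

    extern int my_rtc_write_secs(time64_t secs);   /* hypothetical RTC helper */

    /* Hypothetical strong override of the weak helper in ntp.c. */
    int update_persistent_clock64(struct timespec64 now)
    {
        /* Most RTCs store whole seconds; the caller already lines 'now'
         * up close to a second boundary before getting here. */
        return my_rtc_write_secs(now.tv_sec);
    }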
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 01d2d15aa662..a26036d37a38 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c | |||
@@ -1,5 +1,6 @@ | |||
1 | /* | 1 | /* |
2 | * sched_clock.c: support for extending counters to full 64-bit ns counter | 2 | * sched_clock.c: Generic sched_clock() support, to extend low level |
3 | * hardware time counters to full 64-bit ns values. | ||
3 | * | 4 | * |
4 | * This program is free software; you can redistribute it and/or modify | 5 | * This program is free software; you can redistribute it and/or modify |
5 | * it under the terms of the GNU General Public License version 2 as | 6 | * it under the terms of the GNU General Public License version 2 as |
@@ -18,15 +19,53 @@ | |||
18 | #include <linux/seqlock.h> | 19 | #include <linux/seqlock.h> |
19 | #include <linux/bitops.h> | 20 | #include <linux/bitops.h> |
20 | 21 | ||
21 | struct clock_data { | 22 | /** |
22 | ktime_t wrap_kt; | 23 | * struct clock_read_data - data required to read from sched_clock() |
24 | * | ||
25 | * @epoch_ns: sched_clock() value at last update | ||
26 | * @epoch_cyc: Clock cycle value at last update. | ||
27 | * @sched_clock_mask: Bitmask for two's complement subtraction of non 64bit | ||
28 | * clocks. | ||
29 | * @read_sched_clock: Current clock source (or dummy source when suspended). | ||
30 | * @mult: Multiplier for scaled math conversion. | ||
31 | * @shift: Shift value for scaled math conversion. | ||
32 | * | ||
33 | * Care must be taken when updating this structure; it is read by | ||
34 | * some very hot code paths. It occupies <=40 bytes and, when combined | ||
35 | * with the seqcount used to synchronize access, comfortably fits into | ||
36 | * a 64 byte cache line. | ||
37 | */ | ||
38 | struct clock_read_data { | ||
23 | u64 epoch_ns; | 39 | u64 epoch_ns; |
24 | u64 epoch_cyc; | 40 | u64 epoch_cyc; |
25 | seqcount_t seq; | 41 | u64 sched_clock_mask; |
26 | unsigned long rate; | 42 | u64 (*read_sched_clock)(void); |
27 | u32 mult; | 43 | u32 mult; |
28 | u32 shift; | 44 | u32 shift; |
29 | bool suspended; | 45 | }; |
46 | |||
47 | /** | ||
48 | * struct clock_data - all data needed for sched_clock() (including | ||
49 | * registration of a new clock source) | ||
50 | * | ||
51 | * @seq: Sequence counter for protecting updates. The lowest | ||
52 | * bit is the index for @read_data. | ||
53 | * @read_data: Data required to read from sched_clock. | ||
54 | * @wrap_kt: Duration for which clock can run before wrapping. | ||
55 | * @rate: Tick rate of the registered clock. | ||
56 | * @actual_read_sched_clock: Registered hardware level clock read function. | ||
57 | * | ||
58 | * The ordering of this structure has been chosen to optimize cache | ||
59 | * performance. In particular 'seq' and 'read_data[0]' (combined) should fit | ||
60 | * into a single 64-byte cache line. | ||
61 | */ | ||
62 | struct clock_data { | ||
63 | seqcount_t seq; | ||
64 | struct clock_read_data read_data[2]; | ||
65 | ktime_t wrap_kt; | ||
66 | unsigned long rate; | ||
67 | |||
68 | u64 (*actual_read_sched_clock)(void); | ||
30 | }; | 69 | }; |
31 | 70 | ||
32 | static struct hrtimer sched_clock_timer; | 71 | static struct hrtimer sched_clock_timer; |
@@ -34,12 +73,6 @@ static int irqtime = -1; | |||
34 | 73 | ||
35 | core_param(irqtime, irqtime, int, 0400); | 74 | core_param(irqtime, irqtime, int, 0400); |
36 | 75 | ||
37 | static struct clock_data cd = { | ||
38 | .mult = NSEC_PER_SEC / HZ, | ||
39 | }; | ||
40 | |||
41 | static u64 __read_mostly sched_clock_mask; | ||
42 | |||
43 | static u64 notrace jiffy_sched_clock_read(void) | 76 | static u64 notrace jiffy_sched_clock_read(void) |
44 | { | 77 | { |
45 | /* | 78 | /* |
@@ -49,7 +82,11 @@ static u64 notrace jiffy_sched_clock_read(void) | |||
49 | return (u64)(jiffies - INITIAL_JIFFIES); | 82 | return (u64)(jiffies - INITIAL_JIFFIES); |
50 | } | 83 | } |
51 | 84 | ||
52 | static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read; | 85 | static struct clock_data cd ____cacheline_aligned = { |
86 | .read_data[0] = { .mult = NSEC_PER_SEC / HZ, | ||
87 | .read_sched_clock = jiffy_sched_clock_read, }, | ||
88 | .actual_read_sched_clock = jiffy_sched_clock_read, | ||
89 | }; | ||
53 | 90 | ||
54 | static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) | 91 | static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) |
55 | { | 92 | { |
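cyc_to_ns(), whose definition opens at the end of the hunk above, is the whole conversion story: ns = (cyc * mult) >> shift. The default clock_read_data initializer sets mult = NSEC_PER_SEC / HZ with an implicit shift of 0, so each jiffy advances sched_clock() by one tick period. A small standalone illustration (HZ = 100 and the 24 MHz mult/shift pair are assumptions):

    #include <stdio.h>
    #include <stdint.h>

    /* Same scaled-math conversion as cyc_to_ns() in sched_clock.c. */
    static uint64_t cyc_to_ns(uint64_t cyc, uint32_t mult, uint32_t shift)
    {
        return (cyc * mult) >> shift;
    }

    int main(void)
    {
        /* Jiffy fallback with HZ = 100: mult = 10000000, shift = 0,
         * so one jiffy is exactly 10 ms. */
        printf("%llu ns per jiffy\n",
               (unsigned long long)cyc_to_ns(1, 10000000, 0));

        /* A 24 MHz counter with mult = 2796202667, shift = 26 gives
         * ~41.67 ns per cycle. */
        printf("%llu ns per 1000 cycles\n",
               (unsigned long long)cyc_to_ns(1000, 2796202667u, 26));
        return 0;
    }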
@@ -58,111 +95,136 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) | |||
58 | 95 | ||
59 | unsigned long long notrace sched_clock(void) | 96 | unsigned long long notrace sched_clock(void) |
60 | { | 97 | { |
61 | u64 epoch_ns; | 98 | u64 cyc, res; |
62 | u64 epoch_cyc; | ||
63 | u64 cyc; | ||
64 | unsigned long seq; | 99 | unsigned long seq; |
65 | 100 | struct clock_read_data *rd; | |
66 | if (cd.suspended) | ||
67 | return cd.epoch_ns; | ||
68 | 101 | ||
69 | do { | 102 | do { |
70 | seq = raw_read_seqcount_begin(&cd.seq); | 103 | seq = raw_read_seqcount(&cd.seq); |
71 | epoch_cyc = cd.epoch_cyc; | 104 | rd = cd.read_data + (seq & 1); |
72 | epoch_ns = cd.epoch_ns; | 105 | |
106 | cyc = (rd->read_sched_clock() - rd->epoch_cyc) & | ||
107 | rd->sched_clock_mask; | ||
108 | res = rd->epoch_ns + cyc_to_ns(cyc, rd->mult, rd->shift); | ||
73 | } while (read_seqcount_retry(&cd.seq, seq)); | 109 | } while (read_seqcount_retry(&cd.seq, seq)); |
74 | 110 | ||
75 | cyc = read_sched_clock(); | 111 | return res; |
76 | cyc = (cyc - epoch_cyc) & sched_clock_mask; | 112 | } |
77 | return epoch_ns + cyc_to_ns(cyc, cd.mult, cd.shift); | 113 | |
114 | /* | ||
115 | * Updating the data required to read the clock. | ||
116 | * | ||
117 | * sched_clock() will never observe mis-matched data even if called from | ||
118 | * an NMI. We do this by maintaining an odd/even copy of the data and | ||
119 | * steering sched_clock() to one or the other using a sequence counter. | ||
120 | * In order to preserve the data cache profile of sched_clock() as much | ||
121 | * as possible the system reverts back to the even copy when the update | ||
122 | * completes; the odd copy is used *only* during an update. | ||
123 | */ | ||
124 | static void update_clock_read_data(struct clock_read_data *rd) | ||
125 | { | ||
126 | /* update the backup (odd) copy with the new data */ | ||
127 | cd.read_data[1] = *rd; | ||
128 | |||
129 | /* steer readers towards the odd copy */ | ||
130 | raw_write_seqcount_latch(&cd.seq); | ||
131 | |||
132 | /* now it's safe for us to update the normal (even) copy */ | ||
133 | cd.read_data[0] = *rd; | ||
134 | |||
135 | /* switch readers back to the even copy */ | ||
136 | raw_write_seqcount_latch(&cd.seq); | ||
78 | } | 137 | } |
79 | 138 | ||
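update_clock_read_data() above is a textbook seqcount latch: the writer publishes into the odd copy, bumps the sequence, refreshes the even copy and bumps it again, while readers index read_data[seq & 1] and retry if the sequence moved under them. The userspace sketch below shows the same shape with C11 atomics standing in for raw_write_seqcount_latch()/raw_read_seqcount(); it deliberately glosses over the memory-model fine print those kernel primitives take care of:

    #include <stdatomic.h>
    #include <stdint.h>

    struct payload { uint64_t epoch_ns; uint64_t epoch_cyc; };

    static _Atomic unsigned int seq;
    static struct payload data[2];

    /* Writer side: mirrors update_clock_read_data(). */
    static void latch_publish(const struct payload *p)
    {
        data[1] = *p;                                              /* backup (odd) copy  */
        atomic_fetch_add_explicit(&seq, 1, memory_order_release);  /* steer readers odd  */
        data[0] = *p;                                              /* normal (even) copy */
        atomic_fetch_add_explicit(&seq, 1, memory_order_release);  /* back to even       */
    }

    /* Reader side: mirrors the loop in sched_clock(); a changed sequence
     * forces a retry, so readers never see a half-updated copy. */
    static struct payload latch_read(void)
    {
        struct payload p;
        unsigned int s;

        do {
            s = atomic_load_explicit(&seq, memory_order_acquire);
            p = data[s & 1];
        } while (atomic_load_explicit(&seq, memory_order_acquire) != s);

        return p;
    }

    int main(void)
    {
        struct payload p = { .epoch_ns = 123, .epoch_cyc = 456 };

        latch_publish(&p);
        p = latch_read();
        return (int)(p.epoch_ns - 123);   /* 0 on success */
    }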
80 | /* | 139 | /* |
81 | * Atomically update the sched_clock epoch. | 140 | * Atomically update the sched_clock() epoch. |
82 | */ | 141 | */ |
83 | static void notrace update_sched_clock(void) | 142 | static void update_sched_clock(void) |
84 | { | 143 | { |
85 | unsigned long flags; | ||
86 | u64 cyc; | 144 | u64 cyc; |
87 | u64 ns; | 145 | u64 ns; |
146 | struct clock_read_data rd; | ||
147 | |||
148 | rd = cd.read_data[0]; | ||
149 | |||
150 | cyc = cd.actual_read_sched_clock(); | ||
151 | ns = rd.epoch_ns + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, rd.mult, rd.shift); | ||
152 | |||
153 | rd.epoch_ns = ns; | ||
154 | rd.epoch_cyc = cyc; | ||
88 | 155 | ||
89 | cyc = read_sched_clock(); | 156 | update_clock_read_data(&rd); |
90 | ns = cd.epoch_ns + | ||
91 | cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, | ||
92 | cd.mult, cd.shift); | ||
93 | |||
94 | raw_local_irq_save(flags); | ||
95 | raw_write_seqcount_begin(&cd.seq); | ||
96 | cd.epoch_ns = ns; | ||
97 | cd.epoch_cyc = cyc; | ||
98 | raw_write_seqcount_end(&cd.seq); | ||
99 | raw_local_irq_restore(flags); | ||
100 | } | 157 | } |
101 | 158 | ||
102 | static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt) | 159 | static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt) |
103 | { | 160 | { |
104 | update_sched_clock(); | 161 | update_sched_clock(); |
105 | hrtimer_forward_now(hrt, cd.wrap_kt); | 162 | hrtimer_forward_now(hrt, cd.wrap_kt); |
163 | |||
106 | return HRTIMER_RESTART; | 164 | return HRTIMER_RESTART; |
107 | } | 165 | } |
108 | 166 | ||
109 | void __init sched_clock_register(u64 (*read)(void), int bits, | 167 | void __init |
110 | unsigned long rate) | 168 | sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) |
111 | { | 169 | { |
112 | u64 res, wrap, new_mask, new_epoch, cyc, ns; | 170 | u64 res, wrap, new_mask, new_epoch, cyc, ns; |
113 | u32 new_mult, new_shift; | 171 | u32 new_mult, new_shift; |
114 | ktime_t new_wrap_kt; | ||
115 | unsigned long r; | 172 | unsigned long r; |
116 | char r_unit; | 173 | char r_unit; |
174 | struct clock_read_data rd; | ||
117 | 175 | ||
118 | if (cd.rate > rate) | 176 | if (cd.rate > rate) |
119 | return; | 177 | return; |
120 | 178 | ||
121 | WARN_ON(!irqs_disabled()); | 179 | WARN_ON(!irqs_disabled()); |
122 | 180 | ||
123 | /* calculate the mult/shift to convert counter ticks to ns. */ | 181 | /* Calculate the mult/shift to convert counter ticks to ns. */ |
124 | clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600); | 182 | clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600); |
125 | 183 | ||
126 | new_mask = CLOCKSOURCE_MASK(bits); | 184 | new_mask = CLOCKSOURCE_MASK(bits); |
185 | cd.rate = rate; | ||
186 | |||
187 | /* Calculate how many nanosecs until we risk wrapping */ | ||
188 | wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask, NULL); | ||
189 | cd.wrap_kt = ns_to_ktime(wrap); | ||
127 | 190 | ||
128 | /* calculate how many ns until we wrap */ | 191 | rd = cd.read_data[0]; |
129 | wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask); | ||
130 | new_wrap_kt = ns_to_ktime(wrap - (wrap >> 3)); | ||
131 | 192 | ||
132 | /* update epoch for new counter and update epoch_ns from old counter*/ | 193 | /* Update epoch for new counter and update 'epoch_ns' from old counter*/ |
133 | new_epoch = read(); | 194 | new_epoch = read(); |
134 | cyc = read_sched_clock(); | 195 | cyc = cd.actual_read_sched_clock(); |
135 | ns = cd.epoch_ns + cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, | 196 | ns = rd.epoch_ns + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, rd.mult, rd.shift); |
136 | cd.mult, cd.shift); | 197 | cd.actual_read_sched_clock = read; |
137 | 198 | ||
138 | raw_write_seqcount_begin(&cd.seq); | 199 | rd.read_sched_clock = read; |
139 | read_sched_clock = read; | 200 | rd.sched_clock_mask = new_mask; |
140 | sched_clock_mask = new_mask; | 201 | rd.mult = new_mult; |
141 | cd.rate = rate; | 202 | rd.shift = new_shift; |
142 | cd.wrap_kt = new_wrap_kt; | 203 | rd.epoch_cyc = new_epoch; |
143 | cd.mult = new_mult; | 204 | rd.epoch_ns = ns; |
144 | cd.shift = new_shift; | 205 | |
145 | cd.epoch_cyc = new_epoch; | 206 | update_clock_read_data(&rd); |
146 | cd.epoch_ns = ns; | ||
147 | raw_write_seqcount_end(&cd.seq); | ||
148 | 207 | ||
149 | r = rate; | 208 | r = rate; |
150 | if (r >= 4000000) { | 209 | if (r >= 4000000) { |
151 | r /= 1000000; | 210 | r /= 1000000; |
152 | r_unit = 'M'; | 211 | r_unit = 'M'; |
153 | } else if (r >= 1000) { | 212 | } else { |
154 | r /= 1000; | 213 | if (r >= 1000) { |
155 | r_unit = 'k'; | 214 | r /= 1000; |
156 | } else | 215 | r_unit = 'k'; |
157 | r_unit = ' '; | 216 | } else { |
158 | 217 | r_unit = ' '; | |
159 | /* calculate the ns resolution of this counter */ | 218 | } |
219 | } | ||
220 | |||
221 | /* Calculate the ns resolution of this counter */ | ||
160 | res = cyc_to_ns(1ULL, new_mult, new_shift); | 222 | res = cyc_to_ns(1ULL, new_mult, new_shift); |
161 | 223 | ||
162 | pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n", | 224 | pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n", |
163 | bits, r, r_unit, res, wrap); | 225 | bits, r, r_unit, res, wrap); |
164 | 226 | ||
165 | /* Enable IRQ time accounting if we have a fast enough sched_clock */ | 227 | /* Enable IRQ time accounting if we have a fast enough sched_clock() */ |
166 | if (irqtime > 0 || (irqtime == -1 && rate >= 1000000)) | 228 | if (irqtime > 0 || (irqtime == -1 && rate >= 1000000)) |
167 | enable_sched_clock_irqtime(); | 229 | enable_sched_clock_irqtime(); |
168 | 230 | ||
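After the rework, a platform timer driver still only hands sched_clock_register() a read callback, the counter width and the rate; the epoch carry-over, wrap time and latched read_data update all happen in the code above. A hedged sketch of such a caller (the register layout and the 24 MHz rate are invented):

    #include <linux/sched_clock.h>
    #include <linux/io.h>
    #include <linux/init.h>

    static void __iomem *my_counter_base;   /* assumed to be ioremap()ed earlier */

    static u64 notrace my_sched_clock_read(void)
    {
        /* Free-running 32-bit up-counter. */
        return readl_relaxed(my_counter_base);
    }

    static void __init my_timer_init(void)
    {
        /* 32 bits wide, ticking at 24 MHz; must be called with IRQs disabled. */
        sched_clock_register(my_sched_clock_read, 32, 24000000);
    }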
@@ -172,10 +234,10 @@ void __init sched_clock_register(u64 (*read)(void), int bits, | |||
172 | void __init sched_clock_postinit(void) | 234 | void __init sched_clock_postinit(void) |
173 | { | 235 | { |
174 | /* | 236 | /* |
175 | * If no sched_clock function has been provided at that point, | 237 | * If no sched_clock() function has been provided at that point, |
176 | * make it the final one. | 238 | * make it the final one. |
177 | */ | 239 | */ |
178 | if (read_sched_clock == jiffy_sched_clock_read) | 240 | if (cd.actual_read_sched_clock == jiffy_sched_clock_read) |
179 | sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ); | 241 | sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ); |
180 | 242 | ||
181 | update_sched_clock(); | 243 | update_sched_clock(); |
@@ -189,29 +251,53 @@ void __init sched_clock_postinit(void) | |||
189 | hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); | 251 | hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); |
190 | } | 252 | } |
191 | 253 | ||
254 | /* | ||
255 | * Clock read function for use when the clock is suspended. | ||
256 | * | ||
257 | * This function makes it appear to sched_clock() as if the clock | ||
258 | * stopped counting at its last update. | ||
259 | * | ||
260 | * This function must only be called from the critical | ||
261 | * section in sched_clock(). It relies on the read_seqcount_retry() | ||
262 | * at the end of the critical section to be sure we observe the | ||
263 | * correct copy of 'epoch_cyc'. | ||
264 | */ | ||
265 | static u64 notrace suspended_sched_clock_read(void) | ||
266 | { | ||
267 | unsigned long seq = raw_read_seqcount(&cd.seq); | ||
268 | |||
269 | return cd.read_data[seq & 1].epoch_cyc; | ||
270 | } | ||
271 | |||
192 | static int sched_clock_suspend(void) | 272 | static int sched_clock_suspend(void) |
193 | { | 273 | { |
274 | struct clock_read_data *rd = &cd.read_data[0]; | ||
275 | |||
194 | update_sched_clock(); | 276 | update_sched_clock(); |
195 | hrtimer_cancel(&sched_clock_timer); | 277 | hrtimer_cancel(&sched_clock_timer); |
196 | cd.suspended = true; | 278 | rd->read_sched_clock = suspended_sched_clock_read; |
279 | |||
197 | return 0; | 280 | return 0; |
198 | } | 281 | } |
199 | 282 | ||
200 | static void sched_clock_resume(void) | 283 | static void sched_clock_resume(void) |
201 | { | 284 | { |
202 | cd.epoch_cyc = read_sched_clock(); | 285 | struct clock_read_data *rd = &cd.read_data[0]; |
286 | |||
287 | rd->epoch_cyc = cd.actual_read_sched_clock(); | ||
203 | hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); | 288 | hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); |
204 | cd.suspended = false; | 289 | rd->read_sched_clock = cd.actual_read_sched_clock; |
205 | } | 290 | } |
206 | 291 | ||
207 | static struct syscore_ops sched_clock_ops = { | 292 | static struct syscore_ops sched_clock_ops = { |
208 | .suspend = sched_clock_suspend, | 293 | .suspend = sched_clock_suspend, |
209 | .resume = sched_clock_resume, | 294 | .resume = sched_clock_resume, |
210 | }; | 295 | }; |
211 | 296 | ||
212 | static int __init sched_clock_syscore_init(void) | 297 | static int __init sched_clock_syscore_init(void) |
213 | { | 298 | { |
214 | register_syscore_ops(&sched_clock_ops); | 299 | register_syscore_ops(&sched_clock_ops); |
300 | |||
215 | return 0; | 301 | return 0; |
216 | } | 302 | } |
217 | device_initcall(sched_clock_syscore_init); | 303 | device_initcall(sched_clock_syscore_init); |
diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c index eb682d5c697c..6aac4beedbbe 100644 --- a/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/time/tick-broadcast-hrtimer.c | |||
@@ -49,6 +49,7 @@ static void bc_set_mode(enum clock_event_mode mode, | |||
49 | */ | 49 | */ |
50 | static int bc_set_next(ktime_t expires, struct clock_event_device *bc) | 50 | static int bc_set_next(ktime_t expires, struct clock_event_device *bc) |
51 | { | 51 | { |
52 | int bc_moved; | ||
52 | /* | 53 | /* |
53 | * We try to cancel the timer first. If the callback is on | 54 | * We try to cancel the timer first. If the callback is on |
54 | * flight on some other cpu then we let it handle it. If we | 55 | * flight on some other cpu then we let it handle it. If we |
@@ -60,9 +61,15 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc) | |||
60 | * restart the timer because we are in the callback, but we | 61 | * restart the timer because we are in the callback, but we |
61 | * can set the expiry time and let the callback return | 62 | * can set the expiry time and let the callback return |
62 | * HRTIMER_RESTART. | 63 | * HRTIMER_RESTART. |
64 | * | ||
65 | * Since we are in the idle loop at this point and because | ||
66 | * hrtimer_{start/cancel} functions call into tracing, | ||
67 | * calls to these functions must be bound within RCU_NONIDLE. | ||
63 | */ | 68 | */ |
64 | if (hrtimer_try_to_cancel(&bctimer) >= 0) { | 69 | RCU_NONIDLE(bc_moved = (hrtimer_try_to_cancel(&bctimer) >= 0) ? |
65 | hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED); | 70 | !hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED) : |
71 | 0); | ||
72 | if (bc_moved) { | ||
66 | /* Bind the "device" to the cpu */ | 73 | /* Bind the "device" to the cpu */ |
67 | bc->bound_on = smp_processor_id(); | 74 | bc->bound_on = smp_processor_id(); |
68 | } else if (bc->bound_on == smp_processor_id()) { | 75 | } else if (bc->bound_on == smp_processor_id()) { |
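The bc_set_next() change above is a specific instance of a general rule: code running on the idle path, where RCU considers the CPU quiescent, must wrap anything that can reach tracing (hrtimer start/cancel here) in RCU_NONIDLE(). A hedged sketch of the pattern (do_traced_work() is a placeholder for any such call):

    #include <linux/rcupdate.h>

    extern int do_traced_work(void);   /* hypothetical call that may use RCU */

    /* Hypothetical helper invoked from the idle loop. */
    static int idle_side_helper(void)
    {
        int ret;

        /* Tell RCU this CPU is momentarily non-idle for the duration of
         * the statement, so RCU read-side sections inside it are legal. */
        RCU_NONIDLE(ret = do_traced_work());

        return ret;
    }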
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 066f0ec05e48..7e8ca4f448a8 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c | |||
@@ -33,12 +33,14 @@ static cpumask_var_t tick_broadcast_mask; | |||
33 | static cpumask_var_t tick_broadcast_on; | 33 | static cpumask_var_t tick_broadcast_on; |
34 | static cpumask_var_t tmpmask; | 34 | static cpumask_var_t tmpmask; |
35 | static DEFINE_RAW_SPINLOCK(tick_broadcast_lock); | 35 | static DEFINE_RAW_SPINLOCK(tick_broadcast_lock); |
36 | static int tick_broadcast_force; | 36 | static int tick_broadcast_forced; |
37 | 37 | ||
38 | #ifdef CONFIG_TICK_ONESHOT | 38 | #ifdef CONFIG_TICK_ONESHOT |
39 | static void tick_broadcast_clear_oneshot(int cpu); | 39 | static void tick_broadcast_clear_oneshot(int cpu); |
40 | static void tick_resume_broadcast_oneshot(struct clock_event_device *bc); | ||
40 | #else | 41 | #else |
41 | static inline void tick_broadcast_clear_oneshot(int cpu) { } | 42 | static inline void tick_broadcast_clear_oneshot(int cpu) { } |
43 | static inline void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { } | ||
42 | #endif | 44 | #endif |
43 | 45 | ||
44 | /* | 46 | /* |
@@ -303,7 +305,7 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev) | |||
303 | /* | 305 | /* |
304 | * The device is in periodic mode. No reprogramming necessary: | 306 | * The device is in periodic mode. No reprogramming necessary: |
305 | */ | 307 | */ |
306 | if (dev->mode == CLOCK_EVT_MODE_PERIODIC) | 308 | if (dev->state == CLOCK_EVT_STATE_PERIODIC) |
307 | goto unlock; | 309 | goto unlock; |
308 | 310 | ||
309 | /* | 311 | /* |
@@ -324,49 +326,54 @@ unlock: | |||
324 | raw_spin_unlock(&tick_broadcast_lock); | 326 | raw_spin_unlock(&tick_broadcast_lock); |
325 | } | 327 | } |
326 | 328 | ||
327 | /* | 329 | /** |
328 | * Powerstate information: The system enters/leaves a state, where | 330 | * tick_broadcast_control - Enable/disable or force broadcast mode |
329 | * affected devices might stop | 331 | * @mode: The selected broadcast mode |
332 | * | ||
333 | * Called when the system enters a state where affected tick devices | ||
334 | * might stop. Note: TICK_BROADCAST_FORCE cannot be undone. | ||
335 | * | ||
336 | * Called with interrupts disabled, so clockevents_lock is not | ||
337 | * required here because the local clock event device cannot go away | ||
338 | * under us. | ||
330 | */ | 339 | */ |
331 | static void tick_do_broadcast_on_off(unsigned long *reason) | 340 | void tick_broadcast_control(enum tick_broadcast_mode mode) |
332 | { | 341 | { |
333 | struct clock_event_device *bc, *dev; | 342 | struct clock_event_device *bc, *dev; |
334 | struct tick_device *td; | 343 | struct tick_device *td; |
335 | unsigned long flags; | ||
336 | int cpu, bc_stopped; | 344 | int cpu, bc_stopped; |
337 | 345 | ||
338 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); | 346 | td = this_cpu_ptr(&tick_cpu_device); |
339 | |||
340 | cpu = smp_processor_id(); | ||
341 | td = &per_cpu(tick_cpu_device, cpu); | ||
342 | dev = td->evtdev; | 347 | dev = td->evtdev; |
343 | bc = tick_broadcast_device.evtdev; | ||
344 | 348 | ||
345 | /* | 349 | /* |
346 | * Is the device not affected by the powerstate ? | 350 | * Is the device not affected by the powerstate ? |
347 | */ | 351 | */ |
348 | if (!dev || !(dev->features & CLOCK_EVT_FEAT_C3STOP)) | 352 | if (!dev || !(dev->features & CLOCK_EVT_FEAT_C3STOP)) |
349 | goto out; | 353 | return; |
350 | 354 | ||
351 | if (!tick_device_is_functional(dev)) | 355 | if (!tick_device_is_functional(dev)) |
352 | goto out; | 356 | return; |
353 | 357 | ||
358 | raw_spin_lock(&tick_broadcast_lock); | ||
359 | cpu = smp_processor_id(); | ||
360 | bc = tick_broadcast_device.evtdev; | ||
354 | bc_stopped = cpumask_empty(tick_broadcast_mask); | 361 | bc_stopped = cpumask_empty(tick_broadcast_mask); |
355 | 362 | ||
356 | switch (*reason) { | 363 | switch (mode) { |
357 | case CLOCK_EVT_NOTIFY_BROADCAST_ON: | 364 | case TICK_BROADCAST_FORCE: |
358 | case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: | 365 | tick_broadcast_forced = 1; |
366 | case TICK_BROADCAST_ON: | ||
359 | cpumask_set_cpu(cpu, tick_broadcast_on); | 367 | cpumask_set_cpu(cpu, tick_broadcast_on); |
360 | if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) { | 368 | if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) { |
361 | if (tick_broadcast_device.mode == | 369 | if (tick_broadcast_device.mode == |
362 | TICKDEV_MODE_PERIODIC) | 370 | TICKDEV_MODE_PERIODIC) |
363 | clockevents_shutdown(dev); | 371 | clockevents_shutdown(dev); |
364 | } | 372 | } |
365 | if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE) | ||
366 | tick_broadcast_force = 1; | ||
367 | break; | 373 | break; |
368 | case CLOCK_EVT_NOTIFY_BROADCAST_OFF: | 374 | |
369 | if (tick_broadcast_force) | 375 | case TICK_BROADCAST_OFF: |
376 | if (tick_broadcast_forced) | ||
370 | break; | 377 | break; |
371 | cpumask_clear_cpu(cpu, tick_broadcast_on); | 378 | cpumask_clear_cpu(cpu, tick_broadcast_on); |
372 | if (!tick_device_is_functional(dev)) | 379 | if (!tick_device_is_functional(dev)) |
@@ -388,22 +395,9 @@ static void tick_do_broadcast_on_off(unsigned long *reason) | |||
388 | else | 395 | else |
389 | tick_broadcast_setup_oneshot(bc); | 396 | tick_broadcast_setup_oneshot(bc); |
390 | } | 397 | } |
391 | out: | 398 | raw_spin_unlock(&tick_broadcast_lock); |
392 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | ||
393 | } | ||
394 | |||
395 | /* | ||
396 | * Powerstate information: The system enters/leaves a state, where | ||
397 | * affected devices might stop. | ||
398 | */ | ||
399 | void tick_broadcast_on_off(unsigned long reason, int *oncpu) | ||
400 | { | ||
401 | if (!cpumask_test_cpu(*oncpu, cpu_online_mask)) | ||
402 | printk(KERN_ERR "tick-broadcast: ignoring broadcast for " | ||
403 | "offline CPU #%d\n", *oncpu); | ||
404 | else | ||
405 | tick_do_broadcast_on_off(&reason); | ||
406 | } | 399 | } |
400 | EXPORT_SYMBOL_GPL(tick_broadcast_control); | ||
407 | 401 | ||
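Exporting tick_broadcast_control() replaces the old pattern of funnelling CLOCK_EVT_NOTIFY_BROADCAST_ON/OFF/FORCE reason codes through a notifier-style call; callers now pass the enum directly, with interrupts disabled. A hedged sketch of a driver whose per-CPU timer stops in deep C-states (the my_local_timer_* names are made up):

    #include <linux/tick.h>

    /* Hypothetical per-CPU timer setup/teardown after the API change;
     * previously this went through clockevents_notify() with a
     * CLOCK_EVT_NOTIFY_BROADCAST_* reason code. */
    static void my_local_timer_setup(void)
    {
        /* This CPU's timer stops in deep idle: join the broadcast scheme. */
        tick_broadcast_control(TICK_BROADCAST_ON);
    }

    static void my_local_timer_teardown(void)
    {
        tick_broadcast_control(TICK_BROADCAST_OFF);
    }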
408 | /* | 402 | /* |
409 | * Set the periodic handler depending on broadcast on/off | 403 | * Set the periodic handler depending on broadcast on/off |
@@ -416,14 +410,14 @@ void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast) | |||
416 | dev->event_handler = tick_handle_periodic_broadcast; | 410 | dev->event_handler = tick_handle_periodic_broadcast; |
417 | } | 411 | } |
418 | 412 | ||
413 | #ifdef CONFIG_HOTPLUG_CPU | ||
419 | /* | 414 | /* |
420 | * Remove a CPU from broadcasting | 415 | * Remove a CPU from broadcasting |
421 | */ | 416 | */ |
422 | void tick_shutdown_broadcast(unsigned int *cpup) | 417 | void tick_shutdown_broadcast(unsigned int cpu) |
423 | { | 418 | { |
424 | struct clock_event_device *bc; | 419 | struct clock_event_device *bc; |
425 | unsigned long flags; | 420 | unsigned long flags; |
426 | unsigned int cpu = *cpup; | ||
427 | 421 | ||
428 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); | 422 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); |
429 | 423 | ||
@@ -438,6 +432,7 @@ void tick_shutdown_broadcast(unsigned int *cpup) | |||
438 | 432 | ||
439 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | 433 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); |
440 | } | 434 | } |
435 | #endif | ||
441 | 436 | ||
442 | void tick_suspend_broadcast(void) | 437 | void tick_suspend_broadcast(void) |
443 | { | 438 | { |
@@ -453,38 +448,48 @@ void tick_suspend_broadcast(void) | |||
453 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | 448 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); |
454 | } | 449 | } |
455 | 450 | ||
456 | int tick_resume_broadcast(void) | 451 | /* |
452 | * This is called from tick_resume_local() on a resuming CPU. That's | ||
453 | * called from the core resume function, tick_unfreeze() and the magic XEN | ||
454 | * resume hackery. | ||
455 | * | ||
456 | * In none of these cases the broadcast device mode can change and the | ||
457 | * bit of the resuming CPU in the broadcast mask is safe as well. | ||
458 | */ | ||
459 | bool tick_resume_check_broadcast(void) | ||
460 | { | ||
461 | if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT) | ||
462 | return false; | ||
463 | else | ||
464 | return cpumask_test_cpu(smp_processor_id(), tick_broadcast_mask); | ||
465 | } | ||
466 | |||
467 | void tick_resume_broadcast(void) | ||
457 | { | 468 | { |
458 | struct clock_event_device *bc; | 469 | struct clock_event_device *bc; |
459 | unsigned long flags; | 470 | unsigned long flags; |
460 | int broadcast = 0; | ||
461 | 471 | ||
462 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); | 472 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); |
463 | 473 | ||
464 | bc = tick_broadcast_device.evtdev; | 474 | bc = tick_broadcast_device.evtdev; |
465 | 475 | ||
466 | if (bc) { | 476 | if (bc) { |
467 | clockevents_set_mode(bc, CLOCK_EVT_MODE_RESUME); | 477 | clockevents_tick_resume(bc); |
468 | 478 | ||
469 | switch (tick_broadcast_device.mode) { | 479 | switch (tick_broadcast_device.mode) { |
470 | case TICKDEV_MODE_PERIODIC: | 480 | case TICKDEV_MODE_PERIODIC: |
471 | if (!cpumask_empty(tick_broadcast_mask)) | 481 | if (!cpumask_empty(tick_broadcast_mask)) |
472 | tick_broadcast_start_periodic(bc); | 482 | tick_broadcast_start_periodic(bc); |
473 | broadcast = cpumask_test_cpu(smp_processor_id(), | ||
474 | tick_broadcast_mask); | ||
475 | break; | 483 | break; |
476 | case TICKDEV_MODE_ONESHOT: | 484 | case TICKDEV_MODE_ONESHOT: |
477 | if (!cpumask_empty(tick_broadcast_mask)) | 485 | if (!cpumask_empty(tick_broadcast_mask)) |
478 | broadcast = tick_resume_broadcast_oneshot(bc); | 486 | tick_resume_broadcast_oneshot(bc); |
479 | break; | 487 | break; |
480 | } | 488 | } |
481 | } | 489 | } |
482 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | 490 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); |
483 | |||
484 | return broadcast; | ||
485 | } | 491 | } |
486 | 492 | ||
487 | |||
488 | #ifdef CONFIG_TICK_ONESHOT | 493 | #ifdef CONFIG_TICK_ONESHOT |
489 | 494 | ||
490 | static cpumask_var_t tick_broadcast_oneshot_mask; | 495 | static cpumask_var_t tick_broadcast_oneshot_mask; |
@@ -532,8 +537,8 @@ static int tick_broadcast_set_event(struct clock_event_device *bc, int cpu, | |||
532 | { | 537 | { |
533 | int ret; | 538 | int ret; |
534 | 539 | ||
535 | if (bc->mode != CLOCK_EVT_MODE_ONESHOT) | 540 | if (bc->state != CLOCK_EVT_STATE_ONESHOT) |
536 | clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); | 541 | clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT); |
537 | 542 | ||
538 | ret = clockevents_program_event(bc, expires, force); | 543 | ret = clockevents_program_event(bc, expires, force); |
539 | if (!ret) | 544 | if (!ret) |
@@ -541,10 +546,9 @@ static int tick_broadcast_set_event(struct clock_event_device *bc, int cpu, | |||
541 | return ret; | 546 | return ret; |
542 | } | 547 | } |
543 | 548 | ||
544 | int tick_resume_broadcast_oneshot(struct clock_event_device *bc) | 549 | static void tick_resume_broadcast_oneshot(struct clock_event_device *bc) |
545 | { | 550 | { |
546 | clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); | 551 | clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT); |
547 | return 0; | ||
548 | } | 552 | } |
549 | 553 | ||
550 | /* | 554 | /* |
@@ -562,8 +566,8 @@ void tick_check_oneshot_broadcast_this_cpu(void) | |||
562 | * switched over, leave the device alone. | 566 | * switched over, leave the device alone. |
563 | */ | 567 | */ |
564 | if (td->mode == TICKDEV_MODE_ONESHOT) { | 568 | if (td->mode == TICKDEV_MODE_ONESHOT) { |
565 | clockevents_set_mode(td->evtdev, | 569 | clockevents_set_state(td->evtdev, |
566 | CLOCK_EVT_MODE_ONESHOT); | 570 | CLOCK_EVT_STATE_ONESHOT); |
567 | } | 571 | } |
568 | } | 572 | } |
569 | } | 573 | } |
@@ -666,31 +670,26 @@ static void broadcast_shutdown_local(struct clock_event_device *bc, | |||
666 | if (dev->next_event.tv64 < bc->next_event.tv64) | 670 | if (dev->next_event.tv64 < bc->next_event.tv64) |
667 | return; | 671 | return; |
668 | } | 672 | } |
669 | clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); | 673 | clockevents_set_state(dev, CLOCK_EVT_STATE_SHUTDOWN); |
670 | } | 674 | } |
671 | 675 | ||
672 | static void broadcast_move_bc(int deadcpu) | 676 | /** |
673 | { | 677 | * tick_broadcast_oneshot_control - Enter/exit broadcast oneshot mode |
674 | struct clock_event_device *bc = tick_broadcast_device.evtdev; | 678 | * @state: The target state (enter/exit) |
675 | 679 | * | |
676 | if (!bc || !broadcast_needs_cpu(bc, deadcpu)) | 680 | * The system enters/leaves a state, where affected devices might stop |
677 | return; | ||
678 | /* This moves the broadcast assignment to this cpu */ | ||
679 | clockevents_program_event(bc, bc->next_event, 1); | ||
680 | } | ||
681 | |||
682 | /* | ||
683 | * Powerstate information: The system enters/leaves a state, where | ||
684 | * affected devices might stop | ||
685 | * Returns 0 on success, -EBUSY if the cpu is used to broadcast wakeups. | 681 | * Returns 0 on success, -EBUSY if the cpu is used to broadcast wakeups. |
682 | * | ||
683 | * Called with interrupts disabled, so clockevents_lock is not | ||
684 | * required here because the local clock event device cannot go away | ||
685 | * under us. | ||
686 | */ | 686 | */ |
687 | int tick_broadcast_oneshot_control(unsigned long reason) | 687 | int tick_broadcast_oneshot_control(enum tick_broadcast_state state) |
688 | { | 688 | { |
689 | struct clock_event_device *bc, *dev; | 689 | struct clock_event_device *bc, *dev; |
690 | struct tick_device *td; | 690 | struct tick_device *td; |
691 | unsigned long flags; | ||
692 | ktime_t now; | ||
693 | int cpu, ret = 0; | 691 | int cpu, ret = 0; |
692 | ktime_t now; | ||
694 | 693 | ||
695 | /* | 694 | /* |
696 | * Periodic mode does not care about the enter/exit of power | 695 | * Periodic mode does not care about the enter/exit of power |
@@ -703,17 +702,17 @@ int tick_broadcast_oneshot_control(unsigned long reason) | |||
703 | * We are called with preemption disabled from the depth of the | 702 | * We are called with preemption disabled from the depth of the |
704 | * idle code, so we can't be moved away. | 703 | * idle code, so we can't be moved away. |
705 | */ | 704 | */ |
706 | cpu = smp_processor_id(); | 705 | td = this_cpu_ptr(&tick_cpu_device); |
707 | td = &per_cpu(tick_cpu_device, cpu); | ||
708 | dev = td->evtdev; | 706 | dev = td->evtdev; |
709 | 707 | ||
710 | if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) | 708 | if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) |
711 | return 0; | 709 | return 0; |
712 | 710 | ||
711 | raw_spin_lock(&tick_broadcast_lock); | ||
713 | bc = tick_broadcast_device.evtdev; | 712 | bc = tick_broadcast_device.evtdev; |
713 | cpu = smp_processor_id(); | ||
714 | 714 | ||
715 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); | 715 | if (state == TICK_BROADCAST_ENTER) { |
716 | if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { | ||
717 | if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) { | 716 | if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) { |
718 | WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask)); | 717 | WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask)); |
719 | broadcast_shutdown_local(bc, dev); | 718 | broadcast_shutdown_local(bc, dev); |
@@ -741,7 +740,7 @@ int tick_broadcast_oneshot_control(unsigned long reason) | |||
741 | cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); | 740 | cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); |
742 | } else { | 741 | } else { |
743 | if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) { | 742 | if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) { |
744 | clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); | 743 | clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT); |
745 | /* | 744 | /* |
746 | * The cpu which was handling the broadcast | 745 | * The cpu which was handling the broadcast |
747 | * timer marked this cpu in the broadcast | 746 | * timer marked this cpu in the broadcast |
@@ -805,9 +804,10 @@ int tick_broadcast_oneshot_control(unsigned long reason) | |||
805 | } | 804 | } |
806 | } | 805 | } |
807 | out: | 806 | out: |
808 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | 807 | raw_spin_unlock(&tick_broadcast_lock); |
809 | return ret; | 808 | return ret; |
810 | } | 809 | } |
810 | EXPORT_SYMBOL_GPL(tick_broadcast_oneshot_control); | ||
811 | 811 | ||
812 | /* | 812 | /* |
813 | * Reset the one shot broadcast for a cpu | 813 | * Reset the one shot broadcast for a cpu |
@@ -842,7 +842,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) | |||
842 | 842 | ||
843 | /* Set it up only once ! */ | 843 | /* Set it up only once ! */ |
844 | if (bc->event_handler != tick_handle_oneshot_broadcast) { | 844 | if (bc->event_handler != tick_handle_oneshot_broadcast) { |
845 | int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC; | 845 | int was_periodic = bc->state == CLOCK_EVT_STATE_PERIODIC; |
846 | 846 | ||
847 | bc->event_handler = tick_handle_oneshot_broadcast; | 847 | bc->event_handler = tick_handle_oneshot_broadcast; |
848 | 848 | ||
@@ -858,7 +858,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) | |||
858 | tick_broadcast_oneshot_mask, tmpmask); | 858 | tick_broadcast_oneshot_mask, tmpmask); |
859 | 859 | ||
860 | if (was_periodic && !cpumask_empty(tmpmask)) { | 860 | if (was_periodic && !cpumask_empty(tmpmask)) { |
861 | clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); | 861 | clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT); |
862 | tick_broadcast_init_next_event(tmpmask, | 862 | tick_broadcast_init_next_event(tmpmask, |
863 | tick_next_period); | 863 | tick_next_period); |
864 | tick_broadcast_set_event(bc, cpu, tick_next_period, 1); | 864 | tick_broadcast_set_event(bc, cpu, tick_next_period, 1); |
@@ -894,14 +894,28 @@ void tick_broadcast_switch_to_oneshot(void) | |||
894 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | 894 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); |
895 | } | 895 | } |
896 | 896 | ||
897 | #ifdef CONFIG_HOTPLUG_CPU | ||
898 | void hotplug_cpu__broadcast_tick_pull(int deadcpu) | ||
899 | { | ||
900 | struct clock_event_device *bc; | ||
901 | unsigned long flags; | ||
902 | |||
903 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); | ||
904 | bc = tick_broadcast_device.evtdev; | ||
905 | |||
906 | if (bc && broadcast_needs_cpu(bc, deadcpu)) { | ||
907 | /* This moves the broadcast assignment to this CPU: */ | ||
908 | clockevents_program_event(bc, bc->next_event, 1); | ||
909 | } | ||
910 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | ||
911 | } | ||
897 | 912 | ||
898 | /* | 913 | /* |
899 | * Remove a dead CPU from broadcasting | 914 | * Remove a dead CPU from broadcasting |
900 | */ | 915 | */ |
901 | void tick_shutdown_broadcast_oneshot(unsigned int *cpup) | 916 | void tick_shutdown_broadcast_oneshot(unsigned int cpu) |
902 | { | 917 | { |
903 | unsigned long flags; | 918 | unsigned long flags; |
904 | unsigned int cpu = *cpup; | ||
905 | 919 | ||
906 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); | 920 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); |
907 | 921 | ||
@@ -913,10 +927,9 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup) | |||
913 | cpumask_clear_cpu(cpu, tick_broadcast_pending_mask); | 927 | cpumask_clear_cpu(cpu, tick_broadcast_pending_mask); |
914 | cpumask_clear_cpu(cpu, tick_broadcast_force_mask); | 928 | cpumask_clear_cpu(cpu, tick_broadcast_force_mask); |
915 | 929 | ||
916 | broadcast_move_bc(cpu); | ||
917 | |||
918 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | 930 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); |
919 | } | 931 | } |
932 | #endif | ||
920 | 933 | ||
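Since broadcast_move_bc() no longer runs inside tick_shutdown_broadcast_oneshot(), the new hotplug_cpu__broadcast_tick_pull() has to be called explicitly from the CPU-hotplug teardown path on a surviving CPU. The ordering sketched below is an assumption about how such a caller fits together, not code from this patch:

    #include "tick-internal.h"   /* sketch assumes a caller inside kernel/time/ */

    /* Hedged sketch of a CPU_DEAD-time cleanup running on a surviving CPU. */
    static void my_cpu_dead_cleanup(unsigned int dead_cpu)
    {
        /* Pull the broadcast device/hrtimer assignment over to this CPU... */
        hotplug_cpu__broadcast_tick_pull(dead_cpu);

        /* ...then drop the dead CPU from the broadcast bookkeeping. */
        tick_shutdown_broadcast_oneshot(dead_cpu);
        tick_shutdown_broadcast(dead_cpu);
    }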
921 | /* | 934 | /* |
922 | * Check, whether the broadcast device is in one shot mode | 935 | * Check, whether the broadcast device is in one shot mode |
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index f7c515595b42..3ae6afa1eb98 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c | |||
@@ -102,7 +102,7 @@ void tick_handle_periodic(struct clock_event_device *dev) | |||
102 | 102 | ||
103 | tick_periodic(cpu); | 103 | tick_periodic(cpu); |
104 | 104 | ||
105 | if (dev->mode != CLOCK_EVT_MODE_ONESHOT) | 105 | if (dev->state != CLOCK_EVT_STATE_ONESHOT) |
106 | return; | 106 | return; |
107 | for (;;) { | 107 | for (;;) { |
108 | /* | 108 | /* |
@@ -140,7 +140,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) | |||
140 | 140 | ||
141 | if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) && | 141 | if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) && |
142 | !tick_broadcast_oneshot_active()) { | 142 | !tick_broadcast_oneshot_active()) { |
143 | clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC); | 143 | clockevents_set_state(dev, CLOCK_EVT_STATE_PERIODIC); |
144 | } else { | 144 | } else { |
145 | unsigned long seq; | 145 | unsigned long seq; |
146 | ktime_t next; | 146 | ktime_t next; |
@@ -150,7 +150,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) | |||
150 | next = tick_next_period; | 150 | next = tick_next_period; |
151 | } while (read_seqretry(&jiffies_lock, seq)); | 151 | } while (read_seqretry(&jiffies_lock, seq)); |
152 | 152 | ||
153 | clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); | 153 | clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT); |
154 | 154 | ||
155 | for (;;) { | 155 | for (;;) { |
156 | if (!clockevents_program_event(dev, next, false)) | 156 | if (!clockevents_program_event(dev, next, false)) |
@@ -332,14 +332,16 @@ out_bc: | |||
332 | tick_install_broadcast_device(newdev); | 332 | tick_install_broadcast_device(newdev); |
333 | } | 333 | } |
334 | 334 | ||
335 | #ifdef CONFIG_HOTPLUG_CPU | ||
335 | /* | 336 | /* |
336 | * Transfer the do_timer job away from a dying cpu. | 337 | * Transfer the do_timer job away from a dying cpu. |
337 | * | 338 | * |
338 | * Called with interrupts disabled. | 339 | * Called with interrupts disabled. Not locking required. If |
340 | * tick_do_timer_cpu is owned by this cpu, nothing can change it. | ||
339 | */ | 341 | */ |
340 | void tick_handover_do_timer(int *cpup) | 342 | void tick_handover_do_timer(void) |
341 | { | 343 | { |
342 | if (*cpup == tick_do_timer_cpu) { | 344 | if (tick_do_timer_cpu == smp_processor_id()) { |
343 | int cpu = cpumask_first(cpu_online_mask); | 345 | int cpu = cpumask_first(cpu_online_mask); |
344 | 346 | ||
345 | tick_do_timer_cpu = (cpu < nr_cpu_ids) ? cpu : | 347 | tick_do_timer_cpu = (cpu < nr_cpu_ids) ? cpu : |
@@ -354,9 +356,9 @@ void tick_handover_do_timer(int *cpup) | |||
354 | * access the hardware device itself. | 356 | * access the hardware device itself. |
355 | * We just set the mode and remove it from the lists. | 357 | * We just set the mode and remove it from the lists. |
356 | */ | 358 | */ |
357 | void tick_shutdown(unsigned int *cpup) | 359 | void tick_shutdown(unsigned int cpu) |
358 | { | 360 | { |
359 | struct tick_device *td = &per_cpu(tick_cpu_device, *cpup); | 361 | struct tick_device *td = &per_cpu(tick_cpu_device, cpu); |
360 | struct clock_event_device *dev = td->evtdev; | 362 | struct clock_event_device *dev = td->evtdev; |
361 | 363 | ||
362 | td->mode = TICKDEV_MODE_PERIODIC; | 364 | td->mode = TICKDEV_MODE_PERIODIC; |
@@ -365,27 +367,42 @@ void tick_shutdown(unsigned int *cpup) | |||
365 | * Prevent that the clock events layer tries to call | 367 | * Prevent that the clock events layer tries to call |
366 | * the set mode function! | 368 | * the set mode function! |
367 | */ | 369 | */ |
370 | dev->state = CLOCK_EVT_STATE_DETACHED; | ||
368 | dev->mode = CLOCK_EVT_MODE_UNUSED; | 371 | dev->mode = CLOCK_EVT_MODE_UNUSED; |
369 | clockevents_exchange_device(dev, NULL); | 372 | clockevents_exchange_device(dev, NULL); |
370 | dev->event_handler = clockevents_handle_noop; | 373 | dev->event_handler = clockevents_handle_noop; |
371 | td->evtdev = NULL; | 374 | td->evtdev = NULL; |
372 | } | 375 | } |
373 | } | 376 | } |
377 | #endif | ||
374 | 378 | ||
375 | void tick_suspend(void) | 379 | /** |
380 | * tick_suspend_local - Suspend the local tick device | ||
381 | * | ||
382 | * Called from the local cpu for freeze with interrupts disabled. | ||
383 | * | ||
384 | * No locks required. Nothing can change the per cpu device. | ||
385 | */ | ||
386 | void tick_suspend_local(void) | ||
376 | { | 387 | { |
377 | struct tick_device *td = this_cpu_ptr(&tick_cpu_device); | 388 | struct tick_device *td = this_cpu_ptr(&tick_cpu_device); |
378 | 389 | ||
379 | clockevents_shutdown(td->evtdev); | 390 | clockevents_shutdown(td->evtdev); |
380 | } | 391 | } |
381 | 392 | ||
382 | void tick_resume(void) | 393 | /** |
394 | * tick_resume_local - Resume the local tick device | ||
395 | * | ||
396 | * Called from the local CPU for unfreeze or XEN resume magic. | ||
397 | * | ||
398 | * No locks required. Nothing can change the per cpu device. | ||
399 | */ | ||
400 | void tick_resume_local(void) | ||
383 | { | 401 | { |
384 | struct tick_device *td = this_cpu_ptr(&tick_cpu_device); | 402 | struct tick_device *td = this_cpu_ptr(&tick_cpu_device); |
385 | int broadcast = tick_resume_broadcast(); | 403 | bool broadcast = tick_resume_check_broadcast(); |
386 | |||
387 | clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME); | ||
388 | 404 | ||
405 | clockevents_tick_resume(td->evtdev); | ||
389 | if (!broadcast) { | 406 | if (!broadcast) { |
390 | if (td->mode == TICKDEV_MODE_PERIODIC) | 407 | if (td->mode == TICKDEV_MODE_PERIODIC) |
391 | tick_setup_periodic(td->evtdev, 0); | 408 | tick_setup_periodic(td->evtdev, 0); |
@@ -394,6 +411,35 @@ void tick_resume(void) | |||
394 | } | 411 | } |
395 | } | 412 | } |
396 | 413 | ||
414 | /** | ||
415 | * tick_suspend - Suspend the tick and the broadcast device | ||
416 | * | ||
417 | * Called from syscore_suspend() via timekeeping_suspend with only one | ||
418 | * CPU online and interrupts disabled or from tick_unfreeze() under | ||
419 | * tick_freeze_lock. | ||
420 | * | ||
421 | * No locks required. Nothing can change the per cpu device. | ||
422 | */ | ||
423 | void tick_suspend(void) | ||
424 | { | ||
425 | tick_suspend_local(); | ||
426 | tick_suspend_broadcast(); | ||
427 | } | ||
428 | |||
429 | /** | ||
430 | * tick_resume - Resume the tick and the broadcast device | ||
431 | * | ||
432 | * Called from syscore_resume() via timekeeping_resume with only one | ||
433 | * CPU online and interrupts disabled. | ||
434 | * | ||
435 | * No locks required. Nothing can change the per cpu device. | ||
436 | */ | ||
437 | void tick_resume(void) | ||
438 | { | ||
439 | tick_resume_broadcast(); | ||
440 | tick_resume_local(); | ||
441 | } | ||
442 | |||
397 | static DEFINE_RAW_SPINLOCK(tick_freeze_lock); | 443 | static DEFINE_RAW_SPINLOCK(tick_freeze_lock); |
398 | static unsigned int tick_freeze_depth; | 444 | static unsigned int tick_freeze_depth; |
399 | 445 | ||
@@ -411,12 +457,10 @@ void tick_freeze(void) | |||
411 | raw_spin_lock(&tick_freeze_lock); | 457 | raw_spin_lock(&tick_freeze_lock); |
412 | 458 | ||
413 | tick_freeze_depth++; | 459 | tick_freeze_depth++; |
414 | if (tick_freeze_depth == num_online_cpus()) { | 460 | if (tick_freeze_depth == num_online_cpus()) |
415 | timekeeping_suspend(); | 461 | timekeeping_suspend(); |
416 | } else { | 462 | else |
417 | tick_suspend(); | 463 | tick_suspend_local(); |
418 | tick_suspend_broadcast(); | ||
419 | } | ||
420 | 464 | ||
421 | raw_spin_unlock(&tick_freeze_lock); | 465 | raw_spin_unlock(&tick_freeze_lock); |
422 | } | 466 | } |
@@ -437,7 +481,7 @@ void tick_unfreeze(void) | |||
437 | if (tick_freeze_depth == num_online_cpus()) | 481 | if (tick_freeze_depth == num_online_cpus()) |
438 | timekeeping_resume(); | 482 | timekeeping_resume(); |
439 | else | 483 | else |
440 | tick_resume(); | 484 | tick_resume_local(); |
441 | 485 | ||
442 | tick_freeze_depth--; | 486 | tick_freeze_depth--; |
443 | 487 | ||
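
The reworked tick_freeze()/tick_unfreeze() pair counts frozen CPUs: only the last CPU to freeze suspends timekeeping, and the first CPU to thaw resumes it, while every other CPU only touches its local tick device. A minimal standalone sketch of that depth-counting idea follows; the names and stubs are illustrative, not the kernel implementation.

/* Standalone model of the freeze-depth logic (illustrative only).
 * num_cpus stands in for num_online_cpus(); the stubs replace the
 * real timekeeping_suspend()/timekeeping_resume() and tick helpers.
 */
#include <stdio.h>

static unsigned int freeze_depth;
static const unsigned int num_cpus = 4;

static void suspend_local_tick(unsigned int cpu) { printf("cpu%u: local tick off\n", cpu); }
static void resume_local_tick(unsigned int cpu)  { printf("cpu%u: local tick on\n", cpu); }
static void suspend_timekeeping(void) { printf("timekeeping suspended\n"); }
static void resume_timekeeping(void)  { printf("timekeeping resumed\n"); }

static void freeze_cpu(unsigned int cpu)
{
        /* the real code holds tick_freeze_lock around this */
        if (++freeze_depth == num_cpus)
                suspend_timekeeping();          /* last CPU also stops timekeeping */
        else
                suspend_local_tick(cpu);        /* others only stop their own tick */
}

static void unfreeze_cpu(unsigned int cpu)
{
        if (freeze_depth == num_cpus)
                resume_timekeeping();           /* first CPU to thaw restarts timekeeping */
        else
                resume_local_tick(cpu);
        freeze_depth--;
}

int main(void)
{
        for (unsigned int cpu = 0; cpu < num_cpus; cpu++)
                freeze_cpu(cpu);
        for (unsigned int cpu = num_cpus; cpu-- > 0; )
                unfreeze_cpu(cpu);
        return 0;
}
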
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 366aeb4f2c66..b64fdd8054c5 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h | |||
@@ -5,15 +5,12 @@ | |||
5 | #include <linux/tick.h> | 5 | #include <linux/tick.h> |
6 | 6 | ||
7 | #include "timekeeping.h" | 7 | #include "timekeeping.h" |
8 | #include "tick-sched.h" | ||
8 | 9 | ||
9 | extern seqlock_t jiffies_lock; | 10 | #ifdef CONFIG_GENERIC_CLOCKEVENTS |
10 | 11 | ||
11 | #define CS_NAME_LEN 32 | 12 | # define TICK_DO_TIMER_NONE -1 |
12 | 13 | # define TICK_DO_TIMER_BOOT -2 | |
13 | #ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD | ||
14 | |||
15 | #define TICK_DO_TIMER_NONE -1 | ||
16 | #define TICK_DO_TIMER_BOOT -2 | ||
17 | 14 | ||
18 | DECLARE_PER_CPU(struct tick_device, tick_cpu_device); | 15 | DECLARE_PER_CPU(struct tick_device, tick_cpu_device); |
19 | extern ktime_t tick_next_period; | 16 | extern ktime_t tick_next_period; |
@@ -23,21 +20,72 @@ extern int tick_do_timer_cpu __read_mostly; | |||
23 | extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); | 20 | extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); |
24 | extern void tick_handle_periodic(struct clock_event_device *dev); | 21 | extern void tick_handle_periodic(struct clock_event_device *dev); |
25 | extern void tick_check_new_device(struct clock_event_device *dev); | 22 | extern void tick_check_new_device(struct clock_event_device *dev); |
26 | extern void tick_handover_do_timer(int *cpup); | 23 | extern void tick_shutdown(unsigned int cpu); |
27 | extern void tick_shutdown(unsigned int *cpup); | ||
28 | extern void tick_suspend(void); | 24 | extern void tick_suspend(void); |
29 | extern void tick_resume(void); | 25 | extern void tick_resume(void); |
30 | extern bool tick_check_replacement(struct clock_event_device *curdev, | 26 | extern bool tick_check_replacement(struct clock_event_device *curdev, |
31 | struct clock_event_device *newdev); | 27 | struct clock_event_device *newdev); |
32 | extern void tick_install_replacement(struct clock_event_device *dev); | 28 | extern void tick_install_replacement(struct clock_event_device *dev); |
29 | extern int tick_is_oneshot_available(void); | ||
30 | extern struct tick_device *tick_get_device(int cpu); | ||
33 | 31 | ||
34 | extern void clockevents_shutdown(struct clock_event_device *dev); | 32 | extern int clockevents_tick_resume(struct clock_event_device *dev); |
33 | /* Check, if the device is functional or a dummy for broadcast */ | ||
34 | static inline int tick_device_is_functional(struct clock_event_device *dev) | ||
35 | { | ||
36 | return !(dev->features & CLOCK_EVT_FEAT_DUMMY); | ||
37 | } | ||
35 | 38 | ||
39 | extern void clockevents_shutdown(struct clock_event_device *dev); | ||
40 | extern void clockevents_exchange_device(struct clock_event_device *old, | ||
41 | struct clock_event_device *new); | ||
42 | extern void clockevents_set_state(struct clock_event_device *dev, | ||
43 | enum clock_event_state state); | ||
44 | extern int clockevents_program_event(struct clock_event_device *dev, | ||
45 | ktime_t expires, bool force); | ||
46 | extern void clockevents_handle_noop(struct clock_event_device *dev); | ||
47 | extern int __clockevents_update_freq(struct clock_event_device *dev, u32 freq); | ||
36 | extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); | 48 | extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); |
37 | 49 | ||
38 | /* | 50 | /* Broadcasting support */ |
39 | * NO_HZ / high resolution timer shared code | 51 | # ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST |
40 | */ | 52 | extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); |
53 | extern void tick_install_broadcast_device(struct clock_event_device *dev); | ||
54 | extern int tick_is_broadcast_device(struct clock_event_device *dev); | ||
55 | extern void tick_shutdown_broadcast(unsigned int cpu); | ||
56 | extern void tick_suspend_broadcast(void); | ||
57 | extern void tick_resume_broadcast(void); | ||
58 | extern bool tick_resume_check_broadcast(void); | ||
59 | extern void tick_broadcast_init(void); | ||
60 | extern void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast); | ||
61 | extern int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq); | ||
62 | extern struct tick_device *tick_get_broadcast_device(void); | ||
63 | extern struct cpumask *tick_get_broadcast_mask(void); | ||
64 | # else /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST: */ | ||
65 | static inline void tick_install_broadcast_device(struct clock_event_device *dev) { } | ||
66 | static inline int tick_is_broadcast_device(struct clock_event_device *dev) { return 0; } | ||
67 | static inline int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) { return 0; } | ||
68 | static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { } | ||
69 | static inline void tick_shutdown_broadcast(unsigned int cpu) { } | ||
70 | static inline void tick_suspend_broadcast(void) { } | ||
71 | static inline void tick_resume_broadcast(void) { } | ||
72 | static inline bool tick_resume_check_broadcast(void) { return false; } | ||
73 | static inline void tick_broadcast_init(void) { } | ||
74 | static inline int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq) { return -ENODEV; } | ||
75 | |||
76 | /* Set the periodic handler in non broadcast mode */ | ||
77 | static inline void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast) | ||
78 | { | ||
79 | dev->event_handler = tick_handle_periodic; | ||
80 | } | ||
81 | # endif /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST */ | ||
82 | |||
83 | #else /* !GENERIC_CLOCKEVENTS: */ | ||
84 | static inline void tick_suspend(void) { } | ||
85 | static inline void tick_resume(void) { } | ||
86 | #endif /* !GENERIC_CLOCKEVENTS */ | ||
87 | |||
88 | /* Oneshot related functions */ | ||
41 | #ifdef CONFIG_TICK_ONESHOT | 89 | #ifdef CONFIG_TICK_ONESHOT |
42 | extern void tick_setup_oneshot(struct clock_event_device *newdev, | 90 | extern void tick_setup_oneshot(struct clock_event_device *newdev, |
43 | void (*handler)(struct clock_event_device *), | 91 | void (*handler)(struct clock_event_device *), |
@@ -46,58 +94,42 @@ extern int tick_program_event(ktime_t expires, int force); | |||
46 | extern void tick_oneshot_notify(void); | 94 | extern void tick_oneshot_notify(void); |
47 | extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)); | 95 | extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)); |
48 | extern void tick_resume_oneshot(void); | 96 | extern void tick_resume_oneshot(void); |
49 | # ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST | 97 | static inline bool tick_oneshot_possible(void) { return true; } |
98 | extern int tick_oneshot_mode_active(void); | ||
99 | extern void tick_clock_notify(void); | ||
100 | extern int tick_check_oneshot_change(int allow_nohz); | ||
101 | extern int tick_init_highres(void); | ||
102 | #else /* !CONFIG_TICK_ONESHOT: */ | ||
103 | static inline | ||
104 | void tick_setup_oneshot(struct clock_event_device *newdev, | ||
105 | void (*handler)(struct clock_event_device *), | ||
106 | ktime_t nextevt) { BUG(); } | ||
107 | static inline void tick_resume_oneshot(void) { BUG(); } | ||
108 | static inline int tick_program_event(ktime_t expires, int force) { return 0; } | ||
109 | static inline void tick_oneshot_notify(void) { } | ||
110 | static inline bool tick_oneshot_possible(void) { return false; } | ||
111 | static inline int tick_oneshot_mode_active(void) { return 0; } | ||
112 | static inline void tick_clock_notify(void) { } | ||
113 | static inline int tick_check_oneshot_change(int allow_nohz) { return 0; } | ||
114 | #endif /* !CONFIG_TICK_ONESHOT */ | ||
115 | |||
116 | /* Functions related to oneshot broadcasting */ | ||
117 | #if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT) | ||
50 | extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc); | 118 | extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc); |
51 | extern int tick_broadcast_oneshot_control(unsigned long reason); | ||
52 | extern void tick_broadcast_switch_to_oneshot(void); | 119 | extern void tick_broadcast_switch_to_oneshot(void); |
53 | extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); | 120 | extern void tick_shutdown_broadcast_oneshot(unsigned int cpu); |
54 | extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); | ||
55 | extern int tick_broadcast_oneshot_active(void); | 121 | extern int tick_broadcast_oneshot_active(void); |
56 | extern void tick_check_oneshot_broadcast_this_cpu(void); | 122 | extern void tick_check_oneshot_broadcast_this_cpu(void); |
57 | bool tick_broadcast_oneshot_available(void); | 123 | bool tick_broadcast_oneshot_available(void); |
58 | # else /* BROADCAST */ | 124 | extern struct cpumask *tick_get_broadcast_oneshot_mask(void); |
59 | static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) | 125 | #else /* !(BROADCAST && ONESHOT): */ |
60 | { | 126 | static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); } |
61 | BUG(); | ||
62 | } | ||
63 | static inline int tick_broadcast_oneshot_control(unsigned long reason) { return 0; } | ||
64 | static inline void tick_broadcast_switch_to_oneshot(void) { } | 127 | static inline void tick_broadcast_switch_to_oneshot(void) { } |
65 | static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } | 128 | static inline void tick_shutdown_broadcast_oneshot(unsigned int cpu) { } |
66 | static inline int tick_broadcast_oneshot_active(void) { return 0; } | 129 | static inline int tick_broadcast_oneshot_active(void) { return 0; } |
67 | static inline void tick_check_oneshot_broadcast_this_cpu(void) { } | 130 | static inline void tick_check_oneshot_broadcast_this_cpu(void) { } |
68 | static inline bool tick_broadcast_oneshot_available(void) { return true; } | 131 | static inline bool tick_broadcast_oneshot_available(void) { return tick_oneshot_possible(); } |
69 | # endif /* !BROADCAST */ | 132 | #endif /* !(BROADCAST && ONESHOT) */ |
70 | |||
71 | #else /* !ONESHOT */ | ||
72 | static inline | ||
73 | void tick_setup_oneshot(struct clock_event_device *newdev, | ||
74 | void (*handler)(struct clock_event_device *), | ||
75 | ktime_t nextevt) | ||
76 | { | ||
77 | BUG(); | ||
78 | } | ||
79 | static inline void tick_resume_oneshot(void) | ||
80 | { | ||
81 | BUG(); | ||
82 | } | ||
83 | static inline int tick_program_event(ktime_t expires, int force) | ||
84 | { | ||
85 | return 0; | ||
86 | } | ||
87 | static inline void tick_oneshot_notify(void) { } | ||
88 | static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) | ||
89 | { | ||
90 | BUG(); | ||
91 | } | ||
92 | static inline int tick_broadcast_oneshot_control(unsigned long reason) { return 0; } | ||
93 | static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } | ||
94 | static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc) | ||
95 | { | ||
96 | return 0; | ||
97 | } | ||
98 | static inline int tick_broadcast_oneshot_active(void) { return 0; } | ||
99 | static inline bool tick_broadcast_oneshot_available(void) { return false; } | ||
100 | #endif /* !TICK_ONESHOT */ | ||
101 | 133 | ||
102 | /* NO_HZ_FULL internal */ | 134 | /* NO_HZ_FULL internal */ |
103 | #ifdef CONFIG_NO_HZ_FULL | 135 | #ifdef CONFIG_NO_HZ_FULL |
@@ -105,68 +137,3 @@ extern void tick_nohz_init(void); | |||
105 | # else | 137 | # else |
106 | static inline void tick_nohz_init(void) { } | 138 | static inline void tick_nohz_init(void) { } |
107 | #endif | 139 | #endif |
108 | |||
109 | /* | ||
110 | * Broadcasting support | ||
111 | */ | ||
112 | #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST | ||
113 | extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); | ||
114 | extern void tick_install_broadcast_device(struct clock_event_device *dev); | ||
115 | extern int tick_is_broadcast_device(struct clock_event_device *dev); | ||
116 | extern void tick_broadcast_on_off(unsigned long reason, int *oncpu); | ||
117 | extern void tick_shutdown_broadcast(unsigned int *cpup); | ||
118 | extern void tick_suspend_broadcast(void); | ||
119 | extern int tick_resume_broadcast(void); | ||
120 | extern void tick_broadcast_init(void); | ||
121 | extern void | ||
122 | tick_set_periodic_handler(struct clock_event_device *dev, int broadcast); | ||
123 | int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq); | ||
124 | |||
125 | #else /* !BROADCAST */ | ||
126 | |||
127 | static inline void tick_install_broadcast_device(struct clock_event_device *dev) | ||
128 | { | ||
129 | } | ||
130 | |||
131 | static inline int tick_is_broadcast_device(struct clock_event_device *dev) | ||
132 | { | ||
133 | return 0; | ||
134 | } | ||
135 | static inline int tick_device_uses_broadcast(struct clock_event_device *dev, | ||
136 | int cpu) | ||
137 | { | ||
138 | return 0; | ||
139 | } | ||
140 | static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { } | ||
141 | static inline void tick_broadcast_on_off(unsigned long reason, int *oncpu) { } | ||
142 | static inline void tick_shutdown_broadcast(unsigned int *cpup) { } | ||
143 | static inline void tick_suspend_broadcast(void) { } | ||
144 | static inline int tick_resume_broadcast(void) { return 0; } | ||
145 | static inline void tick_broadcast_init(void) { } | ||
146 | static inline int tick_broadcast_update_freq(struct clock_event_device *dev, | ||
147 | u32 freq) { return -ENODEV; } | ||
148 | |||
149 | /* | ||
150 | * Set the periodic handler in non broadcast mode | ||
151 | */ | ||
152 | static inline void tick_set_periodic_handler(struct clock_event_device *dev, | ||
153 | int broadcast) | ||
154 | { | ||
155 | dev->event_handler = tick_handle_periodic; | ||
156 | } | ||
157 | #endif /* !BROADCAST */ | ||
158 | |||
159 | /* | ||
160 | * Check, if the device is functional or a dummy for broadcast | ||
161 | */ | ||
162 | static inline int tick_device_is_functional(struct clock_event_device *dev) | ||
163 | { | ||
164 | return !(dev->features & CLOCK_EVT_FEAT_DUMMY); | ||
165 | } | ||
166 | |||
167 | int __clockevents_update_freq(struct clock_event_device *dev, u32 freq); | ||
168 | |||
169 | #endif | ||
170 | |||
171 | extern void do_timer(unsigned long ticks); | ||
172 | extern void update_wall_time(void); | ||
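
The tick-internal.h reshuffle keeps the usual kernel convention of pairing every conditionally built facility with empty static inline stubs, so call sites never need #ifdef guards. A generic sketch of that pattern is shown below; the config symbol and function names are invented for illustration.

/* Generic sketch of the extern-or-stub pattern; CONFIG_EXAMPLE_FEATURE
 * and the function names are invented for illustration.
 */
#ifdef CONFIG_EXAMPLE_FEATURE
extern void example_feature_init(void);
extern int  example_feature_active(void);
#else
static inline void example_feature_init(void) { }
static inline int  example_feature_active(void) { return 0; }
#endif
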
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 7ce740e78e1b..67a64b1670bf 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c | |||
@@ -38,7 +38,7 @@ void tick_resume_oneshot(void) | |||
38 | { | 38 | { |
39 | struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); | 39 | struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); |
40 | 40 | ||
41 | clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); | 41 | clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT); |
42 | clockevents_program_event(dev, ktime_get(), true); | 42 | clockevents_program_event(dev, ktime_get(), true); |
43 | } | 43 | } |
44 | 44 | ||
@@ -50,7 +50,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev, | |||
50 | ktime_t next_event) | 50 | ktime_t next_event) |
51 | { | 51 | { |
52 | newdev->event_handler = handler; | 52 | newdev->event_handler = handler; |
53 | clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT); | 53 | clockevents_set_state(newdev, CLOCK_EVT_STATE_ONESHOT); |
54 | clockevents_program_event(newdev, next_event, true); | 54 | clockevents_program_event(newdev, next_event, true); |
55 | } | 55 | } |
56 | 56 | ||
@@ -81,7 +81,7 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) | |||
81 | 81 | ||
82 | td->mode = TICKDEV_MODE_ONESHOT; | 82 | td->mode = TICKDEV_MODE_ONESHOT; |
83 | dev->event_handler = handler; | 83 | dev->event_handler = handler; |
84 | clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); | 84 | clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT); |
85 | tick_broadcast_switch_to_oneshot(); | 85 | tick_broadcast_switch_to_oneshot(); |
86 | return 0; | 86 | return 0; |
87 | } | 87 | } |
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index a4c4edac4528..914259128145 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -34,7 +34,7 @@ | |||
34 | /* | 34 | /* |
35 | * Per cpu nohz control structure | 35 | * Per cpu nohz control structure |
36 | */ | 36 | */ |
37 | DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); | 37 | static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); |
38 | 38 | ||
39 | /* | 39 | /* |
40 | * The time, when the last jiffy update happened. Protected by jiffies_lock. | 40 | * The time, when the last jiffy update happened. Protected by jiffies_lock. |
@@ -416,6 +416,11 @@ static int __init setup_tick_nohz(char *str) | |||
416 | 416 | ||
417 | __setup("nohz=", setup_tick_nohz); | 417 | __setup("nohz=", setup_tick_nohz); |
418 | 418 | ||
419 | int tick_nohz_tick_stopped(void) | ||
420 | { | ||
421 | return __this_cpu_read(tick_cpu_sched.tick_stopped); | ||
422 | } | ||
423 | |||
419 | /** | 424 | /** |
420 | * tick_nohz_update_jiffies - update jiffies when idle was interrupted | 425 | * tick_nohz_update_jiffies - update jiffies when idle was interrupted |
421 | * | 426 | * |
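
With tick_cpu_sched made static, code outside tick-sched.c can no longer read the per-cpu structure directly; the new tick_nohz_tick_stopped() accessor is the intended way to ask whether the local tick is stopped. A hedged sketch of a caller follows, assuming the accessor is declared via <linux/tick.h>; the surrounding function is hypothetical.

/* Hypothetical caller; only tick_nohz_tick_stopped() comes from the patch. */
#include <linux/tick.h>

static void example_idle_bookkeeping(void)
{
        if (tick_nohz_tick_stopped()) {
                /* the periodic tick on this CPU is currently stopped,
                 * so per-tick statistics must be derived differently */
        }
}
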
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h new file mode 100644 index 000000000000..28b5da3e1a17 --- /dev/null +++ b/kernel/time/tick-sched.h | |||
@@ -0,0 +1,74 @@ | |||
1 | #ifndef _TICK_SCHED_H | ||
2 | #define _TICK_SCHED_H | ||
3 | |||
4 | #include <linux/hrtimer.h> | ||
5 | |||
6 | enum tick_device_mode { | ||
7 | TICKDEV_MODE_PERIODIC, | ||
8 | TICKDEV_MODE_ONESHOT, | ||
9 | }; | ||
10 | |||
11 | struct tick_device { | ||
12 | struct clock_event_device *evtdev; | ||
13 | enum tick_device_mode mode; | ||
14 | }; | ||
15 | |||
16 | enum tick_nohz_mode { | ||
17 | NOHZ_MODE_INACTIVE, | ||
18 | NOHZ_MODE_LOWRES, | ||
19 | NOHZ_MODE_HIGHRES, | ||
20 | }; | ||
21 | |||
22 | /** | ||
23 | * struct tick_sched - sched tick emulation and no idle tick control/stats | ||
24 | * @sched_timer: hrtimer to schedule the periodic tick in high | ||
25 | * resolution mode | ||
26 | * @last_tick: Store the last tick expiry time when the tick | ||
27 | * timer is modified for nohz sleeps. This is necessary | ||
28 | * to resume the tick timer operation in the timeline | ||
29 | * when the CPU returns from nohz sleep. | ||
30 | * @tick_stopped: Indicator that the idle tick has been stopped | ||
31 | * @idle_jiffies: jiffies at the entry to idle for idle time accounting | ||
32 | * @idle_calls: Total number of idle calls | ||
33 | * @idle_sleeps: Number of idle calls, where the sched tick was stopped | ||
34 | * @idle_entrytime: Time when the idle call was entered | ||
35 | * @idle_waketime: Time when the idle was interrupted | ||
36 | * @idle_exittime: Time when the idle state was left | ||
37 | * @idle_sleeptime: Sum of the time slept in idle with sched tick stopped | ||
38 | * @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding | ||
39 | * @sleep_length: Duration of the current idle sleep | ||
40 | * @do_timer_last: CPU was the last one doing do_timer before going idle | ||
41 | */ | ||
42 | struct tick_sched { | ||
43 | struct hrtimer sched_timer; | ||
44 | unsigned long check_clocks; | ||
45 | enum tick_nohz_mode nohz_mode; | ||
46 | ktime_t last_tick; | ||
47 | int inidle; | ||
48 | int tick_stopped; | ||
49 | unsigned long idle_jiffies; | ||
50 | unsigned long idle_calls; | ||
51 | unsigned long idle_sleeps; | ||
52 | int idle_active; | ||
53 | ktime_t idle_entrytime; | ||
54 | ktime_t idle_waketime; | ||
55 | ktime_t idle_exittime; | ||
56 | ktime_t idle_sleeptime; | ||
57 | ktime_t iowait_sleeptime; | ||
58 | ktime_t sleep_length; | ||
59 | unsigned long last_jiffies; | ||
60 | unsigned long next_jiffies; | ||
61 | ktime_t idle_expires; | ||
62 | int do_timer_last; | ||
63 | }; | ||
64 | |||
65 | extern struct tick_sched *tick_get_tick_sched(int cpu); | ||
66 | |||
67 | extern void tick_setup_sched_timer(void); | ||
68 | #if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS | ||
69 | extern void tick_cancel_sched_timer(int cpu); | ||
70 | #else | ||
71 | static inline void tick_cancel_sched_timer(int cpu) { } | ||
72 | #endif | ||
73 | |||
74 | #endif | ||
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 91db94136c10..946acb72179f 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
@@ -59,17 +59,15 @@ struct tk_fast { | |||
59 | }; | 59 | }; |
60 | 60 | ||
61 | static struct tk_fast tk_fast_mono ____cacheline_aligned; | 61 | static struct tk_fast tk_fast_mono ____cacheline_aligned; |
62 | static struct tk_fast tk_fast_raw ____cacheline_aligned; | ||
62 | 63 | ||
63 | /* flag for if timekeeping is suspended */ | 64 | /* flag for if timekeeping is suspended */ |
64 | int __read_mostly timekeeping_suspended; | 65 | int __read_mostly timekeeping_suspended; |
65 | 66 | ||
66 | /* Flag for if there is a persistent clock on this platform */ | ||
67 | bool __read_mostly persistent_clock_exist = false; | ||
68 | |||
69 | static inline void tk_normalize_xtime(struct timekeeper *tk) | 67 | static inline void tk_normalize_xtime(struct timekeeper *tk) |
70 | { | 68 | { |
71 | while (tk->tkr.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr.shift)) { | 69 | while (tk->tkr_mono.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_mono.shift)) { |
72 | tk->tkr.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr.shift; | 70 | tk->tkr_mono.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_mono.shift; |
73 | tk->xtime_sec++; | 71 | tk->xtime_sec++; |
74 | } | 72 | } |
75 | } | 73 | } |
@@ -79,20 +77,20 @@ static inline struct timespec64 tk_xtime(struct timekeeper *tk) | |||
79 | struct timespec64 ts; | 77 | struct timespec64 ts; |
80 | 78 | ||
81 | ts.tv_sec = tk->xtime_sec; | 79 | ts.tv_sec = tk->xtime_sec; |
82 | ts.tv_nsec = (long)(tk->tkr.xtime_nsec >> tk->tkr.shift); | 80 | ts.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift); |
83 | return ts; | 81 | return ts; |
84 | } | 82 | } |
85 | 83 | ||
86 | static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts) | 84 | static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts) |
87 | { | 85 | { |
88 | tk->xtime_sec = ts->tv_sec; | 86 | tk->xtime_sec = ts->tv_sec; |
89 | tk->tkr.xtime_nsec = (u64)ts->tv_nsec << tk->tkr.shift; | 87 | tk->tkr_mono.xtime_nsec = (u64)ts->tv_nsec << tk->tkr_mono.shift; |
90 | } | 88 | } |
91 | 89 | ||
92 | static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts) | 90 | static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts) |
93 | { | 91 | { |
94 | tk->xtime_sec += ts->tv_sec; | 92 | tk->xtime_sec += ts->tv_sec; |
95 | tk->tkr.xtime_nsec += (u64)ts->tv_nsec << tk->tkr.shift; | 93 | tk->tkr_mono.xtime_nsec += (u64)ts->tv_nsec << tk->tkr_mono.shift; |
96 | tk_normalize_xtime(tk); | 94 | tk_normalize_xtime(tk); |
97 | } | 95 | } |
98 | 96 | ||
@@ -118,6 +116,117 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) | |||
118 | tk->offs_boot = ktime_add(tk->offs_boot, delta); | 116 | tk->offs_boot = ktime_add(tk->offs_boot, delta); |
119 | } | 117 | } |
120 | 118 | ||
119 | #ifdef CONFIG_DEBUG_TIMEKEEPING | ||
120 | #define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */ | ||
121 | /* | ||
122 | * These simple flag variables are managed | ||
123 | * without locks, which is racy, but ok since | ||
124 | * we don't really care about being super | ||
125 | * precise about how many events were seen, | ||
126 | * just that a problem was observed. | ||
127 | */ | ||
128 | static int timekeeping_underflow_seen; | ||
129 | static int timekeeping_overflow_seen; | ||
130 | |||
131 | /* last_warning is only modified under the timekeeping lock */ | ||
132 | static long timekeeping_last_warning; | ||
133 | |||
134 | static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset) | ||
135 | { | ||
136 | |||
137 | cycle_t max_cycles = tk->tkr_mono.clock->max_cycles; | ||
138 | const char *name = tk->tkr_mono.clock->name; | ||
139 | |||
140 | if (offset > max_cycles) { | ||
141 | printk_deferred("WARNING: timekeeping: Cycle offset (%lld) is larger than allowed by the '%s' clock's max_cycles value (%lld): time overflow danger\n", | ||
142 | offset, name, max_cycles); | ||
143 | printk_deferred(" timekeeping: Your kernel is sick, but tries to cope by capping time updates\n"); | ||
144 | } else { | ||
145 | if (offset > (max_cycles >> 1)) { | ||
146 | printk_deferred("INFO: timekeeping: Cycle offset (%lld) is larger than the the '%s' clock's 50%% safety margin (%lld)\n", | ||
147 | offset, name, max_cycles >> 1); | ||
148 | printk_deferred(" timekeeping: Your kernel is still fine, but is feeling a bit nervous\n"); | ||
149 | } | ||
150 | } | ||
151 | |||
152 | if (timekeeping_underflow_seen) { | ||
153 | if (jiffies - timekeeping_last_warning > WARNING_FREQ) { | ||
154 | printk_deferred("WARNING: Underflow in clocksource '%s' observed, time update ignored.\n", name); | ||
155 | printk_deferred(" Please report this, consider using a different clocksource, if possible.\n"); | ||
156 | printk_deferred(" Your kernel is probably still fine.\n"); | ||
157 | timekeeping_last_warning = jiffies; | ||
158 | } | ||
159 | timekeeping_underflow_seen = 0; | ||
160 | } | ||
161 | |||
162 | if (timekeeping_overflow_seen) { | ||
163 | if (jiffies - timekeeping_last_warning > WARNING_FREQ) { | ||
164 | printk_deferred("WARNING: Overflow in clocksource '%s' observed, time update capped.\n", name); | ||
165 | printk_deferred(" Please report this, consider using a different clocksource, if possible.\n"); | ||
166 | printk_deferred(" Your kernel is probably still fine.\n"); | ||
167 | timekeeping_last_warning = jiffies; | ||
168 | } | ||
169 | timekeeping_overflow_seen = 0; | ||
170 | } | ||
171 | } | ||
172 | |||
173 | static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr) | ||
174 | { | ||
175 | cycle_t now, last, mask, max, delta; | ||
176 | unsigned int seq; | ||
177 | |||
178 | /* | ||
179 | * Since we're called holding a seqlock, the data may shift | ||
180 | * under us while we're doing the calculation. This can cause | ||
181 | * false positives, since we'd note a problem but throw the | ||
182 | * results away. So nest another seqlock here to atomically | ||
183 | * grab the points we are checking with. | ||
184 | */ | ||
185 | do { | ||
186 | seq = read_seqcount_begin(&tk_core.seq); | ||
187 | now = tkr->read(tkr->clock); | ||
188 | last = tkr->cycle_last; | ||
189 | mask = tkr->mask; | ||
190 | max = tkr->clock->max_cycles; | ||
191 | } while (read_seqcount_retry(&tk_core.seq, seq)); | ||
192 | |||
193 | delta = clocksource_delta(now, last, mask); | ||
194 | |||
195 | /* | ||
196 | * Try to catch underflows by checking if we are seeing small | ||
197 | * mask-relative negative values. | ||
198 | */ | ||
199 | if (unlikely((~delta & mask) < (mask >> 3))) { | ||
200 | timekeeping_underflow_seen = 1; | ||
201 | delta = 0; | ||
202 | } | ||
203 | |||
204 | /* Cap delta value to the max_cycles values to avoid mult overflows */ | ||
205 | if (unlikely(delta > max)) { | ||
206 | timekeeping_overflow_seen = 1; | ||
207 | delta = tkr->clock->max_cycles; | ||
208 | } | ||
209 | |||
210 | return delta; | ||
211 | } | ||
212 | #else | ||
213 | static inline void timekeeping_check_update(struct timekeeper *tk, cycle_t offset) | ||
214 | { | ||
215 | } | ||
216 | static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr) | ||
217 | { | ||
218 | cycle_t cycle_now, delta; | ||
219 | |||
220 | /* read clocksource */ | ||
221 | cycle_now = tkr->read(tkr->clock); | ||
222 | |||
223 | /* calculate the delta since the last update_wall_time */ | ||
224 | delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask); | ||
225 | |||
226 | return delta; | ||
227 | } | ||
228 | #endif | ||
229 | |||
121 | /** | 230 | /** |
122 | * tk_setup_internals - Set up internals to use clocksource clock. | 231 | * tk_setup_internals - Set up internals to use clocksource clock. |
123 | * | 232 | * |
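
The CONFIG_DEBUG_TIMEKEEPING delta check treats a small value of (~delta & mask) as evidence that the clocksource ran backwards: a slightly negative delta wraps around to a value just below the mask, so its complement is tiny. A standalone worked example of that test, with values chosen purely for illustration:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t mask = 0xffffffffULL;          /* 32-bit clocksource counter */
        uint64_t last = 1000, now = 995;        /* counter appears to run backwards */
        uint64_t delta = (now - last) & mask;   /* wraps to 0xfffffffb */

        /* Same shape as the debug check: a small negative delta has almost
         * all mask bits set, so its complement under the mask is tiny.
         */
        if ((~delta & mask) < (mask >> 3))
                printf("underflow suspected, delta would be capped to 0\n");
        else
                printf("delta looks sane: %llu\n", (unsigned long long)delta);
        return 0;
}
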
@@ -135,11 +244,16 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) | |||
135 | u64 tmp, ntpinterval; | 244 | u64 tmp, ntpinterval; |
136 | struct clocksource *old_clock; | 245 | struct clocksource *old_clock; |
137 | 246 | ||
138 | old_clock = tk->tkr.clock; | 247 | old_clock = tk->tkr_mono.clock; |
139 | tk->tkr.clock = clock; | 248 | tk->tkr_mono.clock = clock; |
140 | tk->tkr.read = clock->read; | 249 | tk->tkr_mono.read = clock->read; |
141 | tk->tkr.mask = clock->mask; | 250 | tk->tkr_mono.mask = clock->mask; |
142 | tk->tkr.cycle_last = tk->tkr.read(clock); | 251 | tk->tkr_mono.cycle_last = tk->tkr_mono.read(clock); |
252 | |||
253 | tk->tkr_raw.clock = clock; | ||
254 | tk->tkr_raw.read = clock->read; | ||
255 | tk->tkr_raw.mask = clock->mask; | ||
256 | tk->tkr_raw.cycle_last = tk->tkr_mono.cycle_last; | ||
143 | 257 | ||
144 | /* Do the ns -> cycle conversion first, using original mult */ | 258 | /* Do the ns -> cycle conversion first, using original mult */ |
145 | tmp = NTP_INTERVAL_LENGTH; | 259 | tmp = NTP_INTERVAL_LENGTH; |
@@ -163,11 +277,14 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) | |||
163 | if (old_clock) { | 277 | if (old_clock) { |
164 | int shift_change = clock->shift - old_clock->shift; | 278 | int shift_change = clock->shift - old_clock->shift; |
165 | if (shift_change < 0) | 279 | if (shift_change < 0) |
166 | tk->tkr.xtime_nsec >>= -shift_change; | 280 | tk->tkr_mono.xtime_nsec >>= -shift_change; |
167 | else | 281 | else |
168 | tk->tkr.xtime_nsec <<= shift_change; | 282 | tk->tkr_mono.xtime_nsec <<= shift_change; |
169 | } | 283 | } |
170 | tk->tkr.shift = clock->shift; | 284 | tk->tkr_raw.xtime_nsec = 0; |
285 | |||
286 | tk->tkr_mono.shift = clock->shift; | ||
287 | tk->tkr_raw.shift = clock->shift; | ||
171 | 288 | ||
172 | tk->ntp_error = 0; | 289 | tk->ntp_error = 0; |
173 | tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; | 290 | tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; |
@@ -178,7 +295,8 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) | |||
178 | * active clocksource. These values will be adjusted via NTP | 295 | * active clocksource. These values will be adjusted via NTP |
179 | * to counteract clock drifting. | 296 | * to counteract clock drifting. |
180 | */ | 297 | */ |
181 | tk->tkr.mult = clock->mult; | 298 | tk->tkr_mono.mult = clock->mult; |
299 | tk->tkr_raw.mult = clock->mult; | ||
182 | tk->ntp_err_mult = 0; | 300 | tk->ntp_err_mult = 0; |
183 | } | 301 | } |
184 | 302 | ||
@@ -193,14 +311,10 @@ static inline u32 arch_gettimeoffset(void) { return 0; } | |||
193 | 311 | ||
194 | static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) | 312 | static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) |
195 | { | 313 | { |
196 | cycle_t cycle_now, delta; | 314 | cycle_t delta; |
197 | s64 nsec; | 315 | s64 nsec; |
198 | 316 | ||
199 | /* read clocksource: */ | 317 | delta = timekeeping_get_delta(tkr); |
200 | cycle_now = tkr->read(tkr->clock); | ||
201 | |||
202 | /* calculate the delta since the last update_wall_time: */ | ||
203 | delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask); | ||
204 | 318 | ||
205 | nsec = delta * tkr->mult + tkr->xtime_nsec; | 319 | nsec = delta * tkr->mult + tkr->xtime_nsec; |
206 | nsec >>= tkr->shift; | 320 | nsec >>= tkr->shift; |
@@ -209,25 +323,6 @@ static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) | |||
209 | return nsec + arch_gettimeoffset(); | 323 | return nsec + arch_gettimeoffset(); |
210 | } | 324 | } |
211 | 325 | ||
212 | static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) | ||
213 | { | ||
214 | struct clocksource *clock = tk->tkr.clock; | ||
215 | cycle_t cycle_now, delta; | ||
216 | s64 nsec; | ||
217 | |||
218 | /* read clocksource: */ | ||
219 | cycle_now = tk->tkr.read(clock); | ||
220 | |||
221 | /* calculate the delta since the last update_wall_time: */ | ||
222 | delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, tk->tkr.mask); | ||
223 | |||
224 | /* convert delta to nanoseconds. */ | ||
225 | nsec = clocksource_cyc2ns(delta, clock->mult, clock->shift); | ||
226 | |||
227 | /* If arch requires, add in get_arch_timeoffset() */ | ||
228 | return nsec + arch_gettimeoffset(); | ||
229 | } | ||
230 | |||
231 | /** | 326 | /** |
232 | * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper. | 327 | * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper. |
233 | * @tkr: Timekeeping readout base from which we take the update | 328 | * @tkr: Timekeeping readout base from which we take the update |
@@ -267,18 +362,18 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) | |||
267 | * slightly wrong timestamp (a few nanoseconds). See | 362 | * slightly wrong timestamp (a few nanoseconds). See |
268 | * @ktime_get_mono_fast_ns. | 363 | * @ktime_get_mono_fast_ns. |
269 | */ | 364 | */ |
270 | static void update_fast_timekeeper(struct tk_read_base *tkr) | 365 | static void update_fast_timekeeper(struct tk_read_base *tkr, struct tk_fast *tkf) |
271 | { | 366 | { |
272 | struct tk_read_base *base = tk_fast_mono.base; | 367 | struct tk_read_base *base = tkf->base; |
273 | 368 | ||
274 | /* Force readers off to base[1] */ | 369 | /* Force readers off to base[1] */ |
275 | raw_write_seqcount_latch(&tk_fast_mono.seq); | 370 | raw_write_seqcount_latch(&tkf->seq); |
276 | 371 | ||
277 | /* Update base[0] */ | 372 | /* Update base[0] */ |
278 | memcpy(base, tkr, sizeof(*base)); | 373 | memcpy(base, tkr, sizeof(*base)); |
279 | 374 | ||
280 | /* Force readers back to base[0] */ | 375 | /* Force readers back to base[0] */ |
281 | raw_write_seqcount_latch(&tk_fast_mono.seq); | 376 | raw_write_seqcount_latch(&tkf->seq); |
282 | 377 | ||
283 | /* Update base[1] */ | 378 | /* Update base[1] */ |
284 | memcpy(base + 1, base, sizeof(*base)); | 379 | memcpy(base + 1, base, sizeof(*base)); |
@@ -316,22 +411,33 @@ static void update_fast_timekeeper(struct tk_read_base *tkr) | |||
316 | * of the following timestamps. Callers need to be aware of that and | 411 | * of the following timestamps. Callers need to be aware of that and |
317 | * deal with it. | 412 | * deal with it. |
318 | */ | 413 | */ |
319 | u64 notrace ktime_get_mono_fast_ns(void) | 414 | static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) |
320 | { | 415 | { |
321 | struct tk_read_base *tkr; | 416 | struct tk_read_base *tkr; |
322 | unsigned int seq; | 417 | unsigned int seq; |
323 | u64 now; | 418 | u64 now; |
324 | 419 | ||
325 | do { | 420 | do { |
326 | seq = raw_read_seqcount(&tk_fast_mono.seq); | 421 | seq = raw_read_seqcount(&tkf->seq); |
327 | tkr = tk_fast_mono.base + (seq & 0x01); | 422 | tkr = tkf->base + (seq & 0x01); |
328 | now = ktime_to_ns(tkr->base_mono) + timekeeping_get_ns(tkr); | 423 | now = ktime_to_ns(tkr->base) + timekeeping_get_ns(tkr); |
424 | } while (read_seqcount_retry(&tkf->seq, seq)); | ||
329 | 425 | ||
330 | } while (read_seqcount_retry(&tk_fast_mono.seq, seq)); | ||
331 | return now; | 426 | return now; |
332 | } | 427 | } |
428 | |||
429 | u64 ktime_get_mono_fast_ns(void) | ||
430 | { | ||
431 | return __ktime_get_fast_ns(&tk_fast_mono); | ||
432 | } | ||
333 | EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns); | 433 | EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns); |
334 | 434 | ||
435 | u64 ktime_get_raw_fast_ns(void) | ||
436 | { | ||
437 | return __ktime_get_fast_ns(&tk_fast_raw); | ||
438 | } | ||
439 | EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns); | ||
440 | |||
335 | /* Suspend-time cycles value for halted fast timekeeper. */ | 441 | /* Suspend-time cycles value for halted fast timekeeper. */ |
336 | static cycle_t cycles_at_suspend; | 442 | static cycle_t cycles_at_suspend; |
337 | 443 | ||
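
__ktime_get_fast_ns() relies on the seqcount latch: the writer bumps the sequence, updates base[0], bumps it again and mirrors the data into base[1], while an NMI-safe reader picks base[seq & 0x01] and retries if the sequence moved. The simplified userspace model below shows the shape of that protocol; it uses hand-rolled C11 atomics and glosses over the barrier and tearing details that raw_write_seqcount_latch() and the retry loop handle in the kernel.

#include <stdatomic.h>
#include <string.h>

struct sample { long long base; long long offset; };

static _Atomic unsigned int latch_seq;
static struct sample latch_data[2];             /* the two latch copies */

/* Writer: steer readers to the other copy, update, then flip back. */
static void latch_update(const struct sample *new_val)
{
        atomic_fetch_add_explicit(&latch_seq, 1, memory_order_release); /* seq odd  */
        memcpy(&latch_data[0], new_val, sizeof(*new_val));
        atomic_fetch_add_explicit(&latch_seq, 1, memory_order_release); /* seq even */
        memcpy(&latch_data[1], new_val, sizeof(*new_val));
}

/* Reader: use the copy selected by the low sequence bit, retry on change. */
static long long latch_read(void)
{
        unsigned int seq;
        long long val;

        do {
                seq = atomic_load_explicit(&latch_seq, memory_order_acquire);
                val = latch_data[seq & 0x01].base + latch_data[seq & 0x01].offset;
        } while (atomic_load_explicit(&latch_seq, memory_order_acquire) != seq);

        return val;
}
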
@@ -353,12 +459,17 @@ static cycle_t dummy_clock_read(struct clocksource *cs) | |||
353 | static void halt_fast_timekeeper(struct timekeeper *tk) | 459 | static void halt_fast_timekeeper(struct timekeeper *tk) |
354 | { | 460 | { |
355 | static struct tk_read_base tkr_dummy; | 461 | static struct tk_read_base tkr_dummy; |
356 | struct tk_read_base *tkr = &tk->tkr; | 462 | struct tk_read_base *tkr = &tk->tkr_mono; |
357 | 463 | ||
358 | memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); | 464 | memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); |
359 | cycles_at_suspend = tkr->read(tkr->clock); | 465 | cycles_at_suspend = tkr->read(tkr->clock); |
360 | tkr_dummy.read = dummy_clock_read; | 466 | tkr_dummy.read = dummy_clock_read; |
361 | update_fast_timekeeper(&tkr_dummy); | 467 | update_fast_timekeeper(&tkr_dummy, &tk_fast_mono); |
468 | |||
469 | tkr = &tk->tkr_raw; | ||
470 | memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); | ||
471 | tkr_dummy.read = dummy_clock_read; | ||
472 | update_fast_timekeeper(&tkr_dummy, &tk_fast_raw); | ||
362 | } | 473 | } |
363 | 474 | ||
364 | #ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD | 475 | #ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD |
@@ -369,8 +480,8 @@ static inline void update_vsyscall(struct timekeeper *tk) | |||
369 | 480 | ||
370 | xt = timespec64_to_timespec(tk_xtime(tk)); | 481 | xt = timespec64_to_timespec(tk_xtime(tk)); |
371 | wm = timespec64_to_timespec(tk->wall_to_monotonic); | 482 | wm = timespec64_to_timespec(tk->wall_to_monotonic); |
372 | update_vsyscall_old(&xt, &wm, tk->tkr.clock, tk->tkr.mult, | 483 | update_vsyscall_old(&xt, &wm, tk->tkr_mono.clock, tk->tkr_mono.mult, |
373 | tk->tkr.cycle_last); | 484 | tk->tkr_mono.cycle_last); |
374 | } | 485 | } |
375 | 486 | ||
376 | static inline void old_vsyscall_fixup(struct timekeeper *tk) | 487 | static inline void old_vsyscall_fixup(struct timekeeper *tk) |
@@ -387,11 +498,11 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk) | |||
387 | * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD | 498 | * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD |
388 | * users are removed, this can be killed. | 499 | * users are removed, this can be killed. |
389 | */ | 500 | */ |
390 | remainder = tk->tkr.xtime_nsec & ((1ULL << tk->tkr.shift) - 1); | 501 | remainder = tk->tkr_mono.xtime_nsec & ((1ULL << tk->tkr_mono.shift) - 1); |
391 | tk->tkr.xtime_nsec -= remainder; | 502 | tk->tkr_mono.xtime_nsec -= remainder; |
392 | tk->tkr.xtime_nsec += 1ULL << tk->tkr.shift; | 503 | tk->tkr_mono.xtime_nsec += 1ULL << tk->tkr_mono.shift; |
393 | tk->ntp_error += remainder << tk->ntp_error_shift; | 504 | tk->ntp_error += remainder << tk->ntp_error_shift; |
394 | tk->ntp_error -= (1ULL << tk->tkr.shift) << tk->ntp_error_shift; | 505 | tk->ntp_error -= (1ULL << tk->tkr_mono.shift) << tk->ntp_error_shift; |
395 | } | 506 | } |
396 | #else | 507 | #else |
397 | #define old_vsyscall_fixup(tk) | 508 | #define old_vsyscall_fixup(tk) |
@@ -456,17 +567,17 @@ static inline void tk_update_ktime_data(struct timekeeper *tk) | |||
456 | */ | 567 | */ |
457 | seconds = (u64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec); | 568 | seconds = (u64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec); |
458 | nsec = (u32) tk->wall_to_monotonic.tv_nsec; | 569 | nsec = (u32) tk->wall_to_monotonic.tv_nsec; |
459 | tk->tkr.base_mono = ns_to_ktime(seconds * NSEC_PER_SEC + nsec); | 570 | tk->tkr_mono.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec); |
460 | 571 | ||
461 | /* Update the monotonic raw base */ | 572 | /* Update the monotonic raw base */ |
462 | tk->base_raw = timespec64_to_ktime(tk->raw_time); | 573 | tk->tkr_raw.base = timespec64_to_ktime(tk->raw_time); |
463 | 574 | ||
464 | /* | 575 | /* |
465 | * The sum of the nanoseconds portions of xtime and | 576 | * The sum of the nanoseconds portions of xtime and |
466 | * wall_to_monotonic can be greater/equal one second. Take | 577 | * wall_to_monotonic can be greater/equal one second. Take |
467 | * this into account before updating tk->ktime_sec. | 578 | * this into account before updating tk->ktime_sec. |
468 | */ | 579 | */ |
469 | nsec += (u32)(tk->tkr.xtime_nsec >> tk->tkr.shift); | 580 | nsec += (u32)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift); |
470 | if (nsec >= NSEC_PER_SEC) | 581 | if (nsec >= NSEC_PER_SEC) |
471 | seconds++; | 582 | seconds++; |
472 | tk->ktime_sec = seconds; | 583 | tk->ktime_sec = seconds; |
@@ -489,7 +600,8 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) | |||
489 | memcpy(&shadow_timekeeper, &tk_core.timekeeper, | 600 | memcpy(&shadow_timekeeper, &tk_core.timekeeper, |
490 | sizeof(tk_core.timekeeper)); | 601 | sizeof(tk_core.timekeeper)); |
491 | 602 | ||
492 | update_fast_timekeeper(&tk->tkr); | 603 | update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono); |
604 | update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw); | ||
493 | } | 605 | } |
494 | 606 | ||
495 | /** | 607 | /** |
@@ -501,22 +613,23 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) | |||
501 | */ | 613 | */ |
502 | static void timekeeping_forward_now(struct timekeeper *tk) | 614 | static void timekeeping_forward_now(struct timekeeper *tk) |
503 | { | 615 | { |
504 | struct clocksource *clock = tk->tkr.clock; | 616 | struct clocksource *clock = tk->tkr_mono.clock; |
505 | cycle_t cycle_now, delta; | 617 | cycle_t cycle_now, delta; |
506 | s64 nsec; | 618 | s64 nsec; |
507 | 619 | ||
508 | cycle_now = tk->tkr.read(clock); | 620 | cycle_now = tk->tkr_mono.read(clock); |
509 | delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, tk->tkr.mask); | 621 | delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask); |
510 | tk->tkr.cycle_last = cycle_now; | 622 | tk->tkr_mono.cycle_last = cycle_now; |
623 | tk->tkr_raw.cycle_last = cycle_now; | ||
511 | 624 | ||
512 | tk->tkr.xtime_nsec += delta * tk->tkr.mult; | 625 | tk->tkr_mono.xtime_nsec += delta * tk->tkr_mono.mult; |
513 | 626 | ||
514 | /* If arch requires, add in get_arch_timeoffset() */ | 627 | /* If arch requires, add in get_arch_timeoffset() */ |
515 | tk->tkr.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr.shift; | 628 | tk->tkr_mono.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr_mono.shift; |
516 | 629 | ||
517 | tk_normalize_xtime(tk); | 630 | tk_normalize_xtime(tk); |
518 | 631 | ||
519 | nsec = clocksource_cyc2ns(delta, clock->mult, clock->shift); | 632 | nsec = clocksource_cyc2ns(delta, tk->tkr_raw.mult, tk->tkr_raw.shift); |
520 | timespec64_add_ns(&tk->raw_time, nsec); | 633 | timespec64_add_ns(&tk->raw_time, nsec); |
521 | } | 634 | } |
522 | 635 | ||
@@ -537,7 +650,7 @@ int __getnstimeofday64(struct timespec64 *ts) | |||
537 | seq = read_seqcount_begin(&tk_core.seq); | 650 | seq = read_seqcount_begin(&tk_core.seq); |
538 | 651 | ||
539 | ts->tv_sec = tk->xtime_sec; | 652 | ts->tv_sec = tk->xtime_sec; |
540 | nsecs = timekeeping_get_ns(&tk->tkr); | 653 | nsecs = timekeeping_get_ns(&tk->tkr_mono); |
541 | 654 | ||
542 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 655 | } while (read_seqcount_retry(&tk_core.seq, seq)); |
543 | 656 | ||
@@ -577,8 +690,8 @@ ktime_t ktime_get(void) | |||
577 | 690 | ||
578 | do { | 691 | do { |
579 | seq = read_seqcount_begin(&tk_core.seq); | 692 | seq = read_seqcount_begin(&tk_core.seq); |
580 | base = tk->tkr.base_mono; | 693 | base = tk->tkr_mono.base; |
581 | nsecs = timekeeping_get_ns(&tk->tkr); | 694 | nsecs = timekeeping_get_ns(&tk->tkr_mono); |
582 | 695 | ||
583 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 696 | } while (read_seqcount_retry(&tk_core.seq, seq)); |
584 | 697 | ||
@@ -603,8 +716,8 @@ ktime_t ktime_get_with_offset(enum tk_offsets offs) | |||
603 | 716 | ||
604 | do { | 717 | do { |
605 | seq = read_seqcount_begin(&tk_core.seq); | 718 | seq = read_seqcount_begin(&tk_core.seq); |
606 | base = ktime_add(tk->tkr.base_mono, *offset); | 719 | base = ktime_add(tk->tkr_mono.base, *offset); |
607 | nsecs = timekeeping_get_ns(&tk->tkr); | 720 | nsecs = timekeeping_get_ns(&tk->tkr_mono); |
608 | 721 | ||
609 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 722 | } while (read_seqcount_retry(&tk_core.seq, seq)); |
610 | 723 | ||
@@ -645,8 +758,8 @@ ktime_t ktime_get_raw(void) | |||
645 | 758 | ||
646 | do { | 759 | do { |
647 | seq = read_seqcount_begin(&tk_core.seq); | 760 | seq = read_seqcount_begin(&tk_core.seq); |
648 | base = tk->base_raw; | 761 | base = tk->tkr_raw.base; |
649 | nsecs = timekeeping_get_ns_raw(tk); | 762 | nsecs = timekeeping_get_ns(&tk->tkr_raw); |
650 | 763 | ||
651 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 764 | } while (read_seqcount_retry(&tk_core.seq, seq)); |
652 | 765 | ||
@@ -674,7 +787,7 @@ void ktime_get_ts64(struct timespec64 *ts) | |||
674 | do { | 787 | do { |
675 | seq = read_seqcount_begin(&tk_core.seq); | 788 | seq = read_seqcount_begin(&tk_core.seq); |
676 | ts->tv_sec = tk->xtime_sec; | 789 | ts->tv_sec = tk->xtime_sec; |
677 | nsec = timekeeping_get_ns(&tk->tkr); | 790 | nsec = timekeeping_get_ns(&tk->tkr_mono); |
678 | tomono = tk->wall_to_monotonic; | 791 | tomono = tk->wall_to_monotonic; |
679 | 792 | ||
680 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 793 | } while (read_seqcount_retry(&tk_core.seq, seq)); |
@@ -759,8 +872,8 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) | |||
759 | ts_real->tv_sec = tk->xtime_sec; | 872 | ts_real->tv_sec = tk->xtime_sec; |
760 | ts_real->tv_nsec = 0; | 873 | ts_real->tv_nsec = 0; |
761 | 874 | ||
762 | nsecs_raw = timekeeping_get_ns_raw(tk); | 875 | nsecs_raw = timekeeping_get_ns(&tk->tkr_raw); |
763 | nsecs_real = timekeeping_get_ns(&tk->tkr); | 876 | nsecs_real = timekeeping_get_ns(&tk->tkr_mono); |
764 | 877 | ||
765 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 878 | } while (read_seqcount_retry(&tk_core.seq, seq)); |
766 | 879 | ||
@@ -943,7 +1056,7 @@ static int change_clocksource(void *data) | |||
943 | */ | 1056 | */ |
944 | if (try_module_get(new->owner)) { | 1057 | if (try_module_get(new->owner)) { |
945 | if (!new->enable || new->enable(new) == 0) { | 1058 | if (!new->enable || new->enable(new) == 0) { |
946 | old = tk->tkr.clock; | 1059 | old = tk->tkr_mono.clock; |
947 | tk_setup_internals(tk, new); | 1060 | tk_setup_internals(tk, new); |
948 | if (old->disable) | 1061 | if (old->disable) |
949 | old->disable(old); | 1062 | old->disable(old); |
@@ -971,11 +1084,11 @@ int timekeeping_notify(struct clocksource *clock) | |||
971 | { | 1084 | { |
972 | struct timekeeper *tk = &tk_core.timekeeper; | 1085 | struct timekeeper *tk = &tk_core.timekeeper; |
973 | 1086 | ||
974 | if (tk->tkr.clock == clock) | 1087 | if (tk->tkr_mono.clock == clock) |
975 | return 0; | 1088 | return 0; |
976 | stop_machine(change_clocksource, clock, NULL); | 1089 | stop_machine(change_clocksource, clock, NULL); |
977 | tick_clock_notify(); | 1090 | tick_clock_notify(); |
978 | return tk->tkr.clock == clock ? 0 : -1; | 1091 | return tk->tkr_mono.clock == clock ? 0 : -1; |
979 | } | 1092 | } |
980 | 1093 | ||
981 | /** | 1094 | /** |
@@ -993,7 +1106,7 @@ void getrawmonotonic64(struct timespec64 *ts) | |||
993 | 1106 | ||
994 | do { | 1107 | do { |
995 | seq = read_seqcount_begin(&tk_core.seq); | 1108 | seq = read_seqcount_begin(&tk_core.seq); |
996 | nsecs = timekeeping_get_ns_raw(tk); | 1109 | nsecs = timekeeping_get_ns(&tk->tkr_raw); |
997 | ts64 = tk->raw_time; | 1110 | ts64 = tk->raw_time; |
998 | 1111 | ||
999 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 1112 | } while (read_seqcount_retry(&tk_core.seq, seq)); |
@@ -1016,7 +1129,7 @@ int timekeeping_valid_for_hres(void) | |||
1016 | do { | 1129 | do { |
1017 | seq = read_seqcount_begin(&tk_core.seq); | 1130 | seq = read_seqcount_begin(&tk_core.seq); |
1018 | 1131 | ||
1019 | ret = tk->tkr.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; | 1132 | ret = tk->tkr_mono.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; |
1020 | 1133 | ||
1021 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 1134 | } while (read_seqcount_retry(&tk_core.seq, seq)); |
1022 | 1135 | ||
@@ -1035,7 +1148,7 @@ u64 timekeeping_max_deferment(void) | |||
1035 | do { | 1148 | do { |
1036 | seq = read_seqcount_begin(&tk_core.seq); | 1149 | seq = read_seqcount_begin(&tk_core.seq); |
1037 | 1150 | ||
1038 | ret = tk->tkr.clock->max_idle_ns; | 1151 | ret = tk->tkr_mono.clock->max_idle_ns; |
1039 | 1152 | ||
1040 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 1153 | } while (read_seqcount_retry(&tk_core.seq, seq)); |
1041 | 1154 | ||
@@ -1057,6 +1170,14 @@ void __weak read_persistent_clock(struct timespec *ts) | |||
1057 | ts->tv_nsec = 0; | 1170 | ts->tv_nsec = 0; |
1058 | } | 1171 | } |
1059 | 1172 | ||
1173 | void __weak read_persistent_clock64(struct timespec64 *ts64) | ||
1174 | { | ||
1175 | struct timespec ts; | ||
1176 | |||
1177 | read_persistent_clock(&ts); | ||
1178 | *ts64 = timespec_to_timespec64(ts); | ||
1179 | } | ||
1180 | |||
1060 | /** | 1181 | /** |
1061 | * read_boot_clock - Return time of the system start. | 1182 | * read_boot_clock - Return time of the system start. |
1062 | * | 1183 | * |
@@ -1072,6 +1193,20 @@ void __weak read_boot_clock(struct timespec *ts) | |||
1072 | ts->tv_nsec = 0; | 1193 | ts->tv_nsec = 0; |
1073 | } | 1194 | } |
1074 | 1195 | ||
1196 | void __weak read_boot_clock64(struct timespec64 *ts64) | ||
1197 | { | ||
1198 | struct timespec ts; | ||
1199 | |||
1200 | read_boot_clock(&ts); | ||
1201 | *ts64 = timespec_to_timespec64(ts); | ||
1202 | } | ||
1203 | |||
1204 | /* Flag for if timekeeping_resume() has injected sleeptime */ | ||
1205 | static bool sleeptime_injected; | ||
1206 | |||
1207 | /* Flag for if there is a persistent clock on this platform */ | ||
1208 | static bool persistent_clock_exists; | ||
1209 | |||
1075 | /* | 1210 | /* |
1076 | * timekeeping_init - Initializes the clocksource and common timekeeping values | 1211 | * timekeeping_init - Initializes the clocksource and common timekeeping values |
1077 | */ | 1212 | */ |
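
The weak read_persistent_clock64()/read_boot_clock64() defaults wrap the legacy timespec interfaces, so architectures keep working unchanged while timekeeping itself moves to timespec64; an architecture with a native 64-bit persistent clock can simply override the weak symbol. A hedged sketch of such an override, where my_rtc_read_seconds() is a made-up hardware accessor:

/* Hypothetical architecture override; my_rtc_read_seconds() is made up. */
void read_persistent_clock64(struct timespec64 *ts)
{
        ts->tv_sec  = my_rtc_read_seconds();    /* 64-bit seconds from the RTC */
        ts->tv_nsec = 0;
}
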
@@ -1081,20 +1216,17 @@ void __init timekeeping_init(void) | |||
1081 | struct clocksource *clock; | 1216 | struct clocksource *clock; |
1082 | unsigned long flags; | 1217 | unsigned long flags; |
1083 | struct timespec64 now, boot, tmp; | 1218 | struct timespec64 now, boot, tmp; |
1084 | struct timespec ts; | ||
1085 | 1219 | ||
1086 | read_persistent_clock(&ts); | 1220 | read_persistent_clock64(&now); |
1087 | now = timespec_to_timespec64(ts); | ||
1088 | if (!timespec64_valid_strict(&now)) { | 1221 | if (!timespec64_valid_strict(&now)) { |
1089 | pr_warn("WARNING: Persistent clock returned invalid value!\n" | 1222 | pr_warn("WARNING: Persistent clock returned invalid value!\n" |
1090 | " Check your CMOS/BIOS settings.\n"); | 1223 | " Check your CMOS/BIOS settings.\n"); |
1091 | now.tv_sec = 0; | 1224 | now.tv_sec = 0; |
1092 | now.tv_nsec = 0; | 1225 | now.tv_nsec = 0; |
1093 | } else if (now.tv_sec || now.tv_nsec) | 1226 | } else if (now.tv_sec || now.tv_nsec) |
1094 | persistent_clock_exist = true; | 1227 | persistent_clock_exists = true; |
1095 | 1228 | ||
1096 | read_boot_clock(&ts); | 1229 | read_boot_clock64(&boot); |
1097 | boot = timespec_to_timespec64(ts); | ||
1098 | if (!timespec64_valid_strict(&boot)) { | 1230 | if (!timespec64_valid_strict(&boot)) { |
1099 | pr_warn("WARNING: Boot clock returned invalid value!\n" | 1231 | pr_warn("WARNING: Boot clock returned invalid value!\n" |
1100 | " Check your CMOS/BIOS settings.\n"); | 1232 | " Check your CMOS/BIOS settings.\n"); |
@@ -1114,7 +1246,6 @@ void __init timekeeping_init(void) | |||
1114 | tk_set_xtime(tk, &now); | 1246 | tk_set_xtime(tk, &now); |
1115 | tk->raw_time.tv_sec = 0; | 1247 | tk->raw_time.tv_sec = 0; |
1116 | tk->raw_time.tv_nsec = 0; | 1248 | tk->raw_time.tv_nsec = 0; |
1117 | tk->base_raw.tv64 = 0; | ||
1118 | if (boot.tv_sec == 0 && boot.tv_nsec == 0) | 1249 | if (boot.tv_sec == 0 && boot.tv_nsec == 0) |
1119 | boot = tk_xtime(tk); | 1250 | boot = tk_xtime(tk); |
1120 | 1251 | ||
@@ -1127,7 +1258,7 @@ void __init timekeeping_init(void) | |||
1127 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); | 1258 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); |
1128 | } | 1259 | } |
1129 | 1260 | ||
1130 | /* time in seconds when suspend began */ | 1261 | /* time in seconds when suspend began for persistent clock */ |
1131 | static struct timespec64 timekeeping_suspend_time; | 1262 | static struct timespec64 timekeeping_suspend_time; |
1132 | 1263 | ||
1133 | /** | 1264 | /** |
@@ -1152,12 +1283,49 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, | |||
1152 | tk_debug_account_sleep_time(delta); | 1283 | tk_debug_account_sleep_time(delta); |
1153 | } | 1284 | } |
1154 | 1285 | ||
1286 | #if defined(CONFIG_PM_SLEEP) && defined(CONFIG_RTC_HCTOSYS_DEVICE) | ||
1287 | /** | ||
1288 | * We have three kinds of time sources to use for sleep time | ||
1289 | * injection; the preference order is: | ||
1290 | * 1) non-stop clocksource | ||
1291 | * 2) persistent clock (ie: RTC accessible when irqs are off) | ||
1292 | * 3) RTC | ||
1293 | * | ||
1294 | * 1) and 2) are used by timekeeping, 3) by the RTC subsystem. | ||
1295 | * If the system has neither 1) nor 2), 3) is used as a fallback. | ||
1296 | * | ||
1297 | * | ||
1298 | * If timekeeping has already injected the sleep time via 1) or 2), | ||
1299 | * 3) is not needed, so in this case we don't need to call | ||
1300 | * rtc_resume(); that is what timekeeping_rtc_skipresume() | ||
1301 | * indicates. | ||
1302 | */ | ||
1303 | bool timekeeping_rtc_skipresume(void) | ||
1304 | { | ||
1305 | return sleeptime_injected; | ||
1306 | } | ||
1307 | |||
1308 | /** | ||
1309 | * Whether 1) can be used is only known in timekeeping_resume(), | ||
1310 | * which is invoked after rtc_suspend(), so rtc_suspend() cannot | ||
1311 | * be skipped with certainty if the system only has 1). | ||
1312 | * | ||
1313 | * But if system has 2), 2) will definitely be used, so in this | ||
1314 | * case we don't need to call rtc_suspend(), and this is what | ||
1315 | * timekeeping_rtc_skipsuspend() means. | ||
1316 | */ | ||
1317 | bool timekeeping_rtc_skipsuspend(void) | ||
1318 | { | ||
1319 | return persistent_clock_exists; | ||
1320 | } | ||
1321 | |||
1155 | /** | 1322 | /** |
1156 | * timekeeping_inject_sleeptime64 - Adds suspend interval to timekeeping values | 1323 | * timekeeping_inject_sleeptime64 - Adds suspend interval to timekeeping values |
1157 | * @delta: pointer to a timespec64 delta value | 1324 | * @delta: pointer to a timespec64 delta value |
1158 | * | 1325 | * |
1159 | * This hook is for architectures that cannot support read_persistent_clock | 1326 | * This hook is for architectures that cannot support read_persistent_clock64 |
1160 | * because their RTC/persistent clock is only accessible when irqs are enabled. | 1327 | * because their RTC/persistent clock is only accessible when irqs are enabled. |
1328 | * and also don't have an effective nonstop clocksource. | ||
1161 | * | 1329 | * |
1162 | * This function should only be called by rtc_resume(), and allows | 1330 | * This function should only be called by rtc_resume(), and allows |
1163 | * a suspend offset to be injected into the timekeeping values. | 1331 | * a suspend offset to be injected into the timekeeping values. |
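The preference order spelled out in the new comment above is easy to model outside the kernel. The sketch below is a plain userspace C approximation of that selection logic; the two flags mirror the kernel's sleeptime_injected and persistent_clock_exists, while pick_sleeptime_source() and the main() scenario are illustrative helpers, not kernel code.

/* Userspace model of the sleep-time source selection described above.
 * Preference: 1) nonstop clocksource, 2) persistent clock, 3) RTC. */
#include <stdbool.h>
#include <stdio.h>

static bool sleeptime_injected;      /* set if 1) or 2) supplied the delta */
static bool persistent_clock_exists; /* set once 2) returns a nonzero time */

/* rtc_resume() can skip its own injection when timekeeping already did it */
static bool timekeeping_rtc_skipresume(void) { return sleeptime_injected; }

/* rtc_suspend() can be skipped only when 2) is guaranteed to be used,
 * because 1) is only evaluated later, in timekeeping_resume() */
static bool timekeeping_rtc_skipsuspend(void) { return persistent_clock_exists; }

static const char *pick_sleeptime_source(bool nonstop_cs, bool persistent)
{
    persistent_clock_exists = persistent;
    if (nonstop_cs) {
        sleeptime_injected = true;
        return "nonstop clocksource";
    }
    if (persistent) {
        sleeptime_injected = true;
        return "persistent clock";
    }
    sleeptime_injected = false;
    return "RTC (handled by rtc_resume())";
}

int main(void)
{
    /* a system with only a persistent clock */
    const char *src = pick_sleeptime_source(false, true);

    printf("source: %s, skip rtc_resume: %d, skip rtc_suspend: %d\n",
           src, timekeeping_rtc_skipresume(), timekeeping_rtc_skipsuspend());
    return 0;
}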
@@ -1167,13 +1335,6 @@ void timekeeping_inject_sleeptime64(struct timespec64 *delta) | |||
1167 | struct timekeeper *tk = &tk_core.timekeeper; | 1335 | struct timekeeper *tk = &tk_core.timekeeper; |
1168 | unsigned long flags; | 1336 | unsigned long flags; |
1169 | 1337 | ||
1170 | /* | ||
1171 | * Make sure we don't set the clock twice, as timekeeping_resume() | ||
1172 | * already did it | ||
1173 | */ | ||
1174 | if (has_persistent_clock()) | ||
1175 | return; | ||
1176 | |||
1177 | raw_spin_lock_irqsave(&timekeeper_lock, flags); | 1338 | raw_spin_lock_irqsave(&timekeeper_lock, flags); |
1178 | write_seqcount_begin(&tk_core.seq); | 1339 | write_seqcount_begin(&tk_core.seq); |
1179 | 1340 | ||
@@ -1189,26 +1350,21 @@ void timekeeping_inject_sleeptime64(struct timespec64 *delta) | |||
1189 | /* signal hrtimers about time change */ | 1350 | /* signal hrtimers about time change */ |
1190 | clock_was_set(); | 1351 | clock_was_set(); |
1191 | } | 1352 | } |
1353 | #endif | ||
1192 | 1354 | ||
1193 | /** | 1355 | /** |
1194 | * timekeeping_resume - Resumes the generic timekeeping subsystem. | 1356 | * timekeeping_resume - Resumes the generic timekeeping subsystem. |
1195 | * | ||
1196 | * This is for the generic clocksource timekeeping. | ||
1197 | * xtime/wall_to_monotonic/jiffies/etc are | ||
1198 | * still managed by arch specific suspend/resume code. | ||
1199 | */ | 1357 | */ |
1200 | void timekeeping_resume(void) | 1358 | void timekeeping_resume(void) |
1201 | { | 1359 | { |
1202 | struct timekeeper *tk = &tk_core.timekeeper; | 1360 | struct timekeeper *tk = &tk_core.timekeeper; |
1203 | struct clocksource *clock = tk->tkr.clock; | 1361 | struct clocksource *clock = tk->tkr_mono.clock; |
1204 | unsigned long flags; | 1362 | unsigned long flags; |
1205 | struct timespec64 ts_new, ts_delta; | 1363 | struct timespec64 ts_new, ts_delta; |
1206 | struct timespec tmp; | ||
1207 | cycle_t cycle_now, cycle_delta; | 1364 | cycle_t cycle_now, cycle_delta; |
1208 | bool suspendtime_found = false; | ||
1209 | 1365 | ||
1210 | read_persistent_clock(&tmp); | 1366 | sleeptime_injected = false; |
1211 | ts_new = timespec_to_timespec64(tmp); | 1367 | read_persistent_clock64(&ts_new); |
1212 | 1368 | ||
1213 | clockevents_resume(); | 1369 | clockevents_resume(); |
1214 | clocksource_resume(); | 1370 | clocksource_resume(); |
@@ -1228,16 +1384,16 @@ void timekeeping_resume(void) | |||
1228 | * The less preferred source will only be tried if there is no better | 1384 | * The less preferred source will only be tried if there is no better |
1229 | * usable source. The rtc part is handled separately in rtc core code. | 1385 | * usable source. The rtc part is handled separately in rtc core code. |
1230 | */ | 1386 | */ |
1231 | cycle_now = tk->tkr.read(clock); | 1387 | cycle_now = tk->tkr_mono.read(clock); |
1232 | if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) && | 1388 | if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) && |
1233 | cycle_now > tk->tkr.cycle_last) { | 1389 | cycle_now > tk->tkr_mono.cycle_last) { |
1234 | u64 num, max = ULLONG_MAX; | 1390 | u64 num, max = ULLONG_MAX; |
1235 | u32 mult = clock->mult; | 1391 | u32 mult = clock->mult; |
1236 | u32 shift = clock->shift; | 1392 | u32 shift = clock->shift; |
1237 | s64 nsec = 0; | 1393 | s64 nsec = 0; |
1238 | 1394 | ||
1239 | cycle_delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, | 1395 | cycle_delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, |
1240 | tk->tkr.mask); | 1396 | tk->tkr_mono.mask); |
1241 | 1397 | ||
1242 | /* | 1398 | /* |
1243 | * "cycle_delta * mutl" may cause 64 bits overflow, if the | 1399 | * "cycle_delta * mutl" may cause 64 bits overflow, if the |
@@ -1253,17 +1409,19 @@ void timekeeping_resume(void) | |||
1253 | nsec += ((u64) cycle_delta * mult) >> shift; | 1409 | nsec += ((u64) cycle_delta * mult) >> shift; |
1254 | 1410 | ||
1255 | ts_delta = ns_to_timespec64(nsec); | 1411 | ts_delta = ns_to_timespec64(nsec); |
1256 | suspendtime_found = true; | 1412 | sleeptime_injected = true; |
1257 | } else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) { | 1413 | } else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) { |
1258 | ts_delta = timespec64_sub(ts_new, timekeeping_suspend_time); | 1414 | ts_delta = timespec64_sub(ts_new, timekeeping_suspend_time); |
1259 | suspendtime_found = true; | 1415 | sleeptime_injected = true; |
1260 | } | 1416 | } |
1261 | 1417 | ||
1262 | if (suspendtime_found) | 1418 | if (sleeptime_injected) |
1263 | __timekeeping_inject_sleeptime(tk, &ts_delta); | 1419 | __timekeeping_inject_sleeptime(tk, &ts_delta); |
1264 | 1420 | ||
1265 | /* Re-base the last cycle value */ | 1421 | /* Re-base the last cycle value */ |
1266 | tk->tkr.cycle_last = cycle_now; | 1422 | tk->tkr_mono.cycle_last = cycle_now; |
1423 | tk->tkr_raw.cycle_last = cycle_now; | ||
1424 | |||
1267 | tk->ntp_error = 0; | 1425 | tk->ntp_error = 0; |
1268 | timekeeping_suspended = 0; | 1426 | timekeeping_suspended = 0; |
1269 | timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); | 1427 | timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); |
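The resume path above converts the raw cycle delta to nanoseconds without letting cycle_delta * mult overflow 64 bits, by consuming the delta in bounded chunks. Below is a standalone sketch of that idea with an illustrative mult/shift pair rather than real clocksource parameters; each chunk still truncates the bits shifted out, as the kernel code does.

/* Userspace sketch of the overflow-safe cycles -> nanoseconds conversion
 * used on resume: (cycles * mult) can overflow 64 bits after a long sleep,
 * so the delta is consumed in chunks of at most 'max' cycles. */
#include <stdint.h>
#include <stdio.h>
#include <inttypes.h>

static uint64_t cycles_to_ns_safe(uint64_t delta, uint32_t mult, uint32_t shift)
{
    /* largest chunk whose product with mult still fits in 64 bits */
    uint64_t max = UINT64_MAX / mult;
    uint64_t nsec = 0;

    while (delta > max) {
        nsec += (max * mult) >> shift;
        delta -= max;
    }
    nsec += (delta * mult) >> shift;
    return nsec;
}

int main(void)
{
    /* illustrative pair: mult = 2^22, shift = 22 gives exactly 1 ns/cycle */
    uint32_t mult = 4194304, shift = 22;

    printf("%" PRIu64 " ns\n", cycles_to_ns_safe(123456789ULL, mult, shift));
    return 0;
}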
@@ -1272,9 +1430,7 @@ void timekeeping_resume(void) | |||
1272 | 1430 | ||
1273 | touch_softlockup_watchdog(); | 1431 | touch_softlockup_watchdog(); |
1274 | 1432 | ||
1275 | clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL); | 1433 | tick_resume(); |
1276 | |||
1277 | /* Resume hrtimers */ | ||
1278 | hrtimers_resume(); | 1434 | hrtimers_resume(); |
1279 | } | 1435 | } |
1280 | 1436 | ||
@@ -1284,10 +1440,8 @@ int timekeeping_suspend(void) | |||
1284 | unsigned long flags; | 1440 | unsigned long flags; |
1285 | struct timespec64 delta, delta_delta; | 1441 | struct timespec64 delta, delta_delta; |
1286 | static struct timespec64 old_delta; | 1442 | static struct timespec64 old_delta; |
1287 | struct timespec tmp; | ||
1288 | 1443 | ||
1289 | read_persistent_clock(&tmp); | 1444 | read_persistent_clock64(&timekeeping_suspend_time); |
1290 | timekeeping_suspend_time = timespec_to_timespec64(tmp); | ||
1291 | 1445 | ||
1292 | /* | 1446 | /* |
1293 | * On some systems the persistent_clock can not be detected at | 1447 | * On some systems the persistent_clock can not be detected at |
@@ -1295,31 +1449,33 @@ int timekeeping_suspend(void) | |||
1295 | * value returned, update the persistent_clock_exists flag. | 1449 | * value returned, update the persistent_clock_exists flag. |
1296 | */ | 1450 | */ |
1297 | if (timekeeping_suspend_time.tv_sec || timekeeping_suspend_time.tv_nsec) | 1451 | if (timekeeping_suspend_time.tv_sec || timekeeping_suspend_time.tv_nsec) |
1298 | persistent_clock_exist = true; | 1452 | persistent_clock_exists = true; |
1299 | 1453 | ||
1300 | raw_spin_lock_irqsave(&timekeeper_lock, flags); | 1454 | raw_spin_lock_irqsave(&timekeeper_lock, flags); |
1301 | write_seqcount_begin(&tk_core.seq); | 1455 | write_seqcount_begin(&tk_core.seq); |
1302 | timekeeping_forward_now(tk); | 1456 | timekeeping_forward_now(tk); |
1303 | timekeeping_suspended = 1; | 1457 | timekeeping_suspended = 1; |
1304 | 1458 | ||
1305 | /* | 1459 | if (persistent_clock_exists) { |
1306 | * To avoid drift caused by repeated suspend/resumes, | ||
1307 | * which each can add ~1 second drift error, | ||
1308 | * try to compensate so the difference in system time | ||
1309 | * and persistent_clock time stays close to constant. | ||
1310 | */ | ||
1311 | delta = timespec64_sub(tk_xtime(tk), timekeeping_suspend_time); | ||
1312 | delta_delta = timespec64_sub(delta, old_delta); | ||
1313 | if (abs(delta_delta.tv_sec) >= 2) { | ||
1314 | /* | 1460 | /* |
1315 | * if delta_delta is too large, assume time correction | 1461 | * To avoid drift caused by repeated suspend/resumes, |
1316 | * has occured and set old_delta to the current delta. | 1462 | * which each can add ~1 second drift error, |
1463 | * try to compensate so the difference in system time | ||
1464 | * and persistent_clock time stays close to constant. | ||
1317 | */ | 1465 | */ |
1318 | old_delta = delta; | 1466 | delta = timespec64_sub(tk_xtime(tk), timekeeping_suspend_time); |
1319 | } else { | 1467 | delta_delta = timespec64_sub(delta, old_delta); |
1320 | /* Otherwise try to adjust old_system to compensate */ | 1468 | if (abs(delta_delta.tv_sec) >= 2) { |
1321 | timekeeping_suspend_time = | 1469 | /* |
1322 | timespec64_add(timekeeping_suspend_time, delta_delta); | 1470 | * if delta_delta is too large, assume time correction |
1471 | * has occurred and set old_delta to the current delta. | ||
1472 | */ | ||
1473 | old_delta = delta; | ||
1474 | } else { | ||
1475 | /* Otherwise try to adjust old_system to compensate */ | ||
1476 | timekeeping_suspend_time = | ||
1477 | timespec64_add(timekeeping_suspend_time, delta_delta); | ||
1478 | } | ||
1323 | } | 1479 | } |
1324 | 1480 | ||
1325 | timekeeping_update(tk, TK_MIRROR); | 1481 | timekeeping_update(tk, TK_MIRROR); |
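The compensation block above exists because the persistent clock usually has only one-second resolution, so every suspend/resume cycle can shave up to a second off. Below is a minimal userspace model of that bookkeeping, using a signed nanosecond count instead of timespec64; the function and variable names are stand-ins, not the kernel's.

/* Model of the suspend-time drift compensation: keep the difference
 * between system time and persistent-clock time close to constant so
 * the truncation error does not accumulate across suspend cycles. */
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>

static int64_t old_delta_ns;   /* remembered system-vs-persistent offset */

static int64_t compensate(int64_t xtime_ns, int64_t persistent_ns)
{
    int64_t delta = xtime_ns - persistent_ns;
    int64_t delta_delta = delta - old_delta_ns;

    if (llabs(delta_delta) >= 2000000000LL) {
        /* big jump: assume settimeofday()/NTP stepped the clock */
        old_delta_ns = delta;
        return persistent_ns;
    }
    /* small jitter: fold it back into the recorded suspend time */
    return persistent_ns + delta_delta;
}

int main(void)
{
    /* persistent clock truncates to seconds; system time is 0.4s ahead */
    int64_t adjusted = compensate(10400000000LL, 10000000000LL);

    printf("adjusted suspend time: %lld ns\n", (long long)adjusted);
    return 0;
}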
@@ -1327,7 +1483,7 @@ int timekeeping_suspend(void) | |||
1327 | write_seqcount_end(&tk_core.seq); | 1483 | write_seqcount_end(&tk_core.seq); |
1328 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); | 1484 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); |
1329 | 1485 | ||
1330 | clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); | 1486 | tick_suspend(); |
1331 | clocksource_suspend(); | 1487 | clocksource_suspend(); |
1332 | clockevents_suspend(); | 1488 | clockevents_suspend(); |
1333 | 1489 | ||
@@ -1416,15 +1572,15 @@ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk, | |||
1416 | * | 1572 | * |
1417 | * XXX - TODO: Doc ntp_error calculation. | 1573 | * XXX - TODO: Doc ntp_error calculation. |
1418 | */ | 1574 | */ |
1419 | if ((mult_adj > 0) && (tk->tkr.mult + mult_adj < mult_adj)) { | 1575 | if ((mult_adj > 0) && (tk->tkr_mono.mult + mult_adj < mult_adj)) { |
1420 | /* NTP adjustment caused clocksource mult overflow */ | 1576 | /* NTP adjustment caused clocksource mult overflow */ |
1421 | WARN_ON_ONCE(1); | 1577 | WARN_ON_ONCE(1); |
1422 | return; | 1578 | return; |
1423 | } | 1579 | } |
1424 | 1580 | ||
1425 | tk->tkr.mult += mult_adj; | 1581 | tk->tkr_mono.mult += mult_adj; |
1426 | tk->xtime_interval += interval; | 1582 | tk->xtime_interval += interval; |
1427 | tk->tkr.xtime_nsec -= offset; | 1583 | tk->tkr_mono.xtime_nsec -= offset; |
1428 | tk->ntp_error -= (interval - offset) << tk->ntp_error_shift; | 1584 | tk->ntp_error -= (interval - offset) << tk->ntp_error_shift; |
1429 | } | 1585 | } |
1430 | 1586 | ||
@@ -1486,13 +1642,13 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) | |||
1486 | tk->ntp_err_mult = 0; | 1642 | tk->ntp_err_mult = 0; |
1487 | } | 1643 | } |
1488 | 1644 | ||
1489 | if (unlikely(tk->tkr.clock->maxadj && | 1645 | if (unlikely(tk->tkr_mono.clock->maxadj && |
1490 | (abs(tk->tkr.mult - tk->tkr.clock->mult) | 1646 | (abs(tk->tkr_mono.mult - tk->tkr_mono.clock->mult) |
1491 | > tk->tkr.clock->maxadj))) { | 1647 | > tk->tkr_mono.clock->maxadj))) { |
1492 | printk_once(KERN_WARNING | 1648 | printk_once(KERN_WARNING |
1493 | "Adjusting %s more than 11%% (%ld vs %ld)\n", | 1649 | "Adjusting %s more than 11%% (%ld vs %ld)\n", |
1494 | tk->tkr.clock->name, (long)tk->tkr.mult, | 1650 | tk->tkr_mono.clock->name, (long)tk->tkr_mono.mult, |
1495 | (long)tk->tkr.clock->mult + tk->tkr.clock->maxadj); | 1651 | (long)tk->tkr_mono.clock->mult + tk->tkr_mono.clock->maxadj); |
1496 | } | 1652 | } |
1497 | 1653 | ||
1498 | /* | 1654 | /* |
@@ -1509,9 +1665,9 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) | |||
1509 | * We'll correct this error next time through this function, when | 1665 | * We'll correct this error next time through this function, when |
1510 | * xtime_nsec is not as small. | 1666 | * xtime_nsec is not as small. |
1511 | */ | 1667 | */ |
1512 | if (unlikely((s64)tk->tkr.xtime_nsec < 0)) { | 1668 | if (unlikely((s64)tk->tkr_mono.xtime_nsec < 0)) { |
1513 | s64 neg = -(s64)tk->tkr.xtime_nsec; | 1669 | s64 neg = -(s64)tk->tkr_mono.xtime_nsec; |
1514 | tk->tkr.xtime_nsec = 0; | 1670 | tk->tkr_mono.xtime_nsec = 0; |
1515 | tk->ntp_error += neg << tk->ntp_error_shift; | 1671 | tk->ntp_error += neg << tk->ntp_error_shift; |
1516 | } | 1672 | } |
1517 | } | 1673 | } |
@@ -1526,13 +1682,13 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) | |||
1526 | */ | 1682 | */ |
1527 | static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) | 1683 | static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) |
1528 | { | 1684 | { |
1529 | u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr.shift; | 1685 | u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr_mono.shift; |
1530 | unsigned int clock_set = 0; | 1686 | unsigned int clock_set = 0; |
1531 | 1687 | ||
1532 | while (tk->tkr.xtime_nsec >= nsecps) { | 1688 | while (tk->tkr_mono.xtime_nsec >= nsecps) { |
1533 | int leap; | 1689 | int leap; |
1534 | 1690 | ||
1535 | tk->tkr.xtime_nsec -= nsecps; | 1691 | tk->tkr_mono.xtime_nsec -= nsecps; |
1536 | tk->xtime_sec++; | 1692 | tk->xtime_sec++; |
1537 | 1693 | ||
1538 | /* Figure out if it's a leap sec and apply if needed */ | 1694 | /* Figure out if it's a leap sec and apply if needed */ |
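accumulate_nsecs_to_secs() works on xtime_nsec, which is kept as nanoseconds shifted left by tkr_mono.shift so sub-nanosecond remainders are not lost between ticks. A compact userspace model of that bookkeeping follows; leap-second handling and the real timekeeper layout are omitted, and the struct and field names are illustrative.

/* Whole seconds are peeled off once the shifted value crosses
 * NSEC_PER_SEC << shift. */
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

struct tk_model {
    uint64_t xtime_sec;
    uint64_t xtime_nsec;   /* shifted nanoseconds */
    uint32_t shift;
};

static unsigned int accumulate_secs(struct tk_model *tk)
{
    uint64_t nsecps = NSEC_PER_SEC << tk->shift;
    unsigned int secs = 0;

    while (tk->xtime_nsec >= nsecps) {
        tk->xtime_nsec -= nsecps;
        tk->xtime_sec++;
        secs++;
    }
    return secs;
}

int main(void)
{
    struct tk_model tk = { .xtime_sec = 0, .xtime_nsec = 0, .shift = 8 };

    tk.xtime_nsec += (2 * NSEC_PER_SEC + 250000000ULL) << tk.shift;
    printf("accumulated %u s, remainder %llu shifted ns\n",
           accumulate_secs(&tk), (unsigned long long)tk.xtime_nsec);
    return 0;
}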
@@ -1577,9 +1733,10 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, | |||
1577 | 1733 | ||
1578 | /* Accumulate one shifted interval */ | 1734 | /* Accumulate one shifted interval */ |
1579 | offset -= interval; | 1735 | offset -= interval; |
1580 | tk->tkr.cycle_last += interval; | 1736 | tk->tkr_mono.cycle_last += interval; |
1737 | tk->tkr_raw.cycle_last += interval; | ||
1581 | 1738 | ||
1582 | tk->tkr.xtime_nsec += tk->xtime_interval << shift; | 1739 | tk->tkr_mono.xtime_nsec += tk->xtime_interval << shift; |
1583 | *clock_set |= accumulate_nsecs_to_secs(tk); | 1740 | *clock_set |= accumulate_nsecs_to_secs(tk); |
1584 | 1741 | ||
1585 | /* Accumulate raw time */ | 1742 | /* Accumulate raw time */ |
@@ -1622,14 +1779,17 @@ void update_wall_time(void) | |||
1622 | #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET | 1779 | #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET |
1623 | offset = real_tk->cycle_interval; | 1780 | offset = real_tk->cycle_interval; |
1624 | #else | 1781 | #else |
1625 | offset = clocksource_delta(tk->tkr.read(tk->tkr.clock), | 1782 | offset = clocksource_delta(tk->tkr_mono.read(tk->tkr_mono.clock), |
1626 | tk->tkr.cycle_last, tk->tkr.mask); | 1783 | tk->tkr_mono.cycle_last, tk->tkr_mono.mask); |
1627 | #endif | 1784 | #endif |
1628 | 1785 | ||
1629 | /* Check if there's really nothing to do */ | 1786 | /* Check if there's really nothing to do */ |
1630 | if (offset < real_tk->cycle_interval) | 1787 | if (offset < real_tk->cycle_interval) |
1631 | goto out; | 1788 | goto out; |
1632 | 1789 | ||
1790 | /* Do some additional sanity checking */ | ||
1791 | timekeeping_check_update(real_tk, offset); | ||
1792 | |||
1633 | /* | 1793 | /* |
1634 | * With NO_HZ we may have to accumulate many cycle_intervals | 1794 | * With NO_HZ we may have to accumulate many cycle_intervals |
1635 | * (think "ticks") worth of time at once. To do this efficiently, | 1795 | * (think "ticks") worth of time at once. To do this efficiently, |
@@ -1784,8 +1944,8 @@ ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real, ktime_t *offs_boot, | |||
1784 | do { | 1944 | do { |
1785 | seq = read_seqcount_begin(&tk_core.seq); | 1945 | seq = read_seqcount_begin(&tk_core.seq); |
1786 | 1946 | ||
1787 | base = tk->tkr.base_mono; | 1947 | base = tk->tkr_mono.base; |
1788 | nsecs = tk->tkr.xtime_nsec >> tk->tkr.shift; | 1948 | nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; |
1789 | 1949 | ||
1790 | *offs_real = tk->offs_real; | 1950 | *offs_real = tk->offs_real; |
1791 | *offs_boot = tk->offs_boot; | 1951 | *offs_boot = tk->offs_boot; |
@@ -1816,8 +1976,8 @@ ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot, | |||
1816 | do { | 1976 | do { |
1817 | seq = read_seqcount_begin(&tk_core.seq); | 1977 | seq = read_seqcount_begin(&tk_core.seq); |
1818 | 1978 | ||
1819 | base = tk->tkr.base_mono; | 1979 | base = tk->tkr_mono.base; |
1820 | nsecs = timekeeping_get_ns(&tk->tkr); | 1980 | nsecs = timekeeping_get_ns(&tk->tkr_mono); |
1821 | 1981 | ||
1822 | *offs_real = tk->offs_real; | 1982 | *offs_real = tk->offs_real; |
1823 | *offs_boot = tk->offs_boot; | 1983 | *offs_boot = tk->offs_boot; |
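Both ktime_get_update_offsets_*() hunks read tkr_mono.base and xtime_nsec inside a seqcount retry loop so a concurrent update never yields a torn pair. The sketch below models that reader/writer pattern with C11 atomics in userspace; it is an approximation of the idea, not the kernel's seqcount_t implementation, and the memory ordering is kept deliberately simple.

/* Minimal seqcount-style retry loop: the counter is odd while a writer
 * is active, and readers retry if the counter changed under them. */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic unsigned int seq;        /* odd while a writer is active */
static uint64_t base_ns, xtime_nsec;    /* the protected data */

static unsigned int read_begin(void)
{
    unsigned int s;

    while ((s = atomic_load_explicit(&seq, memory_order_acquire)) & 1)
        ;                               /* writer in progress: spin */
    return s;
}

static int read_retry(unsigned int s)
{
    atomic_thread_fence(memory_order_acquire);
    return atomic_load_explicit(&seq, memory_order_relaxed) != s;
}

static void write_begin(void) { atomic_fetch_add(&seq, 1); }
static void write_end(void)   { atomic_fetch_add(&seq, 1); }

int main(void)
{
    uint64_t b, n;
    unsigned int s;

    write_begin();
    base_ns = 1000;
    xtime_nsec = 42;
    write_end();

    do {
        s = read_begin();
        b = base_ns;
        n = xtime_nsec;
    } while (read_retry(s));

    printf("base=%llu nsec=%llu\n", (unsigned long long)b,
           (unsigned long long)n);
    return 0;
}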
diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h index 1d91416055d5..ead8794b9a4e 100644 --- a/kernel/time/timekeeping.h +++ b/kernel/time/timekeeping.h | |||
@@ -19,4 +19,11 @@ extern void timekeeping_clocktai(struct timespec *ts); | |||
19 | extern int timekeeping_suspend(void); | 19 | extern int timekeeping_suspend(void); |
20 | extern void timekeeping_resume(void); | 20 | extern void timekeeping_resume(void); |
21 | 21 | ||
22 | extern void do_timer(unsigned long ticks); | ||
23 | extern void update_wall_time(void); | ||
24 | |||
25 | extern seqlock_t jiffies_lock; | ||
26 | |||
27 | #define CS_NAME_LEN 32 | ||
28 | |||
22 | #endif | 29 | #endif |
diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 2d3f5c504939..2ece3aa5069c 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c | |||
@@ -90,8 +90,18 @@ struct tvec_base { | |||
90 | struct tvec tv5; | 90 | struct tvec tv5; |
91 | } ____cacheline_aligned; | 91 | } ____cacheline_aligned; |
92 | 92 | ||
93 | /* | ||
94 | * __TIMER_INITIALIZER() needs to set ->base to a valid pointer (because we've | ||
95 | * made NULL special, hint: lock_timer_base()) and we cannot get a compile time | ||
96 | * pointer to per-cpu entries because we don't know where we'll map the section, | ||
97 | * even for the boot cpu. | ||
98 | * | ||
99 | * And so we use boot_tvec_bases for boot CPU and per-cpu __tvec_bases for the | ||
100 | * rest of them. | ||
101 | */ | ||
93 | struct tvec_base boot_tvec_bases; | 102 | struct tvec_base boot_tvec_bases; |
94 | EXPORT_SYMBOL(boot_tvec_bases); | 103 | EXPORT_SYMBOL(boot_tvec_bases); |
104 | |||
95 | static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; | 105 | static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; |
96 | 106 | ||
97 | /* Functions below help us manage 'deferrable' flag */ | 107 | /* Functions below help us manage 'deferrable' flag */ |
@@ -1027,6 +1037,8 @@ int try_to_del_timer_sync(struct timer_list *timer) | |||
1027 | EXPORT_SYMBOL(try_to_del_timer_sync); | 1037 | EXPORT_SYMBOL(try_to_del_timer_sync); |
1028 | 1038 | ||
1029 | #ifdef CONFIG_SMP | 1039 | #ifdef CONFIG_SMP |
1040 | static DEFINE_PER_CPU(struct tvec_base, __tvec_bases); | ||
1041 | |||
1030 | /** | 1042 | /** |
1031 | * del_timer_sync - deactivate a timer and wait for the handler to finish. | 1043 | * del_timer_sync - deactivate a timer and wait for the handler to finish. |
1032 | * @timer: the timer to be deactivated | 1044 | * @timer: the timer to be deactivated |
@@ -1532,64 +1544,6 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout) | |||
1532 | } | 1544 | } |
1533 | EXPORT_SYMBOL(schedule_timeout_uninterruptible); | 1545 | EXPORT_SYMBOL(schedule_timeout_uninterruptible); |
1534 | 1546 | ||
1535 | static int init_timers_cpu(int cpu) | ||
1536 | { | ||
1537 | int j; | ||
1538 | struct tvec_base *base; | ||
1539 | static char tvec_base_done[NR_CPUS]; | ||
1540 | |||
1541 | if (!tvec_base_done[cpu]) { | ||
1542 | static char boot_done; | ||
1543 | |||
1544 | if (boot_done) { | ||
1545 | /* | ||
1546 | * The APs use this path later in boot | ||
1547 | */ | ||
1548 | base = kzalloc_node(sizeof(*base), GFP_KERNEL, | ||
1549 | cpu_to_node(cpu)); | ||
1550 | if (!base) | ||
1551 | return -ENOMEM; | ||
1552 | |||
1553 | /* Make sure tvec_base has TIMER_FLAG_MASK bits free */ | ||
1554 | if (WARN_ON(base != tbase_get_base(base))) { | ||
1555 | kfree(base); | ||
1556 | return -ENOMEM; | ||
1557 | } | ||
1558 | per_cpu(tvec_bases, cpu) = base; | ||
1559 | } else { | ||
1560 | /* | ||
1561 | * This is for the boot CPU - we use compile-time | ||
1562 | * static initialisation because per-cpu memory isn't | ||
1563 | * ready yet and because the memory allocators are not | ||
1564 | * initialised either. | ||
1565 | */ | ||
1566 | boot_done = 1; | ||
1567 | base = &boot_tvec_bases; | ||
1568 | } | ||
1569 | spin_lock_init(&base->lock); | ||
1570 | tvec_base_done[cpu] = 1; | ||
1571 | base->cpu = cpu; | ||
1572 | } else { | ||
1573 | base = per_cpu(tvec_bases, cpu); | ||
1574 | } | ||
1575 | |||
1576 | |||
1577 | for (j = 0; j < TVN_SIZE; j++) { | ||
1578 | INIT_LIST_HEAD(base->tv5.vec + j); | ||
1579 | INIT_LIST_HEAD(base->tv4.vec + j); | ||
1580 | INIT_LIST_HEAD(base->tv3.vec + j); | ||
1581 | INIT_LIST_HEAD(base->tv2.vec + j); | ||
1582 | } | ||
1583 | for (j = 0; j < TVR_SIZE; j++) | ||
1584 | INIT_LIST_HEAD(base->tv1.vec + j); | ||
1585 | |||
1586 | base->timer_jiffies = jiffies; | ||
1587 | base->next_timer = base->timer_jiffies; | ||
1588 | base->active_timers = 0; | ||
1589 | base->all_timers = 0; | ||
1590 | return 0; | ||
1591 | } | ||
1592 | |||
1593 | #ifdef CONFIG_HOTPLUG_CPU | 1547 | #ifdef CONFIG_HOTPLUG_CPU |
1594 | static void migrate_timer_list(struct tvec_base *new_base, struct list_head *head) | 1548 | static void migrate_timer_list(struct tvec_base *new_base, struct list_head *head) |
1595 | { | 1549 | { |
@@ -1631,55 +1585,86 @@ static void migrate_timers(int cpu) | |||
1631 | migrate_timer_list(new_base, old_base->tv5.vec + i); | 1585 | migrate_timer_list(new_base, old_base->tv5.vec + i); |
1632 | } | 1586 | } |
1633 | 1587 | ||
1588 | old_base->active_timers = 0; | ||
1589 | old_base->all_timers = 0; | ||
1590 | |||
1634 | spin_unlock(&old_base->lock); | 1591 | spin_unlock(&old_base->lock); |
1635 | spin_unlock_irq(&new_base->lock); | 1592 | spin_unlock_irq(&new_base->lock); |
1636 | put_cpu_var(tvec_bases); | 1593 | put_cpu_var(tvec_bases); |
1637 | } | 1594 | } |
1638 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
1639 | 1595 | ||
1640 | static int timer_cpu_notify(struct notifier_block *self, | 1596 | static int timer_cpu_notify(struct notifier_block *self, |
1641 | unsigned long action, void *hcpu) | 1597 | unsigned long action, void *hcpu) |
1642 | { | 1598 | { |
1643 | long cpu = (long)hcpu; | 1599 | switch (action) { |
1644 | int err; | ||
1645 | |||
1646 | switch(action) { | ||
1647 | case CPU_UP_PREPARE: | ||
1648 | case CPU_UP_PREPARE_FROZEN: | ||
1649 | err = init_timers_cpu(cpu); | ||
1650 | if (err < 0) | ||
1651 | return notifier_from_errno(err); | ||
1652 | break; | ||
1653 | #ifdef CONFIG_HOTPLUG_CPU | ||
1654 | case CPU_DEAD: | 1600 | case CPU_DEAD: |
1655 | case CPU_DEAD_FROZEN: | 1601 | case CPU_DEAD_FROZEN: |
1656 | migrate_timers(cpu); | 1602 | migrate_timers((long)hcpu); |
1657 | break; | 1603 | break; |
1658 | #endif | ||
1659 | default: | 1604 | default: |
1660 | break; | 1605 | break; |
1661 | } | 1606 | } |
1607 | |||
1662 | return NOTIFY_OK; | 1608 | return NOTIFY_OK; |
1663 | } | 1609 | } |
1664 | 1610 | ||
1665 | static struct notifier_block timers_nb = { | 1611 | static inline void timer_register_cpu_notifier(void) |
1666 | .notifier_call = timer_cpu_notify, | 1612 | { |
1667 | }; | 1613 | cpu_notifier(timer_cpu_notify, 0); |
1614 | } | ||
1615 | #else | ||
1616 | static inline void timer_register_cpu_notifier(void) { } | ||
1617 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
1668 | 1618 | ||
1619 | static void __init init_timer_cpu(struct tvec_base *base, int cpu) | ||
1620 | { | ||
1621 | int j; | ||
1669 | 1622 | ||
1670 | void __init init_timers(void) | 1623 | BUG_ON(base != tbase_get_base(base)); |
1624 | |||
1625 | base->cpu = cpu; | ||
1626 | per_cpu(tvec_bases, cpu) = base; | ||
1627 | spin_lock_init(&base->lock); | ||
1628 | |||
1629 | for (j = 0; j < TVN_SIZE; j++) { | ||
1630 | INIT_LIST_HEAD(base->tv5.vec + j); | ||
1631 | INIT_LIST_HEAD(base->tv4.vec + j); | ||
1632 | INIT_LIST_HEAD(base->tv3.vec + j); | ||
1633 | INIT_LIST_HEAD(base->tv2.vec + j); | ||
1634 | } | ||
1635 | for (j = 0; j < TVR_SIZE; j++) | ||
1636 | INIT_LIST_HEAD(base->tv1.vec + j); | ||
1637 | |||
1638 | base->timer_jiffies = jiffies; | ||
1639 | base->next_timer = base->timer_jiffies; | ||
1640 | } | ||
1641 | |||
1642 | static void __init init_timer_cpus(void) | ||
1671 | { | 1643 | { |
1672 | int err; | 1644 | struct tvec_base *base; |
1645 | int local_cpu = smp_processor_id(); | ||
1646 | int cpu; | ||
1673 | 1647 | ||
1648 | for_each_possible_cpu(cpu) { | ||
1649 | if (cpu == local_cpu) | ||
1650 | base = &boot_tvec_bases; | ||
1651 | #ifdef CONFIG_SMP | ||
1652 | else | ||
1653 | base = per_cpu_ptr(&__tvec_bases, cpu); | ||
1654 | #endif | ||
1655 | |||
1656 | init_timer_cpu(base, cpu); | ||
1657 | } | ||
1658 | } | ||
1659 | |||
1660 | void __init init_timers(void) | ||
1661 | { | ||
1674 | /* ensure there are enough low bits for flags in timer->base pointer */ | 1662 | /* ensure there are enough low bits for flags in timer->base pointer */ |
1675 | BUILD_BUG_ON(__alignof__(struct tvec_base) & TIMER_FLAG_MASK); | 1663 | BUILD_BUG_ON(__alignof__(struct tvec_base) & TIMER_FLAG_MASK); |
1676 | 1664 | ||
1677 | err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, | 1665 | init_timer_cpus(); |
1678 | (void *)(long)smp_processor_id()); | ||
1679 | BUG_ON(err != NOTIFY_OK); | ||
1680 | |||
1681 | init_timer_stats(); | 1666 | init_timer_stats(); |
1682 | register_cpu_notifier(&timers_nb); | 1667 | timer_register_cpu_notifier(); |
1683 | open_softirq(TIMER_SOFTIRQ, run_timer_softirq); | 1668 | open_softirq(TIMER_SOFTIRQ, run_timer_softirq); |
1684 | } | 1669 | } |
1685 | 1670 | ||
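The rework above replaces the old CPU_UP_PREPARE allocation with static storage: the boot CPU keeps boot_tvec_bases and every other CPU uses its slot in the per-cpu __tvec_bases. The selection loop is modeled below in userspace for illustration, with a simulated per-cpu array and an invented NR_CPUS; the struct layout is a stand-in, not the real tvec_base.

#include <stdio.h>

#define NR_CPUS 4

struct tvec_base_model { int cpu; };

static struct tvec_base_model boot_base;               /* like boot_tvec_bases */
static struct tvec_base_model percpu_bases[NR_CPUS];   /* like __tvec_bases */
static struct tvec_base_model *tvec_bases[NR_CPUS];    /* like the per-cpu ptr */

static void init_timer_cpus_model(int boot_cpu)
{
    for (int cpu = 0; cpu < NR_CPUS; cpu++) {
        /* boot CPU keeps the static base, others get their per-cpu slot */
        struct tvec_base_model *base =
            (cpu == boot_cpu) ? &boot_base : &percpu_bases[cpu];

        base->cpu = cpu;
        tvec_bases[cpu] = base;
    }
}

int main(void)
{
    init_timer_cpus_model(0);
    for (int cpu = 0; cpu < NR_CPUS; cpu++)
        printf("cpu%d -> %s base\n", cpu,
               tvec_bases[cpu] == &boot_base ? "boot" : "per-cpu");
    return 0;
}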
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 61ed862cdd37..e878c2e0ba45 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c | |||
@@ -16,10 +16,10 @@ | |||
16 | #include <linux/sched.h> | 16 | #include <linux/sched.h> |
17 | #include <linux/seq_file.h> | 17 | #include <linux/seq_file.h> |
18 | #include <linux/kallsyms.h> | 18 | #include <linux/kallsyms.h> |
19 | #include <linux/tick.h> | ||
20 | 19 | ||
21 | #include <asm/uaccess.h> | 20 | #include <asm/uaccess.h> |
22 | 21 | ||
22 | #include "tick-internal.h" | ||
23 | 23 | ||
24 | struct timer_list_iter { | 24 | struct timer_list_iter { |
25 | int cpu; | 25 | int cpu; |
@@ -228,9 +228,35 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) | |||
228 | print_name_offset(m, dev->set_next_event); | 228 | print_name_offset(m, dev->set_next_event); |
229 | SEQ_printf(m, "\n"); | 229 | SEQ_printf(m, "\n"); |
230 | 230 | ||
231 | SEQ_printf(m, " set_mode: "); | 231 | if (dev->set_mode) { |
232 | print_name_offset(m, dev->set_mode); | 232 | SEQ_printf(m, " set_mode: "); |
233 | SEQ_printf(m, "\n"); | 233 | print_name_offset(m, dev->set_mode); |
234 | SEQ_printf(m, "\n"); | ||
235 | } else { | ||
236 | if (dev->set_state_shutdown) { | ||
237 | SEQ_printf(m, " shutdown: "); | ||
238 | print_name_offset(m, dev->set_state_shutdown); | ||
239 | SEQ_printf(m, "\n"); | ||
240 | } | ||
241 | |||
242 | if (dev->set_state_periodic) { | ||
243 | SEQ_printf(m, " periodic: "); | ||
244 | print_name_offset(m, dev->set_state_periodic); | ||
245 | SEQ_printf(m, "\n"); | ||
246 | } | ||
247 | |||
248 | if (dev->set_state_oneshot) { | ||
249 | SEQ_printf(m, " oneshot: "); | ||
250 | print_name_offset(m, dev->set_state_oneshot); | ||
251 | SEQ_printf(m, "\n"); | ||
252 | } | ||
253 | |||
254 | if (dev->tick_resume) { | ||
255 | SEQ_printf(m, " resume: "); | ||
256 | print_name_offset(m, dev->tick_resume); | ||
257 | SEQ_printf(m, "\n"); | ||
258 | } | ||
259 | } | ||
234 | 260 | ||
235 | SEQ_printf(m, " event_handler: "); | 261 | SEQ_printf(m, " event_handler: "); |
236 | print_name_offset(m, dev->event_handler); | 262 | print_name_offset(m, dev->event_handler); |
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index a5da09c899dd..3b9a48ae153a 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig | |||
@@ -432,6 +432,14 @@ config UPROBE_EVENT | |||
432 | This option is required if you plan to use perf-probe subcommand | 432 | This option is required if you plan to use perf-probe subcommand |
433 | of perf tools on user space applications. | 433 | of perf tools on user space applications. |
434 | 434 | ||
435 | config BPF_EVENTS | ||
436 | depends on BPF_SYSCALL | ||
437 | depends on KPROBE_EVENT | ||
438 | bool | ||
439 | default y | ||
440 | help | ||
441 | This allows the user to attach BPF programs to kprobe events. | ||
442 | |||
435 | config PROBE_EVENTS | 443 | config PROBE_EVENTS |
436 | def_bool n | 444 | def_bool n |
437 | 445 | ||
@@ -599,6 +607,34 @@ config RING_BUFFER_STARTUP_TEST | |||
599 | 607 | ||
600 | If unsure, say N | 608 | If unsure, say N |
601 | 609 | ||
610 | config TRACE_ENUM_MAP_FILE | ||
611 | bool "Show enum mappings for trace events" | ||
612 | depends on TRACING | ||
613 | help | ||
614 | The "print fmt" of the trace events will show the enum names instead | ||
615 | of their values. This can cause problems for user space tools that | ||
616 | use this string to parse the raw data as user space does not know | ||
617 | how to convert the string to its value. | ||
618 | |||
619 | To fix this, there's a special macro in the kernel that can be used | ||
620 | to convert the enum into its value. If this macro is used, then the | ||
621 | print fmt strings will have the enums converted to their values. | ||
622 | |||
623 | If something does not get converted properly, this option can be | ||
624 | used to show what enums the kernel tried to convert. | ||
625 | |||
626 | This option is for debugging the enum conversions. A file is created | ||
627 | in the tracing directory called "enum_map" that will show the enum | ||
628 | names matched with their values and what trace event system they | ||
629 | belong to. | ||
630 | |||
631 | Normally, the mapping of the strings to values will be freed after | ||
632 | boot up or module load. With this option, they will not be freed, as | ||
633 | they are needed for the "enum_map" file. Enabling this option will | ||
634 | increase the memory footprint of the running kernel. | ||
635 | |||
636 | If unsure, say N | ||
637 | |||
602 | endif # FTRACE | 638 | endif # FTRACE |
603 | 639 | ||
604 | endif # TRACING_SUPPORT | 640 | endif # TRACING_SUPPORT |
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 98f26588255e..9b1044e936a6 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile | |||
@@ -53,6 +53,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o | |||
53 | endif | 53 | endif |
54 | obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o | 54 | obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o |
55 | obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o | 55 | obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o |
56 | obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o | ||
56 | obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o | 57 | obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o |
57 | obj-$(CONFIG_TRACEPOINTS) += power-traces.o | 58 | obj-$(CONFIG_TRACEPOINTS) += power-traces.o |
58 | ifeq ($(CONFIG_PM),y) | 59 | ifeq ($(CONFIG_PM),y) |
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c new file mode 100644 index 000000000000..2d56ce501632 --- /dev/null +++ b/kernel/trace/bpf_trace.c | |||
@@ -0,0 +1,222 @@ | |||
1 | /* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com | ||
2 | * | ||
3 | * This program is free software; you can redistribute it and/or | ||
4 | * modify it under the terms of version 2 of the GNU General Public | ||
5 | * License as published by the Free Software Foundation. | ||
6 | */ | ||
7 | #include <linux/kernel.h> | ||
8 | #include <linux/types.h> | ||
9 | #include <linux/slab.h> | ||
10 | #include <linux/bpf.h> | ||
11 | #include <linux/filter.h> | ||
12 | #include <linux/uaccess.h> | ||
13 | #include <linux/ctype.h> | ||
14 | #include "trace.h" | ||
15 | |||
16 | static DEFINE_PER_CPU(int, bpf_prog_active); | ||
17 | |||
18 | /** | ||
19 | * trace_call_bpf - invoke BPF program | ||
20 | * @prog: BPF program | ||
21 | * @ctx: opaque context pointer | ||
22 | * | ||
23 | * kprobe handlers execute BPF programs via this helper. | ||
24 | * Can be used from static tracepoints in the future. | ||
25 | * | ||
26 | * Return: BPF programs always return an integer which is interpreted by | ||
27 | * kprobe handler as: | ||
28 | * 0 - return from kprobe (event is filtered out) | ||
29 | * 1 - store kprobe event into ring buffer | ||
30 | * Other values are reserved and currently alias to 1 | ||
31 | */ | ||
32 | unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx) | ||
33 | { | ||
34 | unsigned int ret; | ||
35 | |||
36 | if (in_nmi()) /* not supported yet */ | ||
37 | return 1; | ||
38 | |||
39 | preempt_disable(); | ||
40 | |||
41 | if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) { | ||
42 | /* | ||
43 | * since some bpf program is already running on this cpu, | ||
44 | * don't call into another bpf program (same or different) | ||
45 | * and don't send kprobe event into ring-buffer, | ||
46 | * so return zero here | ||
47 | */ | ||
48 | ret = 0; | ||
49 | goto out; | ||
50 | } | ||
51 | |||
52 | rcu_read_lock(); | ||
53 | ret = BPF_PROG_RUN(prog, ctx); | ||
54 | rcu_read_unlock(); | ||
55 | |||
56 | out: | ||
57 | __this_cpu_dec(bpf_prog_active); | ||
58 | preempt_enable(); | ||
59 | |||
60 | return ret; | ||
61 | } | ||
62 | EXPORT_SYMBOL_GPL(trace_call_bpf); | ||
63 | |||
64 | static u64 bpf_probe_read(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | ||
65 | { | ||
66 | void *dst = (void *) (long) r1; | ||
67 | int size = (int) r2; | ||
68 | void *unsafe_ptr = (void *) (long) r3; | ||
69 | |||
70 | return probe_kernel_read(dst, unsafe_ptr, size); | ||
71 | } | ||
72 | |||
73 | static const struct bpf_func_proto bpf_probe_read_proto = { | ||
74 | .func = bpf_probe_read, | ||
75 | .gpl_only = true, | ||
76 | .ret_type = RET_INTEGER, | ||
77 | .arg1_type = ARG_PTR_TO_STACK, | ||
78 | .arg2_type = ARG_CONST_STACK_SIZE, | ||
79 | .arg3_type = ARG_ANYTHING, | ||
80 | }; | ||
81 | |||
82 | static u64 bpf_ktime_get_ns(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | ||
83 | { | ||
84 | /* NMI safe access to clock monotonic */ | ||
85 | return ktime_get_mono_fast_ns(); | ||
86 | } | ||
87 | |||
88 | static const struct bpf_func_proto bpf_ktime_get_ns_proto = { | ||
89 | .func = bpf_ktime_get_ns, | ||
90 | .gpl_only = true, | ||
91 | .ret_type = RET_INTEGER, | ||
92 | }; | ||
93 | |||
94 | /* | ||
95 | * limited trace_printk() | ||
96 | * only %d %u %x %ld %lu %lx %lld %llu %llx %p conversion specifiers allowed | ||
97 | */ | ||
98 | static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5) | ||
99 | { | ||
100 | char *fmt = (char *) (long) r1; | ||
101 | int mod[3] = {}; | ||
102 | int fmt_cnt = 0; | ||
103 | int i; | ||
104 | |||
105 | /* | ||
106 | * bpf_check()->check_func_arg()->check_stack_boundary() | ||
107 | * guarantees that fmt points to bpf program stack, | ||
108 | * fmt_size bytes of it were initialized and fmt_size > 0 | ||
109 | */ | ||
110 | if (fmt[--fmt_size] != 0) | ||
111 | return -EINVAL; | ||
112 | |||
113 | /* check format string for allowed specifiers */ | ||
114 | for (i = 0; i < fmt_size; i++) { | ||
115 | if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) | ||
116 | return -EINVAL; | ||
117 | |||
118 | if (fmt[i] != '%') | ||
119 | continue; | ||
120 | |||
121 | if (fmt_cnt >= 3) | ||
122 | return -EINVAL; | ||
123 | |||
124 | /* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */ | ||
125 | i++; | ||
126 | if (fmt[i] == 'l') { | ||
127 | mod[fmt_cnt]++; | ||
128 | i++; | ||
129 | } else if (fmt[i] == 'p') { | ||
130 | mod[fmt_cnt]++; | ||
131 | i++; | ||
132 | if (!isspace(fmt[i]) && !ispunct(fmt[i]) && fmt[i] != 0) | ||
133 | return -EINVAL; | ||
134 | fmt_cnt++; | ||
135 | continue; | ||
136 | } | ||
137 | |||
138 | if (fmt[i] == 'l') { | ||
139 | mod[fmt_cnt]++; | ||
140 | i++; | ||
141 | } | ||
142 | |||
143 | if (fmt[i] != 'd' && fmt[i] != 'u' && fmt[i] != 'x') | ||
144 | return -EINVAL; | ||
145 | fmt_cnt++; | ||
146 | } | ||
147 | |||
148 | return __trace_printk(1/* fake ip will not be printed */, fmt, | ||
149 | mod[0] == 2 ? r3 : mod[0] == 1 ? (long) r3 : (u32) r3, | ||
150 | mod[1] == 2 ? r4 : mod[1] == 1 ? (long) r4 : (u32) r4, | ||
151 | mod[2] == 2 ? r5 : mod[2] == 1 ? (long) r5 : (u32) r5); | ||
152 | } | ||
153 | |||
154 | static const struct bpf_func_proto bpf_trace_printk_proto = { | ||
155 | .func = bpf_trace_printk, | ||
156 | .gpl_only = true, | ||
157 | .ret_type = RET_INTEGER, | ||
158 | .arg1_type = ARG_PTR_TO_STACK, | ||
159 | .arg2_type = ARG_CONST_STACK_SIZE, | ||
160 | }; | ||
161 | |||
162 | static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id) | ||
163 | { | ||
164 | switch (func_id) { | ||
165 | case BPF_FUNC_map_lookup_elem: | ||
166 | return &bpf_map_lookup_elem_proto; | ||
167 | case BPF_FUNC_map_update_elem: | ||
168 | return &bpf_map_update_elem_proto; | ||
169 | case BPF_FUNC_map_delete_elem: | ||
170 | return &bpf_map_delete_elem_proto; | ||
171 | case BPF_FUNC_probe_read: | ||
172 | return &bpf_probe_read_proto; | ||
173 | case BPF_FUNC_ktime_get_ns: | ||
174 | return &bpf_ktime_get_ns_proto; | ||
175 | |||
176 | case BPF_FUNC_trace_printk: | ||
177 | /* | ||
178 | * this program might be calling bpf_trace_printk, | ||
179 | * so allocate per-cpu printk buffers | ||
180 | */ | ||
181 | trace_printk_init_buffers(); | ||
182 | |||
183 | return &bpf_trace_printk_proto; | ||
184 | default: | ||
185 | return NULL; | ||
186 | } | ||
187 | } | ||
188 | |||
189 | /* bpf+kprobe programs can access fields of 'struct pt_regs' */ | ||
190 | static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type) | ||
191 | { | ||
192 | /* check bounds */ | ||
193 | if (off < 0 || off >= sizeof(struct pt_regs)) | ||
194 | return false; | ||
195 | |||
196 | /* only read is allowed */ | ||
197 | if (type != BPF_READ) | ||
198 | return false; | ||
199 | |||
200 | /* disallow misaligned access */ | ||
201 | if (off % size != 0) | ||
202 | return false; | ||
203 | |||
204 | return true; | ||
205 | } | ||
206 | |||
207 | static struct bpf_verifier_ops kprobe_prog_ops = { | ||
208 | .get_func_proto = kprobe_prog_func_proto, | ||
209 | .is_valid_access = kprobe_prog_is_valid_access, | ||
210 | }; | ||
211 | |||
212 | static struct bpf_prog_type_list kprobe_tl = { | ||
213 | .ops = &kprobe_prog_ops, | ||
214 | .type = BPF_PROG_TYPE_KPROBE, | ||
215 | }; | ||
216 | |||
217 | static int __init register_kprobe_prog_ops(void) | ||
218 | { | ||
219 | bpf_register_prog_type(&kprobe_tl); | ||
220 | return 0; | ||
221 | } | ||
222 | late_initcall(register_kprobe_prog_ops); | ||
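The comment on trace_call_bpf() defines the contract between a kprobe handler and an attached program: a return of 0 filters the event out, anything else stores it. The following userspace model shows that convention with an ordinary C callback standing in for a loaded BPF program; none of this is real BPF or kprobe code, and struct fake_regs is an invented placeholder for pt_regs.

#include <stdio.h>

struct fake_regs { unsigned long ip; unsigned long arg0; };

/* stand-in for a BPF filter: keep only events whose first arg is even */
static unsigned int demo_prog(const struct fake_regs *ctx)
{
    return (ctx->arg0 % 2 == 0) ? 1 : 0;
}

static int events_recorded;

static void kprobe_handler_model(const struct fake_regs *ctx,
                                 unsigned int (*prog)(const struct fake_regs *))
{
    if (!prog(ctx))
        return;                 /* 0: event filtered out */
    events_recorded++;          /* non-zero: store into the ring buffer */
}

int main(void)
{
    struct fake_regs r1 = { .ip = 0x1000, .arg0 = 2 };
    struct fake_regs r2 = { .ip = 0x1004, .arg0 = 3 };

    kprobe_handler_model(&r1, demo_prog);
    kprobe_handler_model(&r2, demo_prog);
    printf("recorded %d of 2 events\n", events_recorded);
    return 0;
}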
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4f228024055b..02bece4a99ea 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
@@ -18,7 +18,7 @@ | |||
18 | #include <linux/kallsyms.h> | 18 | #include <linux/kallsyms.h> |
19 | #include <linux/seq_file.h> | 19 | #include <linux/seq_file.h> |
20 | #include <linux/suspend.h> | 20 | #include <linux/suspend.h> |
21 | #include <linux/debugfs.h> | 21 | #include <linux/tracefs.h> |
22 | #include <linux/hardirq.h> | 22 | #include <linux/hardirq.h> |
23 | #include <linux/kthread.h> | 23 | #include <linux/kthread.h> |
24 | #include <linux/uaccess.h> | 24 | #include <linux/uaccess.h> |
@@ -249,6 +249,19 @@ static void update_function_graph_func(void); | |||
249 | static inline void update_function_graph_func(void) { } | 249 | static inline void update_function_graph_func(void) { } |
250 | #endif | 250 | #endif |
251 | 251 | ||
252 | |||
253 | static ftrace_func_t ftrace_ops_get_list_func(struct ftrace_ops *ops) | ||
254 | { | ||
255 | /* | ||
256 | * If this is a dynamic ops or we force list func, | ||
257 | * then it needs to call the list anyway. | ||
258 | */ | ||
259 | if (ops->flags & FTRACE_OPS_FL_DYNAMIC || FTRACE_FORCE_LIST_FUNC) | ||
260 | return ftrace_ops_list_func; | ||
261 | |||
262 | return ftrace_ops_get_func(ops); | ||
263 | } | ||
264 | |||
252 | static void update_ftrace_function(void) | 265 | static void update_ftrace_function(void) |
253 | { | 266 | { |
254 | ftrace_func_t func; | 267 | ftrace_func_t func; |
@@ -270,7 +283,7 @@ static void update_ftrace_function(void) | |||
270 | * then have the mcount trampoline call the function directly. | 283 | * then have the mcount trampoline call the function directly. |
271 | */ | 284 | */ |
272 | } else if (ftrace_ops_list->next == &ftrace_list_end) { | 285 | } else if (ftrace_ops_list->next == &ftrace_list_end) { |
273 | func = ftrace_ops_get_func(ftrace_ops_list); | 286 | func = ftrace_ops_get_list_func(ftrace_ops_list); |
274 | 287 | ||
275 | } else { | 288 | } else { |
276 | /* Just use the default ftrace_ops */ | 289 | /* Just use the default ftrace_ops */ |
@@ -1008,7 +1021,7 @@ static struct tracer_stat function_stats __initdata = { | |||
1008 | .stat_show = function_stat_show | 1021 | .stat_show = function_stat_show |
1009 | }; | 1022 | }; |
1010 | 1023 | ||
1011 | static __init void ftrace_profile_debugfs(struct dentry *d_tracer) | 1024 | static __init void ftrace_profile_tracefs(struct dentry *d_tracer) |
1012 | { | 1025 | { |
1013 | struct ftrace_profile_stat *stat; | 1026 | struct ftrace_profile_stat *stat; |
1014 | struct dentry *entry; | 1027 | struct dentry *entry; |
@@ -1044,15 +1057,15 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer) | |||
1044 | } | 1057 | } |
1045 | } | 1058 | } |
1046 | 1059 | ||
1047 | entry = debugfs_create_file("function_profile_enabled", 0644, | 1060 | entry = tracefs_create_file("function_profile_enabled", 0644, |
1048 | d_tracer, NULL, &ftrace_profile_fops); | 1061 | d_tracer, NULL, &ftrace_profile_fops); |
1049 | if (!entry) | 1062 | if (!entry) |
1050 | pr_warning("Could not create debugfs " | 1063 | pr_warning("Could not create tracefs " |
1051 | "'function_profile_enabled' entry\n"); | 1064 | "'function_profile_enabled' entry\n"); |
1052 | } | 1065 | } |
1053 | 1066 | ||
1054 | #else /* CONFIG_FUNCTION_PROFILER */ | 1067 | #else /* CONFIG_FUNCTION_PROFILER */ |
1055 | static __init void ftrace_profile_debugfs(struct dentry *d_tracer) | 1068 | static __init void ftrace_profile_tracefs(struct dentry *d_tracer) |
1056 | { | 1069 | { |
1057 | } | 1070 | } |
1058 | #endif /* CONFIG_FUNCTION_PROFILER */ | 1071 | #endif /* CONFIG_FUNCTION_PROFILER */ |
@@ -4712,7 +4725,7 @@ void ftrace_destroy_filter_files(struct ftrace_ops *ops) | |||
4712 | mutex_unlock(&ftrace_lock); | 4725 | mutex_unlock(&ftrace_lock); |
4713 | } | 4726 | } |
4714 | 4727 | ||
4715 | static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) | 4728 | static __init int ftrace_init_dyn_tracefs(struct dentry *d_tracer) |
4716 | { | 4729 | { |
4717 | 4730 | ||
4718 | trace_create_file("available_filter_functions", 0444, | 4731 | trace_create_file("available_filter_functions", 0444, |
@@ -5020,7 +5033,7 @@ static int __init ftrace_nodyn_init(void) | |||
5020 | } | 5033 | } |
5021 | core_initcall(ftrace_nodyn_init); | 5034 | core_initcall(ftrace_nodyn_init); |
5022 | 5035 | ||
5023 | static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } | 5036 | static inline int ftrace_init_dyn_tracefs(struct dentry *d_tracer) { return 0; } |
5024 | static inline void ftrace_startup_enable(int command) { } | 5037 | static inline void ftrace_startup_enable(int command) { } |
5025 | static inline void ftrace_startup_all(int command) { } | 5038 | static inline void ftrace_startup_all(int command) { } |
5026 | /* Keep as macros so we do not need to define the commands */ | 5039 | /* Keep as macros so we do not need to define the commands */ |
@@ -5209,13 +5222,6 @@ static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip, | |||
5209 | ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops) | 5222 | ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops) |
5210 | { | 5223 | { |
5211 | /* | 5224 | /* |
5212 | * If this is a dynamic ops or we force list func, | ||
5213 | * then it needs to call the list anyway. | ||
5214 | */ | ||
5215 | if (ops->flags & FTRACE_OPS_FL_DYNAMIC || FTRACE_FORCE_LIST_FUNC) | ||
5216 | return ftrace_ops_list_func; | ||
5217 | |||
5218 | /* | ||
5219 | * If the func handles its own recursion, call it directly. | 5225 | * If the func handles its own recursion, call it directly. |
5220 | * Otherwise call the recursion protected function that | 5226 | * Otherwise call the recursion protected function that |
5221 | * will call the ftrace ops function. | 5227 | * will call the ftrace ops function. |
@@ -5473,7 +5479,7 @@ static const struct file_operations ftrace_pid_fops = { | |||
5473 | .release = ftrace_pid_release, | 5479 | .release = ftrace_pid_release, |
5474 | }; | 5480 | }; |
5475 | 5481 | ||
5476 | static __init int ftrace_init_debugfs(void) | 5482 | static __init int ftrace_init_tracefs(void) |
5477 | { | 5483 | { |
5478 | struct dentry *d_tracer; | 5484 | struct dentry *d_tracer; |
5479 | 5485 | ||
@@ -5481,16 +5487,16 @@ static __init int ftrace_init_debugfs(void) | |||
5481 | if (IS_ERR(d_tracer)) | 5487 | if (IS_ERR(d_tracer)) |
5482 | return 0; | 5488 | return 0; |
5483 | 5489 | ||
5484 | ftrace_init_dyn_debugfs(d_tracer); | 5490 | ftrace_init_dyn_tracefs(d_tracer); |
5485 | 5491 | ||
5486 | trace_create_file("set_ftrace_pid", 0644, d_tracer, | 5492 | trace_create_file("set_ftrace_pid", 0644, d_tracer, |
5487 | NULL, &ftrace_pid_fops); | 5493 | NULL, &ftrace_pid_fops); |
5488 | 5494 | ||
5489 | ftrace_profile_debugfs(d_tracer); | 5495 | ftrace_profile_tracefs(d_tracer); |
5490 | 5496 | ||
5491 | return 0; | 5497 | return 0; |
5492 | } | 5498 | } |
5493 | fs_initcall(ftrace_init_debugfs); | 5499 | fs_initcall(ftrace_init_tracefs); |
5494 | 5500 | ||
5495 | /** | 5501 | /** |
5496 | * ftrace_kill - kill ftrace | 5502 | * ftrace_kill - kill ftrace |
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 5040d44fe5a3..0315d43176d8 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
@@ -2679,7 +2679,7 @@ static DEFINE_PER_CPU(unsigned int, current_context); | |||
2679 | 2679 | ||
2680 | static __always_inline int trace_recursive_lock(void) | 2680 | static __always_inline int trace_recursive_lock(void) |
2681 | { | 2681 | { |
2682 | unsigned int val = this_cpu_read(current_context); | 2682 | unsigned int val = __this_cpu_read(current_context); |
2683 | int bit; | 2683 | int bit; |
2684 | 2684 | ||
2685 | if (in_interrupt()) { | 2685 | if (in_interrupt()) { |
@@ -2696,18 +2696,14 @@ static __always_inline int trace_recursive_lock(void) | |||
2696 | return 1; | 2696 | return 1; |
2697 | 2697 | ||
2698 | val |= (1 << bit); | 2698 | val |= (1 << bit); |
2699 | this_cpu_write(current_context, val); | 2699 | __this_cpu_write(current_context, val); |
2700 | 2700 | ||
2701 | return 0; | 2701 | return 0; |
2702 | } | 2702 | } |
2703 | 2703 | ||
2704 | static __always_inline void trace_recursive_unlock(void) | 2704 | static __always_inline void trace_recursive_unlock(void) |
2705 | { | 2705 | { |
2706 | unsigned int val = this_cpu_read(current_context); | 2706 | __this_cpu_and(current_context, __this_cpu_read(current_context) - 1); |
2707 | |||
2708 | val--; | ||
2709 | val &= this_cpu_read(current_context); | ||
2710 | this_cpu_write(current_context, val); | ||
2711 | } | 2707 | } |
2712 | 2708 | ||
2713 | #else | 2709 | #else |
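The new one-line trace_recursive_unlock() works because val & (val - 1) clears the lowest set bit, and with the bit assignment used by trace_recursive_lock() (the contexts that can nest innermost take the lower bit numbers) that is always the bit set by the matching lock. A standalone demonstration of the bit trick, with bit numbers chosen for illustration:

#include <stdio.h>

static unsigned int clear_lowest_bit(unsigned int val)
{
    return val & (val - 1);     /* drops exactly one bit: the lowest set one */
}

int main(void)
{
    unsigned int ctx = 0;

    ctx |= 1u << 3;             /* outer (normal) context takes a high bit   */
    ctx |= 1u << 1;             /* nested interrupt context takes a lower one */
    printf("locked:   0x%x\n", ctx);                          /* 0xa */
    printf("unlock 1: 0x%x\n", ctx = clear_lowest_bit(ctx));  /* 0x8: inner released */
    printf("unlock 2: 0x%x\n", ctx = clear_lowest_bit(ctx));  /* 0x0: outer released */
    return 0;
}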
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 62c6506d663f..91eecaaa43e0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
@@ -20,6 +20,7 @@ | |||
20 | #include <linux/notifier.h> | 20 | #include <linux/notifier.h> |
21 | #include <linux/irqflags.h> | 21 | #include <linux/irqflags.h> |
22 | #include <linux/debugfs.h> | 22 | #include <linux/debugfs.h> |
23 | #include <linux/tracefs.h> | ||
23 | #include <linux/pagemap.h> | 24 | #include <linux/pagemap.h> |
24 | #include <linux/hardirq.h> | 25 | #include <linux/hardirq.h> |
25 | #include <linux/linkage.h> | 26 | #include <linux/linkage.h> |
@@ -31,6 +32,7 @@ | |||
31 | #include <linux/splice.h> | 32 | #include <linux/splice.h> |
32 | #include <linux/kdebug.h> | 33 | #include <linux/kdebug.h> |
33 | #include <linux/string.h> | 34 | #include <linux/string.h> |
35 | #include <linux/mount.h> | ||
34 | #include <linux/rwsem.h> | 36 | #include <linux/rwsem.h> |
35 | #include <linux/slab.h> | 37 | #include <linux/slab.h> |
36 | #include <linux/ctype.h> | 38 | #include <linux/ctype.h> |
@@ -123,6 +125,42 @@ enum ftrace_dump_mode ftrace_dump_on_oops; | |||
123 | /* When set, tracing will stop when a WARN*() is hit */ | 125 | /* When set, tracing will stop when a WARN*() is hit */ |
124 | int __disable_trace_on_warning; | 126 | int __disable_trace_on_warning; |
125 | 127 | ||
128 | #ifdef CONFIG_TRACE_ENUM_MAP_FILE | ||
129 | /* Map of enums to their values, for "enum_map" file */ | ||
130 | struct trace_enum_map_head { | ||
131 | struct module *mod; | ||
132 | unsigned long length; | ||
133 | }; | ||
134 | |||
135 | union trace_enum_map_item; | ||
136 | |||
137 | struct trace_enum_map_tail { | ||
138 | /* | ||
139 | * "end" is first and points to NULL as it must be different | ||
140 | * than "mod" or "enum_string" | ||
141 | */ | ||
142 | union trace_enum_map_item *next; | ||
143 | const char *end; /* points to NULL */ | ||
144 | }; | ||
145 | |||
146 | static DEFINE_MUTEX(trace_enum_mutex); | ||
147 | |||
148 | /* | ||
149 | * The trace_enum_maps are saved in an array with two extra elements, | ||
150 | * one at the beginning, and one at the end. The beginning item contains | ||
151 | * the count of the saved maps (head.length), and the module they | ||
152 | * belong to if not built in (head.mod). The ending item contains a | ||
153 | * pointer to the next array of saved enum_map items. | ||
154 | */ | ||
155 | union trace_enum_map_item { | ||
156 | struct trace_enum_map map; | ||
157 | struct trace_enum_map_head head; | ||
158 | struct trace_enum_map_tail tail; | ||
159 | }; | ||
160 | |||
161 | static union trace_enum_map_item *trace_enum_maps; | ||
162 | #endif /* CONFIG_TRACE_ENUM_MAP_FILE */ | ||
163 | |||
126 | static int tracing_set_tracer(struct trace_array *tr, const char *buf); | 164 | static int tracing_set_tracer(struct trace_array *tr, const char *buf); |
127 | 165 | ||
128 | #define MAX_TRACER_SIZE 100 | 166 | #define MAX_TRACER_SIZE 100 |
@@ -3908,6 +3946,182 @@ static const struct file_operations tracing_saved_cmdlines_size_fops = { | |||
3908 | .write = tracing_saved_cmdlines_size_write, | 3946 | .write = tracing_saved_cmdlines_size_write, |
3909 | }; | 3947 | }; |
3910 | 3948 | ||
3949 | #ifdef CONFIG_TRACE_ENUM_MAP_FILE | ||
3950 | static union trace_enum_map_item * | ||
3951 | update_enum_map(union trace_enum_map_item *ptr) | ||
3952 | { | ||
3953 | if (!ptr->map.enum_string) { | ||
3954 | if (ptr->tail.next) { | ||
3955 | ptr = ptr->tail.next; | ||
3956 | /* Set ptr to the next real item (skip head) */ | ||
3957 | ptr++; | ||
3958 | } else | ||
3959 | return NULL; | ||
3960 | } | ||
3961 | return ptr; | ||
3962 | } | ||
3963 | |||
3964 | static void *enum_map_next(struct seq_file *m, void *v, loff_t *pos) | ||
3965 | { | ||
3966 | union trace_enum_map_item *ptr = v; | ||
3967 | |||
3968 | /* | ||
3969 | * Paranoid! If ptr points to end, we don't want to increment past it. | ||
3970 | * This really should never happen. | ||
3971 | */ | ||
3972 | ptr = update_enum_map(ptr); | ||
3973 | if (WARN_ON_ONCE(!ptr)) | ||
3974 | return NULL; | ||
3975 | |||
3976 | ptr++; | ||
3977 | |||
3978 | (*pos)++; | ||
3979 | |||
3980 | ptr = update_enum_map(ptr); | ||
3981 | |||
3982 | return ptr; | ||
3983 | } | ||
3984 | |||
3985 | static void *enum_map_start(struct seq_file *m, loff_t *pos) | ||
3986 | { | ||
3987 | union trace_enum_map_item *v; | ||
3988 | loff_t l = 0; | ||
3989 | |||
3990 | mutex_lock(&trace_enum_mutex); | ||
3991 | |||
3992 | v = trace_enum_maps; | ||
3993 | if (v) | ||
3994 | v++; | ||
3995 | |||
3996 | while (v && l < *pos) { | ||
3997 | v = enum_map_next(m, v, &l); | ||
3998 | } | ||
3999 | |||
4000 | return v; | ||
4001 | } | ||
4002 | |||
4003 | static void enum_map_stop(struct seq_file *m, void *v) | ||
4004 | { | ||
4005 | mutex_unlock(&trace_enum_mutex); | ||
4006 | } | ||
4007 | |||
4008 | static int enum_map_show(struct seq_file *m, void *v) | ||
4009 | { | ||
4010 | union trace_enum_map_item *ptr = v; | ||
4011 | |||
4012 | seq_printf(m, "%s %ld (%s)\n", | ||
4013 | ptr->map.enum_string, ptr->map.enum_value, | ||
4014 | ptr->map.system); | ||
4015 | |||
4016 | return 0; | ||
4017 | } | ||
4018 | |||
4019 | static const struct seq_operations tracing_enum_map_seq_ops = { | ||
4020 | .start = enum_map_start, | ||
4021 | .next = enum_map_next, | ||
4022 | .stop = enum_map_stop, | ||
4023 | .show = enum_map_show, | ||
4024 | }; | ||
4025 | |||
4026 | static int tracing_enum_map_open(struct inode *inode, struct file *filp) | ||
4027 | { | ||
4028 | if (tracing_disabled) | ||
4029 | return -ENODEV; | ||
4030 | |||
4031 | return seq_open(filp, &tracing_enum_map_seq_ops); | ||
4032 | } | ||
4033 | |||
4034 | static const struct file_operations tracing_enum_map_fops = { | ||
4035 | .open = tracing_enum_map_open, | ||
4036 | .read = seq_read, | ||
4037 | .llseek = seq_lseek, | ||
4038 | .release = seq_release, | ||
4039 | }; | ||
4040 | |||
4041 | static inline union trace_enum_map_item * | ||
4042 | trace_enum_jmp_to_tail(union trace_enum_map_item *ptr) | ||
4043 | { | ||
4044 | /* Return tail of array given the head */ | ||
4045 | return ptr + ptr->head.length + 1; | ||
4046 | } | ||
4047 | |||
4048 | static void | ||
4049 | trace_insert_enum_map_file(struct module *mod, struct trace_enum_map **start, | ||
4050 | int len) | ||
4051 | { | ||
4052 | struct trace_enum_map **stop; | ||
4053 | struct trace_enum_map **map; | ||
4054 | union trace_enum_map_item *map_array; | ||
4055 | union trace_enum_map_item *ptr; | ||
4056 | |||
4057 | stop = start + len; | ||
4058 | |||
4059 | /* | ||
4060 | * The trace_enum_maps contains the map plus a head and tail item, | ||
4061 | * where the head holds the module and length of array, and the | ||
4062 | * tail holds a pointer to the next list. | ||
4063 | */ | ||
4064 | map_array = kmalloc(sizeof(*map_array) * (len + 2), GFP_KERNEL); | ||
4065 | if (!map_array) { | ||
4066 | pr_warning("Unable to allocate trace enum mapping\n"); | ||
4067 | return; | ||
4068 | } | ||
4069 | |||
4070 | mutex_lock(&trace_enum_mutex); | ||
4071 | |||
4072 | if (!trace_enum_maps) | ||
4073 | trace_enum_maps = map_array; | ||
4074 | else { | ||
4075 | ptr = trace_enum_maps; | ||
4076 | for (;;) { | ||
4077 | ptr = trace_enum_jmp_to_tail(ptr); | ||
4078 | if (!ptr->tail.next) | ||
4079 | break; | ||
4080 | ptr = ptr->tail.next; | ||
4081 | |||
4082 | } | ||
4083 | ptr->tail.next = map_array; | ||
4084 | } | ||
4085 | map_array->head.mod = mod; | ||
4086 | map_array->head.length = len; | ||
4087 | map_array++; | ||
4088 | |||
4089 | for (map = start; (unsigned long)map < (unsigned long)stop; map++) { | ||
4090 | map_array->map = **map; | ||
4091 | map_array++; | ||
4092 | } | ||
4093 | memset(map_array, 0, sizeof(*map_array)); | ||
4094 | |||
4095 | mutex_unlock(&trace_enum_mutex); | ||
4096 | } | ||
4097 | |||
4098 | static void trace_create_enum_file(struct dentry *d_tracer) | ||
4099 | { | ||
4100 | trace_create_file("enum_map", 0444, d_tracer, | ||
4101 | NULL, &tracing_enum_map_fops); | ||
4102 | } | ||
4103 | |||
4104 | #else /* CONFIG_TRACE_ENUM_MAP_FILE */ | ||
4105 | static inline void trace_create_enum_file(struct dentry *d_tracer) { } | ||
4106 | static inline void trace_insert_enum_map_file(struct module *mod, | ||
4107 | struct trace_enum_map **start, int len) { } | ||
4108 | #endif /* !CONFIG_TRACE_ENUM_MAP_FILE */ | ||
4109 | |||
4110 | static void trace_insert_enum_map(struct module *mod, | ||
4111 | struct trace_enum_map **start, int len) | ||
4112 | { | ||
4113 | struct trace_enum_map **map; | ||
4114 | |||
4115 | if (len <= 0) | ||
4116 | return; | ||
4117 | |||
4118 | map = start; | ||
4119 | |||
4120 | trace_event_enum_update(map, len); | ||
4121 | |||
4122 | trace_insert_enum_map_file(mod, start, len); | ||
4123 | } | ||
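For orientation: each block stored above is laid out as [head][map 0 .. length-1][tail], and blocks are chained through tail.next. Below is a minimal userspace sketch of that layout and of the traversal done by trace_enum_jmp_to_tail() and the enum_map iterator; it is illustrative only, not part of the patch, and all names in it are made up.

    #include <stdio.h>

    struct enum_map { const char *system, *name; long value; };

    union map_item {
        struct enum_map map;
        struct { void *mod; unsigned long length; } head;
        struct { union map_item *next; } tail;
    };

    /* Layout per block: [head][map 0 .. length-1][tail] */
    static union map_item block_b[] = {
        { .head = { .length = 2 } },
        { .map  = { "timer", "HRTIMER_MODE_ABS", 0 } },
        { .map  = { "timer", "HRTIMER_MODE_REL", 1 } },
        { .tail = { NULL } },              /* end of the chain */
    };

    static union map_item block_a[] = {
        { .head = { .length = 1 } },
        { .map  = { "sched", "TASK_RUNNING", 0 } },
        { .tail = { block_b } },           /* next block in the chain */
    };

    /* Same idea as trace_enum_jmp_to_tail(): tail sits after head + length maps */
    static union map_item *jmp_to_tail(union map_item *head)
    {
        return head + head->head.length + 1;
    }

    int main(void)
    {
        union map_item *blk, *p;

        /* walk every block, printing entries the way enum_map_show() does */
        for (blk = block_a; blk; blk = jmp_to_tail(blk)->tail.next)
            for (p = blk + 1; p != jmp_to_tail(blk); p++)
                printf("%s %ld (%s)\n", p->map.name, p->map.value, p->map.system);
        return 0;
    }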
4124 | |||
3911 | static ssize_t | 4125 | static ssize_t |
3912 | tracing_set_trace_read(struct file *filp, char __user *ubuf, | 4126 | tracing_set_trace_read(struct file *filp, char __user *ubuf, |
3913 | size_t cnt, loff_t *ppos) | 4127 | size_t cnt, loff_t *ppos) |
@@ -4105,9 +4319,24 @@ static void tracing_set_nop(struct trace_array *tr) | |||
4105 | tr->current_trace = &nop_trace; | 4319 | tr->current_trace = &nop_trace; |
4106 | } | 4320 | } |
4107 | 4321 | ||
4108 | static int tracing_set_tracer(struct trace_array *tr, const char *buf) | 4322 | static void update_tracer_options(struct trace_array *tr, struct tracer *t) |
4109 | { | 4323 | { |
4110 | static struct trace_option_dentry *topts; | 4324 | static struct trace_option_dentry *topts; |
4325 | |||
4326 | /* Only enable if the directory has been created already. */ | ||
4327 | if (!tr->dir) | ||
4328 | return; | ||
4329 | |||
4330 | /* Currently, only the top instance has options */ | ||
4331 | if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL)) | ||
4332 | return; | ||
4333 | |||
4334 | destroy_trace_option_files(topts); | ||
4335 | topts = create_trace_option_files(tr, t); | ||
4336 | } | ||
4337 | |||
4338 | static int tracing_set_tracer(struct trace_array *tr, const char *buf) | ||
4339 | { | ||
4111 | struct tracer *t; | 4340 | struct tracer *t; |
4112 | #ifdef CONFIG_TRACER_MAX_TRACE | 4341 | #ifdef CONFIG_TRACER_MAX_TRACE |
4113 | bool had_max_tr; | 4342 | bool had_max_tr; |
@@ -4172,11 +4401,7 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf) | |||
4172 | free_snapshot(tr); | 4401 | free_snapshot(tr); |
4173 | } | 4402 | } |
4174 | #endif | 4403 | #endif |
4175 | /* Currently, only the top instance has options */ | 4404 | update_tracer_options(tr, t); |
4176 | if (tr->flags & TRACE_ARRAY_FL_GLOBAL) { | ||
4177 | destroy_trace_option_files(topts); | ||
4178 | topts = create_trace_option_files(tr, t); | ||
4179 | } | ||
4180 | 4405 | ||
4181 | #ifdef CONFIG_TRACER_MAX_TRACE | 4406 | #ifdef CONFIG_TRACER_MAX_TRACE |
4182 | if (t->use_max_tr && !had_max_tr) { | 4407 | if (t->use_max_tr && !had_max_tr) { |
@@ -5817,6 +6042,14 @@ static inline __init int register_snapshot_cmd(void) { return 0; } | |||
5817 | 6042 | ||
5818 | static struct dentry *tracing_get_dentry(struct trace_array *tr) | 6043 | static struct dentry *tracing_get_dentry(struct trace_array *tr) |
5819 | { | 6044 | { |
6045 | if (WARN_ON(!tr->dir)) | ||
6046 | return ERR_PTR(-ENODEV); | ||
6047 | |||
6048 | /* Top directory uses NULL as the parent */ | ||
6049 | if (tr->flags & TRACE_ARRAY_FL_GLOBAL) | ||
6050 | return NULL; | ||
6051 | |||
6052 | /* All sub buffers have a descriptor */ | ||
5820 | return tr->dir; | 6053 | return tr->dir; |
5821 | } | 6054 | } |
5822 | 6055 | ||
@@ -5831,10 +6064,10 @@ static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu) | |||
5831 | if (IS_ERR(d_tracer)) | 6064 | if (IS_ERR(d_tracer)) |
5832 | return NULL; | 6065 | return NULL; |
5833 | 6066 | ||
5834 | tr->percpu_dir = debugfs_create_dir("per_cpu", d_tracer); | 6067 | tr->percpu_dir = tracefs_create_dir("per_cpu", d_tracer); |
5835 | 6068 | ||
5836 | WARN_ONCE(!tr->percpu_dir, | 6069 | WARN_ONCE(!tr->percpu_dir, |
5837 | "Could not create debugfs directory 'per_cpu/%d'\n", cpu); | 6070 | "Could not create tracefs directory 'per_cpu/%d'\n", cpu); |
5838 | 6071 | ||
5839 | return tr->percpu_dir; | 6072 | return tr->percpu_dir; |
5840 | } | 6073 | } |
@@ -5851,7 +6084,7 @@ trace_create_cpu_file(const char *name, umode_t mode, struct dentry *parent, | |||
5851 | } | 6084 | } |
5852 | 6085 | ||
5853 | static void | 6086 | static void |
5854 | tracing_init_debugfs_percpu(struct trace_array *tr, long cpu) | 6087 | tracing_init_tracefs_percpu(struct trace_array *tr, long cpu) |
5855 | { | 6088 | { |
5856 | struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu); | 6089 | struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu); |
5857 | struct dentry *d_cpu; | 6090 | struct dentry *d_cpu; |
@@ -5861,9 +6094,9 @@ tracing_init_debugfs_percpu(struct trace_array *tr, long cpu) | |||
5861 | return; | 6094 | return; |
5862 | 6095 | ||
5863 | snprintf(cpu_dir, 30, "cpu%ld", cpu); | 6096 | snprintf(cpu_dir, 30, "cpu%ld", cpu); |
5864 | d_cpu = debugfs_create_dir(cpu_dir, d_percpu); | 6097 | d_cpu = tracefs_create_dir(cpu_dir, d_percpu); |
5865 | if (!d_cpu) { | 6098 | if (!d_cpu) { |
5866 | pr_warning("Could not create debugfs '%s' entry\n", cpu_dir); | 6099 | pr_warning("Could not create tracefs '%s' entry\n", cpu_dir); |
5867 | return; | 6100 | return; |
5868 | } | 6101 | } |
5869 | 6102 | ||
@@ -6015,9 +6248,9 @@ struct dentry *trace_create_file(const char *name, | |||
6015 | { | 6248 | { |
6016 | struct dentry *ret; | 6249 | struct dentry *ret; |
6017 | 6250 | ||
6018 | ret = debugfs_create_file(name, mode, parent, data, fops); | 6251 | ret = tracefs_create_file(name, mode, parent, data, fops); |
6019 | if (!ret) | 6252 | if (!ret) |
6020 | pr_warning("Could not create debugfs '%s' entry\n", name); | 6253 | pr_warning("Could not create tracefs '%s' entry\n", name); |
6021 | 6254 | ||
6022 | return ret; | 6255 | return ret; |
6023 | } | 6256 | } |
@@ -6034,9 +6267,9 @@ static struct dentry *trace_options_init_dentry(struct trace_array *tr) | |||
6034 | if (IS_ERR(d_tracer)) | 6267 | if (IS_ERR(d_tracer)) |
6035 | return NULL; | 6268 | return NULL; |
6036 | 6269 | ||
6037 | tr->options = debugfs_create_dir("options", d_tracer); | 6270 | tr->options = tracefs_create_dir("options", d_tracer); |
6038 | if (!tr->options) { | 6271 | if (!tr->options) { |
6039 | pr_warning("Could not create debugfs directory 'options'\n"); | 6272 | pr_warning("Could not create tracefs directory 'options'\n"); |
6040 | return NULL; | 6273 | return NULL; |
6041 | } | 6274 | } |
6042 | 6275 | ||
@@ -6105,7 +6338,7 @@ destroy_trace_option_files(struct trace_option_dentry *topts) | |||
6105 | return; | 6338 | return; |
6106 | 6339 | ||
6107 | for (cnt = 0; topts[cnt].opt; cnt++) | 6340 | for (cnt = 0; topts[cnt].opt; cnt++) |
6108 | debugfs_remove(topts[cnt].entry); | 6341 | tracefs_remove(topts[cnt].entry); |
6109 | 6342 | ||
6110 | kfree(topts); | 6343 | kfree(topts); |
6111 | } | 6344 | } |
@@ -6194,7 +6427,7 @@ static const struct file_operations rb_simple_fops = { | |||
6194 | struct dentry *trace_instance_dir; | 6427 | struct dentry *trace_instance_dir; |
6195 | 6428 | ||
6196 | static void | 6429 | static void |
6197 | init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer); | 6430 | init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer); |
6198 | 6431 | ||
6199 | static int | 6432 | static int |
6200 | allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size) | 6433 | allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size) |
@@ -6271,7 +6504,7 @@ static void free_trace_buffers(struct trace_array *tr) | |||
6271 | #endif | 6504 | #endif |
6272 | } | 6505 | } |
6273 | 6506 | ||
6274 | static int new_instance_create(const char *name) | 6507 | static int instance_mkdir(const char *name) |
6275 | { | 6508 | { |
6276 | struct trace_array *tr; | 6509 | struct trace_array *tr; |
6277 | int ret; | 6510 | int ret; |
@@ -6310,17 +6543,17 @@ static int new_instance_create(const char *name) | |||
6310 | if (allocate_trace_buffers(tr, trace_buf_size) < 0) | 6543 | if (allocate_trace_buffers(tr, trace_buf_size) < 0) |
6311 | goto out_free_tr; | 6544 | goto out_free_tr; |
6312 | 6545 | ||
6313 | tr->dir = debugfs_create_dir(name, trace_instance_dir); | 6546 | tr->dir = tracefs_create_dir(name, trace_instance_dir); |
6314 | if (!tr->dir) | 6547 | if (!tr->dir) |
6315 | goto out_free_tr; | 6548 | goto out_free_tr; |
6316 | 6549 | ||
6317 | ret = event_trace_add_tracer(tr->dir, tr); | 6550 | ret = event_trace_add_tracer(tr->dir, tr); |
6318 | if (ret) { | 6551 | if (ret) { |
6319 | debugfs_remove_recursive(tr->dir); | 6552 | tracefs_remove_recursive(tr->dir); |
6320 | goto out_free_tr; | 6553 | goto out_free_tr; |
6321 | } | 6554 | } |
6322 | 6555 | ||
6323 | init_tracer_debugfs(tr, tr->dir); | 6556 | init_tracer_tracefs(tr, tr->dir); |
6324 | 6557 | ||
6325 | list_add(&tr->list, &ftrace_trace_arrays); | 6558 | list_add(&tr->list, &ftrace_trace_arrays); |
6326 | 6559 | ||
@@ -6341,7 +6574,7 @@ static int new_instance_create(const char *name) | |||
6341 | 6574 | ||
6342 | } | 6575 | } |
6343 | 6576 | ||
6344 | static int instance_delete(const char *name) | 6577 | static int instance_rmdir(const char *name) |
6345 | { | 6578 | { |
6346 | struct trace_array *tr; | 6579 | struct trace_array *tr; |
6347 | int found = 0; | 6580 | int found = 0; |
@@ -6382,82 +6615,17 @@ static int instance_delete(const char *name) | |||
6382 | return ret; | 6615 | return ret; |
6383 | } | 6616 | } |
6384 | 6617 | ||
6385 | static int instance_mkdir (struct inode *inode, struct dentry *dentry, umode_t mode) | ||
6386 | { | ||
6387 | struct dentry *parent; | ||
6388 | int ret; | ||
6389 | |||
6390 | /* Paranoid: Make sure the parent is the "instances" directory */ | ||
6391 | parent = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias); | ||
6392 | if (WARN_ON_ONCE(parent != trace_instance_dir)) | ||
6393 | return -ENOENT; | ||
6394 | |||
6395 | /* | ||
6396 | * The inode mutex is locked, but debugfs_create_dir() will also | ||
6397 | * take the mutex. As the instances directory can not be destroyed | ||
6398 | * or changed in any other way, it is safe to unlock it, and | ||
6399 | * let the dentry try. If two users try to make the same dir at | ||
6400 | * the same time, then the new_instance_create() will determine the | ||
6401 | * winner. | ||
6402 | */ | ||
6403 | mutex_unlock(&inode->i_mutex); | ||
6404 | |||
6405 | ret = new_instance_create(dentry->d_iname); | ||
6406 | |||
6407 | mutex_lock(&inode->i_mutex); | ||
6408 | |||
6409 | return ret; | ||
6410 | } | ||
6411 | |||
6412 | static int instance_rmdir(struct inode *inode, struct dentry *dentry) | ||
6413 | { | ||
6414 | struct dentry *parent; | ||
6415 | int ret; | ||
6416 | |||
6417 | /* Paranoid: Make sure the parent is the "instances" directory */ | ||
6418 | parent = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias); | ||
6419 | if (WARN_ON_ONCE(parent != trace_instance_dir)) | ||
6420 | return -ENOENT; | ||
6421 | |||
6422 | /* The caller did a dget() on dentry */ | ||
6423 | mutex_unlock(&dentry->d_inode->i_mutex); | ||
6424 | |||
6425 | /* | ||
6426 | * The inode mutex is locked, but debugfs_create_dir() will also | ||
6427 | * take the mutex. As the instances directory can not be destroyed | ||
6428 | * or changed in any other way, it is safe to unlock it, and | ||
6429 | * let the dentry try. If two users try to make the same dir at | ||
6430 | * the same time, then the instance_delete() will determine the | ||
6431 | * winner. | ||
6432 | */ | ||
6433 | mutex_unlock(&inode->i_mutex); | ||
6434 | |||
6435 | ret = instance_delete(dentry->d_iname); | ||
6436 | |||
6437 | mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); | ||
6438 | mutex_lock(&dentry->d_inode->i_mutex); | ||
6439 | |||
6440 | return ret; | ||
6441 | } | ||
6442 | |||
6443 | static const struct inode_operations instance_dir_inode_operations = { | ||
6444 | .lookup = simple_lookup, | ||
6445 | .mkdir = instance_mkdir, | ||
6446 | .rmdir = instance_rmdir, | ||
6447 | }; | ||
6448 | |||
6449 | static __init void create_trace_instances(struct dentry *d_tracer) | 6618 | static __init void create_trace_instances(struct dentry *d_tracer) |
6450 | { | 6619 | { |
6451 | trace_instance_dir = debugfs_create_dir("instances", d_tracer); | 6620 | trace_instance_dir = tracefs_create_instance_dir("instances", d_tracer, |
6621 | instance_mkdir, | ||
6622 | instance_rmdir); | ||
6452 | if (WARN_ON(!trace_instance_dir)) | 6623 | if (WARN_ON(!trace_instance_dir)) |
6453 | return; | 6624 | return; |
6454 | |||
6455 | /* Hijack the dir inode operations, to allow mkdir */ | ||
6456 | trace_instance_dir->d_inode->i_op = &instance_dir_inode_operations; | ||
6457 | } | 6625 | } |
6458 | 6626 | ||
6459 | static void | 6627 | static void |
6460 | init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) | 6628 | init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) |
6461 | { | 6629 | { |
6462 | int cpu; | 6630 | int cpu; |
6463 | 6631 | ||
@@ -6511,10 +6679,32 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) | |||
6511 | #endif | 6679 | #endif |
6512 | 6680 | ||
6513 | for_each_tracing_cpu(cpu) | 6681 | for_each_tracing_cpu(cpu) |
6514 | tracing_init_debugfs_percpu(tr, cpu); | 6682 | tracing_init_tracefs_percpu(tr, cpu); |
6515 | 6683 | ||
6516 | } | 6684 | } |
6517 | 6685 | ||
6686 | static struct vfsmount *trace_automount(void *ignore) | ||
6687 | { | ||
6688 | struct vfsmount *mnt; | ||
6689 | struct file_system_type *type; | ||
6690 | |||
6691 | /* | ||
6692 | * To maintain backward compatibility for tools that mount | ||
6693 | * debugfs to get to the tracing facility, tracefs is automatically | ||
6694 | * mounted to the debugfs/tracing directory. | ||
6695 | */ | ||
6696 | type = get_fs_type("tracefs"); | ||
6697 | if (!type) | ||
6698 | return NULL; | ||
6699 | mnt = vfs_kern_mount(type, 0, "tracefs", NULL); | ||
6700 | put_filesystem(type); | ||
6701 | if (IS_ERR(mnt)) | ||
6702 | return NULL; | ||
6703 | mntget(mnt); | ||
6704 | |||
6705 | return mnt; | ||
6706 | } | ||
6707 | |||
6518 | /** | 6708 | /** |
6519 | * tracing_init_dentry - initialize top level trace array | 6709 | * tracing_init_dentry - initialize top level trace array |
6520 | * | 6710 | * |
@@ -6526,23 +6716,112 @@ struct dentry *tracing_init_dentry(void) | |||
6526 | { | 6716 | { |
6527 | struct trace_array *tr = &global_trace; | 6717 | struct trace_array *tr = &global_trace; |
6528 | 6718 | ||
6719 | /* The top level trace array uses NULL as parent */ | ||
6529 | if (tr->dir) | 6720 | if (tr->dir) |
6530 | return tr->dir; | 6721 | return NULL; |
6531 | 6722 | ||
6532 | if (WARN_ON(!debugfs_initialized())) | 6723 | if (WARN_ON(!debugfs_initialized())) |
6533 | return ERR_PTR(-ENODEV); | 6724 | return ERR_PTR(-ENODEV); |
6534 | 6725 | ||
6535 | tr->dir = debugfs_create_dir("tracing", NULL); | 6726 | /* |
6536 | 6727 | * As there may still be users that expect the tracing | |
6728 | * files to exist in debugfs/tracing, we must automount | ||
6729 | * the tracefs file system there, so older tools still | ||
6730 | * work with the newer kernel. | ||
6731 | */ | ||
6732 | tr->dir = debugfs_create_automount("tracing", NULL, | ||
6733 | trace_automount, NULL); | ||
6537 | if (!tr->dir) { | 6734 | if (!tr->dir) { |
6538 | pr_warn_once("Could not create debugfs directory 'tracing'\n"); | 6735 | pr_warn_once("Could not create debugfs directory 'tracing'\n"); |
6539 | return ERR_PTR(-ENOMEM); | 6736 | return ERR_PTR(-ENOMEM); |
6540 | } | 6737 | } |
6541 | 6738 | ||
6542 | return tr->dir; | 6739 | return NULL; |
6740 | } | ||
6741 | |||
6742 | extern struct trace_enum_map *__start_ftrace_enum_maps[]; | ||
6743 | extern struct trace_enum_map *__stop_ftrace_enum_maps[]; | ||
6744 | |||
6745 | static void __init trace_enum_init(void) | ||
6746 | { | ||
6747 | int len; | ||
6748 | |||
6749 | len = __stop_ftrace_enum_maps - __start_ftrace_enum_maps; | ||
6750 | trace_insert_enum_map(NULL, __start_ftrace_enum_maps, len); | ||
6751 | } | ||
6752 | |||
6753 | #ifdef CONFIG_MODULES | ||
6754 | static void trace_module_add_enums(struct module *mod) | ||
6755 | { | ||
6756 | if (!mod->num_trace_enums) | ||
6757 | return; | ||
6758 | |||
6759 | /* | ||
6760 | * Modules with bad taint do not have events created, do | ||
6761 | * not bother with enums either. | ||
6762 | */ | ||
6763 | if (trace_module_has_bad_taint(mod)) | ||
6764 | return; | ||
6765 | |||
6766 | trace_insert_enum_map(mod, mod->trace_enums, mod->num_trace_enums); | ||
6543 | } | 6767 | } |
6544 | 6768 | ||
6545 | static __init int tracer_init_debugfs(void) | 6769 | #ifdef CONFIG_TRACE_ENUM_MAP_FILE |
6770 | static void trace_module_remove_enums(struct module *mod) | ||
6771 | { | ||
6772 | union trace_enum_map_item *map; | ||
6773 | union trace_enum_map_item **last = &trace_enum_maps; | ||
6774 | |||
6775 | if (!mod->num_trace_enums) | ||
6776 | return; | ||
6777 | |||
6778 | mutex_lock(&trace_enum_mutex); | ||
6779 | |||
6780 | map = trace_enum_maps; | ||
6781 | |||
6782 | while (map) { | ||
6783 | if (map->head.mod == mod) | ||
6784 | break; | ||
6785 | map = trace_enum_jmp_to_tail(map); | ||
6786 | last = &map->tail.next; | ||
6787 | map = map->tail.next; | ||
6788 | } | ||
6789 | if (!map) | ||
6790 | goto out; | ||
6791 | |||
6792 | *last = trace_enum_jmp_to_tail(map)->tail.next; | ||
6793 | kfree(map); | ||
6794 | out: | ||
6795 | mutex_unlock(&trace_enum_mutex); | ||
6796 | } | ||
6797 | #else | ||
6798 | static inline void trace_module_remove_enums(struct module *mod) { } | ||
6799 | #endif /* CONFIG_TRACE_ENUM_MAP_FILE */ | ||
6800 | |||
6801 | static int trace_module_notify(struct notifier_block *self, | ||
6802 | unsigned long val, void *data) | ||
6803 | { | ||
6804 | struct module *mod = data; | ||
6805 | |||
6806 | switch (val) { | ||
6807 | case MODULE_STATE_COMING: | ||
6808 | trace_module_add_enums(mod); | ||
6809 | break; | ||
6810 | case MODULE_STATE_GOING: | ||
6811 | trace_module_remove_enums(mod); | ||
6812 | break; | ||
6813 | } | ||
6814 | |||
6815 | return 0; | ||
6816 | } | ||
6817 | |||
6818 | static struct notifier_block trace_module_nb = { | ||
6819 | .notifier_call = trace_module_notify, | ||
6820 | .priority = 0, | ||
6821 | }; | ||
6822 | #endif /* CONFIG_MODULES */ | ||
6823 | |||
6824 | static __init int tracer_init_tracefs(void) | ||
6546 | { | 6825 | { |
6547 | struct dentry *d_tracer; | 6826 | struct dentry *d_tracer; |
6548 | 6827 | ||
@@ -6552,7 +6831,7 @@ static __init int tracer_init_debugfs(void) | |||
6552 | if (IS_ERR(d_tracer)) | 6831 | if (IS_ERR(d_tracer)) |
6553 | return 0; | 6832 | return 0; |
6554 | 6833 | ||
6555 | init_tracer_debugfs(&global_trace, d_tracer); | 6834 | init_tracer_tracefs(&global_trace, d_tracer); |
6556 | 6835 | ||
6557 | trace_create_file("tracing_thresh", 0644, d_tracer, | 6836 | trace_create_file("tracing_thresh", 0644, d_tracer, |
6558 | &global_trace, &tracing_thresh_fops); | 6837 | &global_trace, &tracing_thresh_fops); |
@@ -6566,6 +6845,14 @@ static __init int tracer_init_debugfs(void) | |||
6566 | trace_create_file("saved_cmdlines_size", 0644, d_tracer, | 6845 | trace_create_file("saved_cmdlines_size", 0644, d_tracer, |
6567 | NULL, &tracing_saved_cmdlines_size_fops); | 6846 | NULL, &tracing_saved_cmdlines_size_fops); |
6568 | 6847 | ||
6848 | trace_enum_init(); | ||
6849 | |||
6850 | trace_create_enum_file(d_tracer); | ||
6851 | |||
6852 | #ifdef CONFIG_MODULES | ||
6853 | register_module_notifier(&trace_module_nb); | ||
6854 | #endif | ||
6855 | |||
6569 | #ifdef CONFIG_DYNAMIC_FTRACE | 6856 | #ifdef CONFIG_DYNAMIC_FTRACE |
6570 | trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, | 6857 | trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, |
6571 | &ftrace_update_tot_cnt, &tracing_dyn_info_fops); | 6858 | &ftrace_update_tot_cnt, &tracing_dyn_info_fops); |
@@ -6575,6 +6862,10 @@ static __init int tracer_init_debugfs(void) | |||
6575 | 6862 | ||
6576 | create_trace_options_dir(&global_trace); | 6863 | create_trace_options_dir(&global_trace); |
6577 | 6864 | ||
6865 | /* If the tracer was started via cmdline, create options for it here */ | ||
6866 | if (global_trace.current_trace != &nop_trace) | ||
6867 | update_tracer_options(&global_trace, global_trace.current_trace); | ||
6868 | |||
6578 | return 0; | 6869 | return 0; |
6579 | } | 6870 | } |
6580 | 6871 | ||
@@ -6888,7 +7179,7 @@ void __init trace_init(void) | |||
6888 | tracepoint_printk = 0; | 7179 | tracepoint_printk = 0; |
6889 | } | 7180 | } |
6890 | tracer_alloc_buffers(); | 7181 | tracer_alloc_buffers(); |
6891 | trace_event_init(); | 7182 | trace_event_init(); |
6892 | } | 7183 | } |
6893 | 7184 | ||
6894 | __init static int clear_boot_tracer(void) | 7185 | __init static int clear_boot_tracer(void) |
@@ -6910,5 +7201,5 @@ __init static int clear_boot_tracer(void) | |||
6910 | return 0; | 7201 | return 0; |
6911 | } | 7202 | } |
6912 | 7203 | ||
6913 | fs_initcall(tracer_init_debugfs); | 7204 | fs_initcall(tracer_init_tracefs); |
6914 | late_initcall(clear_boot_tracer); | 7205 | late_initcall(clear_boot_tracer); |
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index dd8205a35760..d2612016de94 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
@@ -334,7 +334,7 @@ struct tracer_flags { | |||
334 | 334 | ||
335 | 335 | ||
336 | /** | 336 | /** |
337 | * struct tracer - a specific tracer and its callbacks to interact with debugfs | 337 | * struct tracer - a specific tracer and its callbacks to interact with tracefs |
338 | * @name: the name chosen to select it on the available_tracers file | 338 | * @name: the name chosen to select it on the available_tracers file |
339 | * @init: called when one switches to this tracer (echo name > current_tracer) | 339 | * @init: called when one switches to this tracer (echo name > current_tracer) |
340 | * @reset: called when one switches to another tracer | 340 | * @reset: called when one switches to another tracer |
@@ -1309,8 +1309,10 @@ static inline void init_ftrace_syscalls(void) { } | |||
1309 | 1309 | ||
1310 | #ifdef CONFIG_EVENT_TRACING | 1310 | #ifdef CONFIG_EVENT_TRACING |
1311 | void trace_event_init(void); | 1311 | void trace_event_init(void); |
1312 | void trace_event_enum_update(struct trace_enum_map **map, int len); | ||
1312 | #else | 1313 | #else |
1313 | static inline void __init trace_event_init(void) { } | 1314 | static inline void __init trace_event_init(void) { } |
1315 | static inline void trace_event_enum_update(struct trace_enum_map **map, int len) { } | ||
1314 | #endif | 1316 | #endif |
1315 | 1317 | ||
1316 | extern struct trace_iterator *tracepoint_print_iter; | 1318 | extern struct trace_iterator *tracepoint_print_iter; |
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index e2d027ac66a2..ee7b94a4810a 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h | |||
@@ -223,7 +223,7 @@ FTRACE_ENTRY(bprint, bprint_entry, | |||
223 | __dynamic_array( u32, buf ) | 223 | __dynamic_array( u32, buf ) |
224 | ), | 224 | ), |
225 | 225 | ||
226 | F_printk("%pf: %s", | 226 | F_printk("%ps: %s", |
227 | (void *)__entry->ip, __entry->fmt), | 227 | (void *)__entry->ip, __entry->fmt), |
228 | 228 | ||
229 | FILTER_OTHER | 229 | FILTER_OTHER |
@@ -238,7 +238,7 @@ FTRACE_ENTRY(print, print_entry, | |||
238 | __dynamic_array( char, buf ) | 238 | __dynamic_array( char, buf ) |
239 | ), | 239 | ), |
240 | 240 | ||
241 | F_printk("%pf: %s", | 241 | F_printk("%ps: %s", |
242 | (void *)__entry->ip, __entry->buf), | 242 | (void *)__entry->ip, __entry->buf), |
243 | 243 | ||
244 | FILTER_OTHER | 244 | FILTER_OTHER |
@@ -253,7 +253,7 @@ FTRACE_ENTRY(bputs, bputs_entry, | |||
253 | __field( const char *, str ) | 253 | __field( const char *, str ) |
254 | ), | 254 | ), |
255 | 255 | ||
256 | F_printk("%pf: %s", | 256 | F_printk("%ps: %s", |
257 | (void *)__entry->ip, __entry->str), | 257 | (void *)__entry->ip, __entry->str), |
258 | 258 | ||
259 | FILTER_OTHER | 259 | FILTER_OTHER |
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index db54dda10ccc..7da1dfeb322e 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
@@ -13,7 +13,7 @@ | |||
13 | #include <linux/workqueue.h> | 13 | #include <linux/workqueue.h> |
14 | #include <linux/spinlock.h> | 14 | #include <linux/spinlock.h> |
15 | #include <linux/kthread.h> | 15 | #include <linux/kthread.h> |
16 | #include <linux/debugfs.h> | 16 | #include <linux/tracefs.h> |
17 | #include <linux/uaccess.h> | 17 | #include <linux/uaccess.h> |
18 | #include <linux/module.h> | 18 | #include <linux/module.h> |
19 | #include <linux/ctype.h> | 19 | #include <linux/ctype.h> |
@@ -480,7 +480,7 @@ static void remove_subsystem(struct ftrace_subsystem_dir *dir) | |||
480 | return; | 480 | return; |
481 | 481 | ||
482 | if (!--dir->nr_events) { | 482 | if (!--dir->nr_events) { |
483 | debugfs_remove_recursive(dir->entry); | 483 | tracefs_remove_recursive(dir->entry); |
484 | list_del(&dir->list); | 484 | list_del(&dir->list); |
485 | __put_system_dir(dir); | 485 | __put_system_dir(dir); |
486 | } | 486 | } |
@@ -499,7 +499,7 @@ static void remove_event_file_dir(struct ftrace_event_file *file) | |||
499 | } | 499 | } |
500 | spin_unlock(&dir->d_lock); | 500 | spin_unlock(&dir->d_lock); |
501 | 501 | ||
502 | debugfs_remove_recursive(dir); | 502 | tracefs_remove_recursive(dir); |
503 | } | 503 | } |
504 | 504 | ||
505 | list_del(&file->list); | 505 | list_del(&file->list); |
@@ -1526,7 +1526,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name, | |||
1526 | } else | 1526 | } else |
1527 | __get_system(system); | 1527 | __get_system(system); |
1528 | 1528 | ||
1529 | dir->entry = debugfs_create_dir(name, parent); | 1529 | dir->entry = tracefs_create_dir(name, parent); |
1530 | if (!dir->entry) { | 1530 | if (!dir->entry) { |
1531 | pr_warn("Failed to create system directory %s\n", name); | 1531 | pr_warn("Failed to create system directory %s\n", name); |
1532 | __put_system(system); | 1532 | __put_system(system); |
@@ -1539,12 +1539,12 @@ event_subsystem_dir(struct trace_array *tr, const char *name, | |||
1539 | dir->subsystem = system; | 1539 | dir->subsystem = system; |
1540 | file->system = dir; | 1540 | file->system = dir; |
1541 | 1541 | ||
1542 | entry = debugfs_create_file("filter", 0644, dir->entry, dir, | 1542 | entry = tracefs_create_file("filter", 0644, dir->entry, dir, |
1543 | &ftrace_subsystem_filter_fops); | 1543 | &ftrace_subsystem_filter_fops); |
1544 | if (!entry) { | 1544 | if (!entry) { |
1545 | kfree(system->filter); | 1545 | kfree(system->filter); |
1546 | system->filter = NULL; | 1546 | system->filter = NULL; |
1547 | pr_warn("Could not create debugfs '%s/filter' entry\n", name); | 1547 | pr_warn("Could not create tracefs '%s/filter' entry\n", name); |
1548 | } | 1548 | } |
1549 | 1549 | ||
1550 | trace_create_file("enable", 0644, dir->entry, dir, | 1550 | trace_create_file("enable", 0644, dir->entry, dir, |
@@ -1585,9 +1585,9 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file) | |||
1585 | d_events = parent; | 1585 | d_events = parent; |
1586 | 1586 | ||
1587 | name = ftrace_event_name(call); | 1587 | name = ftrace_event_name(call); |
1588 | file->dir = debugfs_create_dir(name, d_events); | 1588 | file->dir = tracefs_create_dir(name, d_events); |
1589 | if (!file->dir) { | 1589 | if (!file->dir) { |
1590 | pr_warn("Could not create debugfs '%s' directory\n", name); | 1590 | pr_warn("Could not create tracefs '%s' directory\n", name); |
1591 | return -1; | 1591 | return -1; |
1592 | } | 1592 | } |
1593 | 1593 | ||
@@ -1704,6 +1704,125 @@ __register_event(struct ftrace_event_call *call, struct module *mod) | |||
1704 | return 0; | 1704 | return 0; |
1705 | } | 1705 | } |
1706 | 1706 | ||
1707 | static char *enum_replace(char *ptr, struct trace_enum_map *map, int len) | ||
1708 | { | ||
1709 | int rlen; | ||
1710 | int elen; | ||
1711 | |||
1712 | /* Find the length of the enum value as a string */ | ||
1713 | elen = snprintf(ptr, 0, "%ld", map->enum_value); | ||
1714 | /* Make sure there's enough room to replace the string with the value */ | ||
1715 | if (len < elen) | ||
1716 | return NULL; | ||
1717 | |||
1718 | snprintf(ptr, elen + 1, "%ld", map->enum_value); | ||
1719 | |||
1720 | /* Get the rest of the string of ptr */ | ||
1721 | rlen = strlen(ptr + len); | ||
1722 | memmove(ptr + elen, ptr + len, rlen); | ||
1723 | /* Make sure we end the new string */ | ||
1724 | ptr[elen + rlen] = 0; | ||
1725 | |||
1726 | return ptr + elen; | ||
1727 | } | ||
1728 | |||
1729 | static void update_event_printk(struct ftrace_event_call *call, | ||
1730 | struct trace_enum_map *map) | ||
1731 | { | ||
1732 | char *ptr; | ||
1733 | int quote = 0; | ||
1734 | int len = strlen(map->enum_string); | ||
1735 | |||
1736 | for (ptr = call->print_fmt; *ptr; ptr++) { | ||
1737 | if (*ptr == '\\') { | ||
1738 | ptr++; | ||
1739 | /* paranoid */ | ||
1740 | if (!*ptr) | ||
1741 | break; | ||
1742 | continue; | ||
1743 | } | ||
1744 | if (*ptr == '"') { | ||
1745 | quote ^= 1; | ||
1746 | continue; | ||
1747 | } | ||
1748 | if (quote) | ||
1749 | continue; | ||
1750 | if (isdigit(*ptr)) { | ||
1751 | /* skip numbers */ | ||
1752 | do { | ||
1753 | ptr++; | ||
1754 | /* Check for alpha chars like ULL */ | ||
1755 | } while (isalnum(*ptr)); | ||
1756 | /* | ||
1757 | * A number must have some kind of delimiter after | ||
1758 | * it, and we can ignore that too. | ||
1759 | */ | ||
1760 | continue; | ||
1761 | } | ||
1762 | if (isalpha(*ptr) || *ptr == '_') { | ||
1763 | if (strncmp(map->enum_string, ptr, len) == 0 && | ||
1764 | !isalnum(ptr[len]) && ptr[len] != '_') { | ||
1765 | ptr = enum_replace(ptr, map, len); | ||
1766 | /* Hmm, enum string smaller than value */ | ||
1767 | if (WARN_ON_ONCE(!ptr)) | ||
1768 | return; | ||
1769 | /* | ||
1770 | * No need to decrement here, as enum_replace() | ||
1771 | * returns the pointer to the character past | ||
1772 | * the enum, and two enums can not be placed | ||
1773 | * back to back without something in between. | ||
1774 | * We can skip that something in between. | ||
1775 | */ | ||
1776 | continue; | ||
1777 | } | ||
1778 | skip_more: | ||
1779 | do { | ||
1780 | ptr++; | ||
1781 | } while (isalnum(*ptr) || *ptr == '_'); | ||
1782 | /* | ||
1783 | * If what comes after this variable is a '.' or | ||
1784 | * '->' then we can continue to ignore that string. | ||
1785 | */ | ||
1786 | if (*ptr == '.' || (ptr[0] == '-' && ptr[1] == '>')) { | ||
1787 | ptr += *ptr == '.' ? 1 : 2; | ||
1788 | goto skip_more; | ||
1789 | } | ||
1790 | /* | ||
1791 | * Once again, we can skip the delimiter that came | ||
1792 | * after the string. | ||
1793 | */ | ||
1794 | continue; | ||
1795 | } | ||
1796 | } | ||
1797 | } | ||
1798 | |||
1799 | void trace_event_enum_update(struct trace_enum_map **map, int len) | ||
1800 | { | ||
1801 | struct ftrace_event_call *call, *p; | ||
1802 | const char *last_system = NULL; | ||
1803 | int last_i; | ||
1804 | int i; | ||
1805 | |||
1806 | down_write(&trace_event_sem); | ||
1807 | list_for_each_entry_safe(call, p, &ftrace_events, list) { | ||
1808 | /* events are usually grouped together with systems */ | ||
1809 | if (!last_system || call->class->system != last_system) { | ||
1810 | last_i = 0; | ||
1811 | last_system = call->class->system; | ||
1812 | } | ||
1813 | |||
1814 | for (i = last_i; i < len; i++) { | ||
1815 | if (call->class->system == map[i]->system) { | ||
1816 | /* Save the first system if need be */ | ||
1817 | if (!last_i) | ||
1818 | last_i = i; | ||
1819 | update_event_printk(call, map[i]); | ||
1820 | } | ||
1821 | } | ||
1822 | } | ||
1823 | up_write(&trace_event_sem); | ||
1824 | } | ||
1825 | |||
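The print_fmt rewrite above works in place: the enum name is overwritten with its decimal value and the rest of the string is shifted left, which is safe because the value text is never allowed to be longer than the name. A small userspace sketch of the same string surgery follows; the helper name and example string are invented, and this is not kernel code.

    #include <stdio.h>
    #include <string.h>

    /* Replace 'len' bytes at 'ptr' with the decimal form of 'value', in place. */
    static char *replace_with_value(char *ptr, long value, int len)
    {
        int elen = snprintf(NULL, 0, "%ld", value); /* length of the value as text */
        int rlen;

        if (len < elen)                             /* value longer than the name: give up */
            return NULL;

        snprintf(ptr, elen + 1, "%ld", value);      /* write the value (plus a NUL) */
        rlen = strlen(ptr + len);                   /* rest of the original string */
        memmove(ptr + elen, ptr + len, rlen + 1);   /* shift it left, keeping the NUL */

        return ptr + elen;                          /* first character past the new value */
    }

    int main(void)
    {
        char fmt[] = "mode=%u (HRTIMER_MODE_ABS means absolute)";
        char *name = strstr(fmt, "HRTIMER_MODE_ABS");

        if (name && replace_with_value(name, 0, strlen("HRTIMER_MODE_ABS")))
            printf("%s\n", fmt);                    /* prints: mode=%u (0 means absolute) */
        return 0;
    }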
1707 | static struct ftrace_event_file * | 1826 | static struct ftrace_event_file * |
1708 | trace_create_new_event(struct ftrace_event_call *call, | 1827 | trace_create_new_event(struct ftrace_event_call *call, |
1709 | struct trace_array *tr) | 1828 | struct trace_array *tr) |
@@ -1915,7 +2034,7 @@ static int trace_module_notify(struct notifier_block *self, | |||
1915 | 2034 | ||
1916 | static struct notifier_block trace_module_nb = { | 2035 | static struct notifier_block trace_module_nb = { |
1917 | .notifier_call = trace_module_notify, | 2036 | .notifier_call = trace_module_notify, |
1918 | .priority = 0, | 2037 | .priority = 1, /* higher than trace.c module notify */ |
1919 | }; | 2038 | }; |
1920 | #endif /* CONFIG_MODULES */ | 2039 | #endif /* CONFIG_MODULES */ |
1921 | 2040 | ||
@@ -2228,7 +2347,7 @@ static inline int register_event_cmds(void) { return 0; } | |||
2228 | /* | 2347 | /* |
2229 | * The top level array has already had its ftrace_event_file | 2348 | * The top level array has already had its ftrace_event_file |
2230 | * descriptors created in order to allow for early events to | 2349 | * descriptors created in order to allow for early events to |
2231 | * be recorded. This function is called after the debugfs has been | 2350 | * be recorded. This function is called after the tracefs has been |
2232 | * initialized, and we now have to create the files associated | 2351 | * initialized, and we now have to create the files associated |
2233 | * to the events. | 2352 | * to the events. |
2234 | */ | 2353 | */ |
@@ -2311,16 +2430,16 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) | |||
2311 | struct dentry *d_events; | 2430 | struct dentry *d_events; |
2312 | struct dentry *entry; | 2431 | struct dentry *entry; |
2313 | 2432 | ||
2314 | entry = debugfs_create_file("set_event", 0644, parent, | 2433 | entry = tracefs_create_file("set_event", 0644, parent, |
2315 | tr, &ftrace_set_event_fops); | 2434 | tr, &ftrace_set_event_fops); |
2316 | if (!entry) { | 2435 | if (!entry) { |
2317 | pr_warn("Could not create debugfs 'set_event' entry\n"); | 2436 | pr_warn("Could not create tracefs 'set_event' entry\n"); |
2318 | return -ENOMEM; | 2437 | return -ENOMEM; |
2319 | } | 2438 | } |
2320 | 2439 | ||
2321 | d_events = debugfs_create_dir("events", parent); | 2440 | d_events = tracefs_create_dir("events", parent); |
2322 | if (!d_events) { | 2441 | if (!d_events) { |
2323 | pr_warn("Could not create debugfs 'events' directory\n"); | 2442 | pr_warn("Could not create tracefs 'events' directory\n"); |
2324 | return -ENOMEM; | 2443 | return -ENOMEM; |
2325 | } | 2444 | } |
2326 | 2445 | ||
@@ -2412,7 +2531,7 @@ int event_trace_del_tracer(struct trace_array *tr) | |||
2412 | 2531 | ||
2413 | down_write(&trace_event_sem); | 2532 | down_write(&trace_event_sem); |
2414 | __trace_remove_event_dirs(tr); | 2533 | __trace_remove_event_dirs(tr); |
2415 | debugfs_remove_recursive(tr->event_dir); | 2534 | tracefs_remove_recursive(tr->event_dir); |
2416 | up_write(&trace_event_sem); | 2535 | up_write(&trace_event_sem); |
2417 | 2536 | ||
2418 | tr->event_dir = NULL; | 2537 | tr->event_dir = NULL; |
@@ -2534,10 +2653,10 @@ static __init int event_trace_init(void) | |||
2534 | if (IS_ERR(d_tracer)) | 2653 | if (IS_ERR(d_tracer)) |
2535 | return 0; | 2654 | return 0; |
2536 | 2655 | ||
2537 | entry = debugfs_create_file("available_events", 0444, d_tracer, | 2656 | entry = tracefs_create_file("available_events", 0444, d_tracer, |
2538 | tr, &ftrace_avail_fops); | 2657 | tr, &ftrace_avail_fops); |
2539 | if (!entry) | 2658 | if (!entry) |
2540 | pr_warn("Could not create debugfs 'available_events' entry\n"); | 2659 | pr_warn("Could not create tracefs 'available_events' entry\n"); |
2541 | 2660 | ||
2542 | if (trace_define_common_fields()) | 2661 | if (trace_define_common_fields()) |
2543 | pr_warn("tracing: Failed to allocate common fields"); | 2662 | pr_warn("tracing: Failed to allocate common fields"); |
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 12e2b99be862..174a6a71146c 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c | |||
@@ -177,7 +177,7 @@ struct ftrace_event_call __used event_##call = { \ | |||
177 | }, \ | 177 | }, \ |
178 | .event.type = etype, \ | 178 | .event.type = etype, \ |
179 | .print_fmt = print, \ | 179 | .print_fmt = print, \ |
180 | .flags = TRACE_EVENT_FL_IGNORE_ENABLE | TRACE_EVENT_FL_USE_CALL_FILTER, \ | 180 | .flags = TRACE_EVENT_FL_IGNORE_ENABLE, \ |
181 | }; \ | 181 | }; \ |
182 | struct ftrace_event_call __used \ | 182 | struct ftrace_event_call __used \ |
183 | __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; | 183 | __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; |
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 2d25ad1526bb..9cfea4c6d314 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c | |||
@@ -6,7 +6,6 @@ | |||
6 | * is Copyright (c) Steven Rostedt <srostedt@redhat.com> | 6 | * is Copyright (c) Steven Rostedt <srostedt@redhat.com> |
7 | * | 7 | * |
8 | */ | 8 | */ |
9 | #include <linux/debugfs.h> | ||
10 | #include <linux/uaccess.h> | 9 | #include <linux/uaccess.h> |
11 | #include <linux/ftrace.h> | 10 | #include <linux/ftrace.h> |
12 | #include <linux/slab.h> | 11 | #include <linux/slab.h> |
@@ -151,7 +150,7 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, | |||
151 | * The curr_ret_stack is initialized to -1 and get increased | 150 | * The curr_ret_stack is initialized to -1 and get increased |
152 | * in this function. So it can be less than -1 only if it was | 151 | * in this function. So it can be less than -1 only if it was |
153 | * filtered out via ftrace_graph_notrace_addr() which can be | 152 | * filtered out via ftrace_graph_notrace_addr() which can be |
154 | * set from set_graph_notrace file in debugfs by user. | 153 | * set from set_graph_notrace file in tracefs by user. |
155 | */ | 154 | */ |
156 | if (current->curr_ret_stack < -1) | 155 | if (current->curr_ret_stack < -1) |
157 | return -EBUSY; | 156 | return -EBUSY; |
@@ -1432,7 +1431,7 @@ static const struct file_operations graph_depth_fops = { | |||
1432 | .llseek = generic_file_llseek, | 1431 | .llseek = generic_file_llseek, |
1433 | }; | 1432 | }; |
1434 | 1433 | ||
1435 | static __init int init_graph_debugfs(void) | 1434 | static __init int init_graph_tracefs(void) |
1436 | { | 1435 | { |
1437 | struct dentry *d_tracer; | 1436 | struct dentry *d_tracer; |
1438 | 1437 | ||
@@ -1445,7 +1444,7 @@ static __init int init_graph_debugfs(void) | |||
1445 | 1444 | ||
1446 | return 0; | 1445 | return 0; |
1447 | } | 1446 | } |
1448 | fs_initcall(init_graph_debugfs); | 1447 | fs_initcall(init_graph_tracefs); |
1449 | 1448 | ||
1450 | static __init int init_graph_trace(void) | 1449 | static __init int init_graph_trace(void) |
1451 | { | 1450 | { |
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index d73f565b4e06..d0ce590f06e1 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c | |||
@@ -250,7 +250,7 @@ DEFINE_FETCH_symbol(string_size) | |||
250 | #define fetch_file_offset_string_size NULL | 250 | #define fetch_file_offset_string_size NULL |
251 | 251 | ||
252 | /* Fetch type information table */ | 252 | /* Fetch type information table */ |
253 | const struct fetch_type kprobes_fetch_type_table[] = { | 253 | static const struct fetch_type kprobes_fetch_type_table[] = { |
254 | /* Special types */ | 254 | /* Special types */ |
255 | [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, | 255 | [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, |
256 | sizeof(u32), 1, "__data_loc char[]"), | 256 | sizeof(u32), 1, "__data_loc char[]"), |
@@ -760,7 +760,8 @@ static int create_trace_kprobe(int argc, char **argv) | |||
760 | 760 | ||
761 | /* Parse fetch argument */ | 761 | /* Parse fetch argument */ |
762 | ret = traceprobe_parse_probe_arg(arg, &tk->tp.size, parg, | 762 | ret = traceprobe_parse_probe_arg(arg, &tk->tp.size, parg, |
763 | is_return, true); | 763 | is_return, true, |
764 | kprobes_fetch_type_table); | ||
764 | if (ret) { | 765 | if (ret) { |
765 | pr_info("Parse error at argument[%d]. (%d)\n", i, ret); | 766 | pr_info("Parse error at argument[%d]. (%d)\n", i, ret); |
766 | goto error; | 767 | goto error; |
@@ -1134,11 +1135,15 @@ static void | |||
1134 | kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) | 1135 | kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) |
1135 | { | 1136 | { |
1136 | struct ftrace_event_call *call = &tk->tp.call; | 1137 | struct ftrace_event_call *call = &tk->tp.call; |
1138 | struct bpf_prog *prog = call->prog; | ||
1137 | struct kprobe_trace_entry_head *entry; | 1139 | struct kprobe_trace_entry_head *entry; |
1138 | struct hlist_head *head; | 1140 | struct hlist_head *head; |
1139 | int size, __size, dsize; | 1141 | int size, __size, dsize; |
1140 | int rctx; | 1142 | int rctx; |
1141 | 1143 | ||
1144 | if (prog && !trace_call_bpf(prog, regs)) | ||
1145 | return; | ||
1146 | |||
1142 | head = this_cpu_ptr(call->perf_events); | 1147 | head = this_cpu_ptr(call->perf_events); |
1143 | if (hlist_empty(head)) | 1148 | if (hlist_empty(head)) |
1144 | return; | 1149 | return; |
@@ -1165,11 +1170,15 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, | |||
1165 | struct pt_regs *regs) | 1170 | struct pt_regs *regs) |
1166 | { | 1171 | { |
1167 | struct ftrace_event_call *call = &tk->tp.call; | 1172 | struct ftrace_event_call *call = &tk->tp.call; |
1173 | struct bpf_prog *prog = call->prog; | ||
1168 | struct kretprobe_trace_entry_head *entry; | 1174 | struct kretprobe_trace_entry_head *entry; |
1169 | struct hlist_head *head; | 1175 | struct hlist_head *head; |
1170 | int size, __size, dsize; | 1176 | int size, __size, dsize; |
1171 | int rctx; | 1177 | int rctx; |
1172 | 1178 | ||
1179 | if (prog && !trace_call_bpf(prog, regs)) | ||
1180 | return; | ||
1181 | |||
1173 | head = this_cpu_ptr(call->perf_events); | 1182 | head = this_cpu_ptr(call->perf_events); |
1174 | if (hlist_empty(head)) | 1183 | if (hlist_empty(head)) |
1175 | return; | 1184 | return; |
@@ -1286,7 +1295,7 @@ static int register_kprobe_event(struct trace_kprobe *tk) | |||
1286 | kfree(call->print_fmt); | 1295 | kfree(call->print_fmt); |
1287 | return -ENODEV; | 1296 | return -ENODEV; |
1288 | } | 1297 | } |
1289 | call->flags = 0; | 1298 | call->flags = TRACE_EVENT_FL_KPROBE; |
1290 | call->class->reg = kprobe_register; | 1299 | call->class->reg = kprobe_register; |
1291 | call->data = tk; | 1300 | call->data = tk; |
1292 | ret = trace_add_event_call(call); | 1301 | ret = trace_add_event_call(call); |
@@ -1310,7 +1319,7 @@ static int unregister_kprobe_event(struct trace_kprobe *tk) | |||
1310 | return ret; | 1319 | return ret; |
1311 | } | 1320 | } |
1312 | 1321 | ||
1313 | /* Make a debugfs interface for controlling probe points */ | 1322 | /* Make a tracefs interface for controlling probe points */ |
1314 | static __init int init_kprobe_trace(void) | 1323 | static __init int init_kprobe_trace(void) |
1315 | { | 1324 | { |
1316 | struct dentry *d_tracer; | 1325 | struct dentry *d_tracer; |
@@ -1323,20 +1332,20 @@ static __init int init_kprobe_trace(void) | |||
1323 | if (IS_ERR(d_tracer)) | 1332 | if (IS_ERR(d_tracer)) |
1324 | return 0; | 1333 | return 0; |
1325 | 1334 | ||
1326 | entry = debugfs_create_file("kprobe_events", 0644, d_tracer, | 1335 | entry = tracefs_create_file("kprobe_events", 0644, d_tracer, |
1327 | NULL, &kprobe_events_ops); | 1336 | NULL, &kprobe_events_ops); |
1328 | 1337 | ||
1329 | /* Event list interface */ | 1338 | /* Event list interface */ |
1330 | if (!entry) | 1339 | if (!entry) |
1331 | pr_warning("Could not create debugfs " | 1340 | pr_warning("Could not create tracefs " |
1332 | "'kprobe_events' entry\n"); | 1341 | "'kprobe_events' entry\n"); |
1333 | 1342 | ||
1334 | /* Profile interface */ | 1343 | /* Profile interface */ |
1335 | entry = debugfs_create_file("kprobe_profile", 0444, d_tracer, | 1344 | entry = tracefs_create_file("kprobe_profile", 0444, d_tracer, |
1336 | NULL, &kprobe_profile_ops); | 1345 | NULL, &kprobe_profile_ops); |
1337 | 1346 | ||
1338 | if (!entry) | 1347 | if (!entry) |
1339 | pr_warning("Could not create debugfs " | 1348 | pr_warning("Could not create tracefs " |
1340 | "'kprobe_profile' entry\n"); | 1349 | "'kprobe_profile' entry\n"); |
1341 | return 0; | 1350 | return 0; |
1342 | } | 1351 | } |
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index b983b2fd2ca1..1769a81da8a7 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c | |||
@@ -356,17 +356,14 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, | |||
356 | 356 | ||
357 | /* Recursive argument parser */ | 357 | /* Recursive argument parser */ |
358 | static int parse_probe_arg(char *arg, const struct fetch_type *t, | 358 | static int parse_probe_arg(char *arg, const struct fetch_type *t, |
359 | struct fetch_param *f, bool is_return, bool is_kprobe) | 359 | struct fetch_param *f, bool is_return, bool is_kprobe, |
360 | const struct fetch_type *ftbl) | ||
360 | { | 361 | { |
361 | const struct fetch_type *ftbl; | ||
362 | unsigned long param; | 362 | unsigned long param; |
363 | long offset; | 363 | long offset; |
364 | char *tmp; | 364 | char *tmp; |
365 | int ret = 0; | 365 | int ret = 0; |
366 | 366 | ||
367 | ftbl = is_kprobe ? kprobes_fetch_type_table : uprobes_fetch_type_table; | ||
368 | BUG_ON(ftbl == NULL); | ||
369 | |||
370 | switch (arg[0]) { | 367 | switch (arg[0]) { |
371 | case '$': | 368 | case '$': |
372 | ret = parse_probe_vars(arg + 1, t, f, is_return, is_kprobe); | 369 | ret = parse_probe_vars(arg + 1, t, f, is_return, is_kprobe); |
@@ -447,7 +444,7 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t, | |||
447 | dprm->fetch_size = get_fetch_size_function(t, | 444 | dprm->fetch_size = get_fetch_size_function(t, |
448 | dprm->fetch, ftbl); | 445 | dprm->fetch, ftbl); |
449 | ret = parse_probe_arg(arg, t2, &dprm->orig, is_return, | 446 | ret = parse_probe_arg(arg, t2, &dprm->orig, is_return, |
450 | is_kprobe); | 447 | is_kprobe, ftbl); |
451 | if (ret) | 448 | if (ret) |
452 | kfree(dprm); | 449 | kfree(dprm); |
453 | else { | 450 | else { |
@@ -505,15 +502,12 @@ static int __parse_bitfield_probe_arg(const char *bf, | |||
505 | 502 | ||
506 | /* String length checking wrapper */ | 503 | /* String length checking wrapper */ |
507 | int traceprobe_parse_probe_arg(char *arg, ssize_t *size, | 504 | int traceprobe_parse_probe_arg(char *arg, ssize_t *size, |
508 | struct probe_arg *parg, bool is_return, bool is_kprobe) | 505 | struct probe_arg *parg, bool is_return, bool is_kprobe, |
506 | const struct fetch_type *ftbl) | ||
509 | { | 507 | { |
510 | const struct fetch_type *ftbl; | ||
511 | const char *t; | 508 | const char *t; |
512 | int ret; | 509 | int ret; |
513 | 510 | ||
514 | ftbl = is_kprobe ? kprobes_fetch_type_table : uprobes_fetch_type_table; | ||
515 | BUG_ON(ftbl == NULL); | ||
516 | |||
517 | if (strlen(arg) > MAX_ARGSTR_LEN) { | 511 | if (strlen(arg) > MAX_ARGSTR_LEN) { |
518 | pr_info("Argument is too long.: %s\n", arg); | 512 | pr_info("Argument is too long.: %s\n", arg); |
519 | return -ENOSPC; | 513 | return -ENOSPC; |
@@ -535,7 +529,8 @@ int traceprobe_parse_probe_arg(char *arg, ssize_t *size, | |||
535 | } | 529 | } |
536 | parg->offset = *size; | 530 | parg->offset = *size; |
537 | *size += parg->type->size; | 531 | *size += parg->type->size; |
538 | ret = parse_probe_arg(arg, parg->type, &parg->fetch, is_return, is_kprobe); | 532 | ret = parse_probe_arg(arg, parg->type, &parg->fetch, is_return, |
533 | is_kprobe, ftbl); | ||
539 | 534 | ||
540 | if (ret >= 0 && t != NULL) | 535 | if (ret >= 0 && t != NULL) |
541 | ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch); | 536 | ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch); |
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 4f815fbce16d..ab283e146b70 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h | |||
@@ -25,7 +25,7 @@ | |||
25 | #include <linux/seq_file.h> | 25 | #include <linux/seq_file.h> |
26 | #include <linux/slab.h> | 26 | #include <linux/slab.h> |
27 | #include <linux/smp.h> | 27 | #include <linux/smp.h> |
28 | #include <linux/debugfs.h> | 28 | #include <linux/tracefs.h> |
29 | #include <linux/types.h> | 29 | #include <linux/types.h> |
30 | #include <linux/string.h> | 30 | #include <linux/string.h> |
31 | #include <linux/ctype.h> | 31 | #include <linux/ctype.h> |
@@ -229,13 +229,6 @@ ASSIGN_FETCH_FUNC(file_offset, ftype), \ | |||
229 | #define FETCH_TYPE_STRING 0 | 229 | #define FETCH_TYPE_STRING 0 |
230 | #define FETCH_TYPE_STRSIZE 1 | 230 | #define FETCH_TYPE_STRSIZE 1 |
231 | 231 | ||
232 | /* | ||
233 | * Fetch type information table. | ||
234 | * It's declared as a weak symbol due to conditional compilation. | ||
235 | */ | ||
236 | extern __weak const struct fetch_type kprobes_fetch_type_table[]; | ||
237 | extern __weak const struct fetch_type uprobes_fetch_type_table[]; | ||
238 | |||
239 | #ifdef CONFIG_KPROBE_EVENT | 232 | #ifdef CONFIG_KPROBE_EVENT |
240 | struct symbol_cache; | 233 | struct symbol_cache; |
241 | unsigned long update_symbol_cache(struct symbol_cache *sc); | 234 | unsigned long update_symbol_cache(struct symbol_cache *sc); |
@@ -333,7 +326,8 @@ find_event_file_link(struct trace_probe *tp, struct ftrace_event_file *file) | |||
333 | } | 326 | } |
334 | 327 | ||
335 | extern int traceprobe_parse_probe_arg(char *arg, ssize_t *size, | 328 | extern int traceprobe_parse_probe_arg(char *arg, ssize_t *size, |
336 | struct probe_arg *parg, bool is_return, bool is_kprobe); | 329 | struct probe_arg *parg, bool is_return, bool is_kprobe, |
330 | const struct fetch_type *ftbl); | ||
337 | 331 | ||
338 | extern int traceprobe_conflict_field_name(const char *name, | 332 | extern int traceprobe_conflict_field_name(const char *name, |
339 | struct probe_arg *args, int narg); | 333 | struct probe_arg *args, int narg); |
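The trace_probe changes above replace the weak-symbol table lookup with an explicit table argument, so the parser no longer needs to know whether it is serving kprobes or uprobes. A minimal userspace sketch of that explicit-table style follows; the types, entries, and names are invented for illustration.

    #include <stdio.h>
    #include <string.h>

    struct fetch_type { const char *name; int size; };

    static const struct fetch_type kprobe_types[] = {
        { "u32", 4 }, { "u64", 8 }, { NULL, 0 }
    };

    /* The caller hands in its own table; nothing global is consulted. */
    static const struct fetch_type *find_type(const struct fetch_type *ftbl,
                                              const char *name)
    {
        for (; ftbl->name; ftbl++)
            if (strcmp(ftbl->name, name) == 0)
                return ftbl;
        return NULL;
    }

    int main(void)
    {
        const struct fetch_type *t = find_type(kprobe_types, "u64");

        if (t)
            printf("%s: %d bytes\n", t->name, t->size);
        return 0;
    }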
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index c3e4fcfddd45..3f34496244e9 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c | |||
@@ -327,11 +327,11 @@ static void t_stop(struct seq_file *m, void *p) | |||
327 | local_irq_enable(); | 327 | local_irq_enable(); |
328 | } | 328 | } |
329 | 329 | ||
330 | static int trace_lookup_stack(struct seq_file *m, long i) | 330 | static void trace_lookup_stack(struct seq_file *m, long i) |
331 | { | 331 | { |
332 | unsigned long addr = stack_dump_trace[i]; | 332 | unsigned long addr = stack_dump_trace[i]; |
333 | 333 | ||
334 | return seq_printf(m, "%pS\n", (void *)addr); | 334 | seq_printf(m, "%pS\n", (void *)addr); |
335 | } | 335 | } |
336 | 336 | ||
337 | static void print_disabled(struct seq_file *m) | 337 | static void print_disabled(struct seq_file *m) |
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 75e19e86c954..6cf935316769 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c | |||
@@ -12,7 +12,7 @@ | |||
12 | #include <linux/list.h> | 12 | #include <linux/list.h> |
13 | #include <linux/slab.h> | 13 | #include <linux/slab.h> |
14 | #include <linux/rbtree.h> | 14 | #include <linux/rbtree.h> |
15 | #include <linux/debugfs.h> | 15 | #include <linux/tracefs.h> |
16 | #include "trace_stat.h" | 16 | #include "trace_stat.h" |
17 | #include "trace.h" | 17 | #include "trace.h" |
18 | 18 | ||
@@ -65,7 +65,7 @@ static void reset_stat_session(struct stat_session *session) | |||
65 | 65 | ||
66 | static void destroy_session(struct stat_session *session) | 66 | static void destroy_session(struct stat_session *session) |
67 | { | 67 | { |
68 | debugfs_remove(session->file); | 68 | tracefs_remove(session->file); |
69 | __reset_stat_session(session); | 69 | __reset_stat_session(session); |
70 | mutex_destroy(&session->stat_mutex); | 70 | mutex_destroy(&session->stat_mutex); |
71 | kfree(session); | 71 | kfree(session); |
@@ -279,9 +279,9 @@ static int tracing_stat_init(void) | |||
279 | if (IS_ERR(d_tracing)) | 279 | if (IS_ERR(d_tracing)) |
280 | return 0; | 280 | return 0; |
281 | 281 | ||
282 | stat_dir = debugfs_create_dir("trace_stat", d_tracing); | 282 | stat_dir = tracefs_create_dir("trace_stat", d_tracing); |
283 | if (!stat_dir) | 283 | if (!stat_dir) |
284 | pr_warning("Could not create debugfs " | 284 | pr_warning("Could not create tracefs " |
285 | "'trace_stat' entry\n"); | 285 | "'trace_stat' entry\n"); |
286 | return 0; | 286 | return 0; |
287 | } | 287 | } |
@@ -291,7 +291,7 @@ static int init_stat_file(struct stat_session *session) | |||
291 | if (!stat_dir && tracing_stat_init()) | 291 | if (!stat_dir && tracing_stat_init()) |
292 | return -ENODEV; | 292 | return -ENODEV; |
293 | 293 | ||
294 | session->file = debugfs_create_file(session->ts->name, 0644, | 294 | session->file = tracefs_create_file(session->ts->name, 0644, |
295 | stat_dir, | 295 | stat_dir, |
296 | session, &tracing_stat_fops); | 296 | session, &tracing_stat_fops); |
297 | if (!session->file) | 297 | if (!session->file) |
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 7dc1c8abecd6..d60fe62ec4fa 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c | |||
@@ -196,7 +196,7 @@ DEFINE_FETCH_file_offset(string) | |||
196 | DEFINE_FETCH_file_offset(string_size) | 196 | DEFINE_FETCH_file_offset(string_size) |
197 | 197 | ||
198 | /* Fetch type information table */ | 198 | /* Fetch type information table */ |
199 | const struct fetch_type uprobes_fetch_type_table[] = { | 199 | static const struct fetch_type uprobes_fetch_type_table[] = { |
200 | /* Special types */ | 200 | /* Special types */ |
201 | [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, | 201 | [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, |
202 | sizeof(u32), 1, "__data_loc char[]"), | 202 | sizeof(u32), 1, "__data_loc char[]"), |
@@ -535,7 +535,8 @@ static int create_trace_uprobe(int argc, char **argv) | |||
535 | 535 | ||
536 | /* Parse fetch argument */ | 536 | /* Parse fetch argument */ |
537 | ret = traceprobe_parse_probe_arg(arg, &tu->tp.size, parg, | 537 | ret = traceprobe_parse_probe_arg(arg, &tu->tp.size, parg, |
538 | is_return, false); | 538 | is_return, false, |
539 | uprobes_fetch_type_table); | ||
539 | if (ret) { | 540 | if (ret) { |
540 | pr_info("Parse error at argument[%d]. (%d)\n", i, ret); | 541 | pr_info("Parse error at argument[%d]. (%d)\n", i, ret); |
541 | goto error; | 542 | goto error; |
@@ -1005,7 +1006,7 @@ __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm) | |||
1005 | return true; | 1006 | return true; |
1006 | 1007 | ||
1007 | list_for_each_entry(event, &filter->perf_events, hw.tp_list) { | 1008 | list_for_each_entry(event, &filter->perf_events, hw.tp_list) { |
1008 | if (event->hw.tp_target->mm == mm) | 1009 | if (event->hw.target->mm == mm) |
1009 | return true; | 1010 | return true; |
1010 | } | 1011 | } |
1011 | 1012 | ||
@@ -1015,7 +1016,7 @@ __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm) | |||
1015 | static inline bool | 1016 | static inline bool |
1016 | uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event) | 1017 | uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event) |
1017 | { | 1018 | { |
1018 | return __uprobe_perf_filter(&tu->filter, event->hw.tp_target->mm); | 1019 | return __uprobe_perf_filter(&tu->filter, event->hw.target->mm); |
1019 | } | 1020 | } |
1020 | 1021 | ||
1021 | static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) | 1022 | static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) |
@@ -1023,10 +1024,10 @@ static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) | |||
1023 | bool done; | 1024 | bool done; |
1024 | 1025 | ||
1025 | write_lock(&tu->filter.rwlock); | 1026 | write_lock(&tu->filter.rwlock); |
1026 | if (event->hw.tp_target) { | 1027 | if (event->hw.target) { |
1027 | list_del(&event->hw.tp_list); | 1028 | list_del(&event->hw.tp_list); |
1028 | done = tu->filter.nr_systemwide || | 1029 | done = tu->filter.nr_systemwide || |
1029 | (event->hw.tp_target->flags & PF_EXITING) || | 1030 | (event->hw.target->flags & PF_EXITING) || |
1030 | uprobe_filter_event(tu, event); | 1031 | uprobe_filter_event(tu, event); |
1031 | } else { | 1032 | } else { |
1032 | tu->filter.nr_systemwide--; | 1033 | tu->filter.nr_systemwide--; |
@@ -1046,7 +1047,7 @@ static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) | |||
1046 | int err; | 1047 | int err; |
1047 | 1048 | ||
1048 | write_lock(&tu->filter.rwlock); | 1049 | write_lock(&tu->filter.rwlock); |
1049 | if (event->hw.tp_target) { | 1050 | if (event->hw.target) { |
1050 | /* | 1051 | /* |
1051 | * event->parent != NULL means copy_process(), we can avoid | 1052 | * event->parent != NULL means copy_process(), we can avoid |
1052 | * uprobe_apply(). current->mm must be probed and we can rely | 1053 | * uprobe_apply(). current->mm must be probed and we can rely |
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 3174bf8e3538..2316f50b07a4 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
@@ -24,8 +24,33 @@ | |||
24 | #include <linux/kvm_para.h> | 24 | #include <linux/kvm_para.h> |
25 | #include <linux/perf_event.h> | 25 | #include <linux/perf_event.h> |
26 | 26 | ||
27 | int watchdog_user_enabled = 1; | 27 | /* |
28 | * The run state of the lockup detectors is controlled by the content of the | ||
29 | * 'watchdog_enabled' variable. Each lockup detector has its dedicated bit - | ||
30 | * bit 0 for the hard lockup detector and bit 1 for the soft lockup detector. | ||
31 | * | ||
32 | * 'watchdog_user_enabled', 'nmi_watchdog_enabled' and 'soft_watchdog_enabled' | ||
33 | * are variables that are only used as an 'interface' between the parameters | ||
34 | * in /proc/sys/kernel and the internal state bits in 'watchdog_enabled'. The | ||
35 | * 'watchdog_thresh' variable is handled differently because its value is not | ||
36 | * boolean, and the lockup detectors are 'suspended' while 'watchdog_thresh' | ||
37 | * is equal to zero. | ||
38 | */ | ||
39 | #define NMI_WATCHDOG_ENABLED_BIT 0 | ||
40 | #define SOFT_WATCHDOG_ENABLED_BIT 1 | ||
41 | #define NMI_WATCHDOG_ENABLED (1 << NMI_WATCHDOG_ENABLED_BIT) | ||
42 | #define SOFT_WATCHDOG_ENABLED (1 << SOFT_WATCHDOG_ENABLED_BIT) | ||
43 | |||
44 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | ||
45 | static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED; | ||
46 | #else | ||
47 | static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED; | ||
48 | #endif | ||
49 | int __read_mostly nmi_watchdog_enabled; | ||
50 | int __read_mostly soft_watchdog_enabled; | ||
51 | int __read_mostly watchdog_user_enabled; | ||
28 | int __read_mostly watchdog_thresh = 10; | 52 | int __read_mostly watchdog_thresh = 10; |
53 | |||
29 | #ifdef CONFIG_SMP | 54 | #ifdef CONFIG_SMP |
30 | int __read_mostly sysctl_softlockup_all_cpu_backtrace; | 55 | int __read_mostly sysctl_softlockup_all_cpu_backtrace; |
31 | #else | 56 | #else |
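The comment block introduced above packs the run state of both lockup detectors into 'watchdog_enabled', with the three /proc variables acting only as an interface to those bits. As a minimal user-space sketch of that mapping (the bit macros are copied from the hunk; the helper and main() are illustrative assumptions, not kernel code):

#include <stdio.h>

/* Bit layout copied from the hunk above. */
#define NMI_WATCHDOG_ENABLED_BIT   0
#define SOFT_WATCHDOG_ENABLED_BIT  1
#define NMI_WATCHDOG_ENABLED   (1UL << NMI_WATCHDOG_ENABLED_BIT)
#define SOFT_WATCHDOG_ENABLED  (1UL << SOFT_WATCHDOG_ENABLED_BIT)

static unsigned long watchdog_enabled = SOFT_WATCHDOG_ENABLED | NMI_WATCHDOG_ENABLED;

/* Model of what the /proc handlers report: each knob reads back its bit(s). */
static void show_proc_view(void)
{
	int nmi  = !!(watchdog_enabled & NMI_WATCHDOG_ENABLED);
	int soft = !!(watchdog_enabled & SOFT_WATCHDOG_ENABLED);
	int both = !!(watchdog_enabled & (NMI_WATCHDOG_ENABLED | SOFT_WATCHDOG_ENABLED));

	printf("nmi_watchdog=%d soft_watchdog=%d watchdog=%d\n", nmi, soft, both);
}

int main(void)
{
	show_proc_view();                          /* 1 1 1 */
	watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; /* hardlockup_detector_disable() */
	show_proc_view();                          /* 0 1 1 */
	watchdog_enabled = 0;                      /* "nowatchdog" boot parameter */
	show_proc_view();                          /* 0 0 0 */
	return 0;
}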
@@ -58,8 +83,6 @@ static unsigned long soft_lockup_nmi_warn; | |||
58 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | 83 | #ifdef CONFIG_HARDLOCKUP_DETECTOR |
59 | static int hardlockup_panic = | 84 | static int hardlockup_panic = |
60 | CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; | 85 | CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; |
61 | |||
62 | static bool hardlockup_detector_enabled = true; | ||
63 | /* | 86 | /* |
64 | * We may not want to enable hard lockup detection by default in all cases, | 87 | * We may not want to enable hard lockup detection by default in all cases, |
65 | * for example when running the kernel as a guest on a hypervisor. In these | 88 | * for example when running the kernel as a guest on a hypervisor. In these |
@@ -68,14 +91,9 @@ static bool hardlockup_detector_enabled = true; | |||
68 | * kernel command line parameters are parsed, because otherwise it is not | 91 | * kernel command line parameters are parsed, because otherwise it is not |
69 | * possible to override this in hardlockup_panic_setup(). | 92 | * possible to override this in hardlockup_panic_setup(). |
70 | */ | 93 | */ |
71 | void watchdog_enable_hardlockup_detector(bool val) | 94 | void hardlockup_detector_disable(void) |
72 | { | ||
73 | hardlockup_detector_enabled = val; | ||
74 | } | ||
75 | |||
76 | bool watchdog_hardlockup_detector_is_enabled(void) | ||
77 | { | 95 | { |
78 | return hardlockup_detector_enabled; | 96 | watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; |
79 | } | 97 | } |
80 | 98 | ||
81 | static int __init hardlockup_panic_setup(char *str) | 99 | static int __init hardlockup_panic_setup(char *str) |
@@ -85,15 +103,9 @@ static int __init hardlockup_panic_setup(char *str) | |||
85 | else if (!strncmp(str, "nopanic", 7)) | 103 | else if (!strncmp(str, "nopanic", 7)) |
86 | hardlockup_panic = 0; | 104 | hardlockup_panic = 0; |
87 | else if (!strncmp(str, "0", 1)) | 105 | else if (!strncmp(str, "0", 1)) |
88 | watchdog_user_enabled = 0; | 106 | watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; |
89 | else if (!strncmp(str, "1", 1) || !strncmp(str, "2", 1)) { | 107 | else if (!strncmp(str, "1", 1)) |
90 | /* | 108 | watchdog_enabled |= NMI_WATCHDOG_ENABLED; |
91 | * Setting 'nmi_watchdog=1' or 'nmi_watchdog=2' (legacy option) | ||
92 | * has the same effect. | ||
93 | */ | ||
94 | watchdog_user_enabled = 1; | ||
95 | watchdog_enable_hardlockup_detector(true); | ||
96 | } | ||
97 | return 1; | 109 | return 1; |
98 | } | 110 | } |
99 | __setup("nmi_watchdog=", hardlockup_panic_setup); | 111 | __setup("nmi_watchdog=", hardlockup_panic_setup); |
@@ -112,19 +124,18 @@ __setup("softlockup_panic=", softlockup_panic_setup); | |||
112 | 124 | ||
113 | static int __init nowatchdog_setup(char *str) | 125 | static int __init nowatchdog_setup(char *str) |
114 | { | 126 | { |
115 | watchdog_user_enabled = 0; | 127 | watchdog_enabled = 0; |
116 | return 1; | 128 | return 1; |
117 | } | 129 | } |
118 | __setup("nowatchdog", nowatchdog_setup); | 130 | __setup("nowatchdog", nowatchdog_setup); |
119 | 131 | ||
120 | /* deprecated */ | ||
121 | static int __init nosoftlockup_setup(char *str) | 132 | static int __init nosoftlockup_setup(char *str) |
122 | { | 133 | { |
123 | watchdog_user_enabled = 0; | 134 | watchdog_enabled &= ~SOFT_WATCHDOG_ENABLED; |
124 | return 1; | 135 | return 1; |
125 | } | 136 | } |
126 | __setup("nosoftlockup", nosoftlockup_setup); | 137 | __setup("nosoftlockup", nosoftlockup_setup); |
127 | /* */ | 138 | |
128 | #ifdef CONFIG_SMP | 139 | #ifdef CONFIG_SMP |
129 | static int __init softlockup_all_cpu_backtrace_setup(char *str) | 140 | static int __init softlockup_all_cpu_backtrace_setup(char *str) |
130 | { | 141 | { |
@@ -239,10 +250,11 @@ static int is_softlockup(unsigned long touch_ts) | |||
239 | { | 250 | { |
240 | unsigned long now = get_timestamp(); | 251 | unsigned long now = get_timestamp(); |
241 | 252 | ||
242 | /* Warn about unreasonable delays: */ | 253 | if (watchdog_enabled & SOFT_WATCHDOG_ENABLED) { |
243 | if (time_after(now, touch_ts + get_softlockup_thresh())) | 254 | /* Warn about unreasonable delays. */ |
244 | return now - touch_ts; | 255 | if (time_after(now, touch_ts + get_softlockup_thresh())) |
245 | 256 | return now - touch_ts; | |
257 | } | ||
246 | return 0; | 258 | return 0; |
247 | } | 259 | } |
248 | 260 | ||
@@ -477,6 +489,21 @@ static void watchdog(unsigned int cpu) | |||
477 | __this_cpu_write(soft_lockup_hrtimer_cnt, | 489 | __this_cpu_write(soft_lockup_hrtimer_cnt, |
478 | __this_cpu_read(hrtimer_interrupts)); | 490 | __this_cpu_read(hrtimer_interrupts)); |
479 | __touch_watchdog(); | 491 | __touch_watchdog(); |
492 | |||
493 | /* | ||
494 | * watchdog_nmi_enable() clears the NMI_WATCHDOG_ENABLED bit in the | ||
495 | * failure path. Check for failures that can occur asynchronously - | ||
496 | * for example, when CPUs are on-lined - and shut down the hardware | ||
497 | * perf event on each CPU accordingly. | ||
498 | * | ||
499 | * The only non-obvious place this bit can be cleared is through | ||
500 | * watchdog_nmi_enable(), so a pr_info() is placed there. Placing a | ||
501 | * pr_info here would be too noisy as it would result in a message | ||
502 | * every few seconds if the hardlockup was disabled but the softlockup | ||
503 | * enabled. | ||
504 | */ | ||
505 | if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) | ||
506 | watchdog_nmi_disable(cpu); | ||
480 | } | 507 | } |
481 | 508 | ||
482 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | 509 | #ifdef CONFIG_HARDLOCKUP_DETECTOR |
@@ -492,14 +519,9 @@ static int watchdog_nmi_enable(unsigned int cpu) | |||
492 | struct perf_event_attr *wd_attr; | 519 | struct perf_event_attr *wd_attr; |
493 | struct perf_event *event = per_cpu(watchdog_ev, cpu); | 520 | struct perf_event *event = per_cpu(watchdog_ev, cpu); |
494 | 521 | ||
495 | /* | 522 | /* nothing to do if the hard lockup detector is disabled */ |
496 | * Some kernels need to default hard lockup detection to | 523 | if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) |
497 | * 'disabled', for example a guest on a hypervisor. | 524 | goto out; |
498 | */ | ||
499 | if (!watchdog_hardlockup_detector_is_enabled()) { | ||
500 | event = ERR_PTR(-ENOENT); | ||
501 | goto handle_err; | ||
502 | } | ||
503 | 525 | ||
504 | /* is it already setup and enabled? */ | 526 | /* is it already setup and enabled? */ |
505 | if (event && event->state > PERF_EVENT_STATE_OFF) | 527 | if (event && event->state > PERF_EVENT_STATE_OFF) |
@@ -515,7 +537,6 @@ static int watchdog_nmi_enable(unsigned int cpu) | |||
515 | /* Try to register using hardware perf events */ | 537 | /* Try to register using hardware perf events */ |
516 | event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); | 538 | event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); |
517 | 539 | ||
518 | handle_err: | ||
519 | /* save cpu0 error for future comparison */ | 540 | /* save cpu0 error for future comparison */ |
520 | if (cpu == 0 && IS_ERR(event)) | 541 | if (cpu == 0 && IS_ERR(event)) |
521 | cpu0_err = PTR_ERR(event); | 542 | cpu0_err = PTR_ERR(event); |
@@ -527,6 +548,18 @@ handle_err: | |||
527 | goto out_save; | 548 | goto out_save; |
528 | } | 549 | } |
529 | 550 | ||
551 | /* | ||
552 | * Disable the hard lockup detector if _any_ CPU fails to set up | ||
553 | * the hardware perf event. The watchdog() function checks | ||
554 | * the NMI_WATCHDOG_ENABLED bit periodically. | ||
555 | * | ||
556 | * The barriers are for syncing up watchdog_enabled across all the | ||
557 | * cpus, as clear_bit() does not use barriers. | ||
558 | */ | ||
559 | smp_mb__before_atomic(); | ||
560 | clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled); | ||
561 | smp_mb__after_atomic(); | ||
562 | |||
530 | /* skip displaying the same error again */ | 563 | /* skip displaying the same error again */ |
531 | if (cpu > 0 && (PTR_ERR(event) == cpu0_err)) | 564 | if (cpu > 0 && (PTR_ERR(event) == cpu0_err)) |
532 | return PTR_ERR(event); | 565 | return PTR_ERR(event); |
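The barrier comment above pairs an unordered atomic bit clear with explicit full barriers so that every CPU's watchdog() eventually observes the NMI_WATCHDOG_ENABLED bit going away. A loose user-space analogue using C11 atomics (illustrative only; clear_bit() and smp_mb__before/after_atomic() are kernel primitives with their own semantics):

#include <stdatomic.h>
#include <stdio.h>

#define NMI_WATCHDOG_ENABLED_BIT 0

static _Atomic unsigned long watchdog_enabled = (1UL << NMI_WATCHDOG_ENABLED_BIT);

static void disable_nmi_watchdog(void)
{
	/*
	 * The kernel's clear_bit() is atomic but unordered, which is why the
	 * hunk brackets it with smp_mb__before_atomic()/smp_mb__after_atomic().
	 * In C11 a similar effect can be modelled by a fetch_and with
	 * sequentially consistent ordering.
	 */
	atomic_fetch_and_explicit(&watchdog_enabled,
				  ~(1UL << NMI_WATCHDOG_ENABLED_BIT),
				  memory_order_seq_cst);
}

int main(void)
{
	disable_nmi_watchdog();
	printf("watchdog_enabled=%lu\n", atomic_load(&watchdog_enabled)); /* 0 */
	return 0;
}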
@@ -540,6 +573,9 @@ handle_err: | |||
540 | else | 573 | else |
541 | pr_err("disabled (cpu%i): unable to create perf event: %ld\n", | 574 | pr_err("disabled (cpu%i): unable to create perf event: %ld\n", |
542 | cpu, PTR_ERR(event)); | 575 | cpu, PTR_ERR(event)); |
576 | |||
577 | pr_info("Shutting down hard lockup detector on all cpus\n"); | ||
578 | |||
543 | return PTR_ERR(event); | 579 | return PTR_ERR(event); |
544 | 580 | ||
545 | /* success path */ | 581 | /* success path */ |
@@ -567,9 +603,37 @@ static void watchdog_nmi_disable(unsigned int cpu) | |||
567 | cpu0_err = 0; | 603 | cpu0_err = 0; |
568 | } | 604 | } |
569 | } | 605 | } |
606 | |||
607 | void watchdog_nmi_enable_all(void) | ||
608 | { | ||
609 | int cpu; | ||
610 | |||
611 | if (!watchdog_user_enabled) | ||
612 | return; | ||
613 | |||
614 | get_online_cpus(); | ||
615 | for_each_online_cpu(cpu) | ||
616 | watchdog_nmi_enable(cpu); | ||
617 | put_online_cpus(); | ||
618 | } | ||
619 | |||
620 | void watchdog_nmi_disable_all(void) | ||
621 | { | ||
622 | int cpu; | ||
623 | |||
624 | if (!watchdog_running) | ||
625 | return; | ||
626 | |||
627 | get_online_cpus(); | ||
628 | for_each_online_cpu(cpu) | ||
629 | watchdog_nmi_disable(cpu); | ||
630 | put_online_cpus(); | ||
631 | } | ||
570 | #else | 632 | #else |
571 | static int watchdog_nmi_enable(unsigned int cpu) { return 0; } | 633 | static int watchdog_nmi_enable(unsigned int cpu) { return 0; } |
572 | static void watchdog_nmi_disable(unsigned int cpu) { return; } | 634 | static void watchdog_nmi_disable(unsigned int cpu) { return; } |
635 | void watchdog_nmi_enable_all(void) {} | ||
636 | void watchdog_nmi_disable_all(void) {} | ||
573 | #endif /* CONFIG_HARDLOCKUP_DETECTOR */ | 637 | #endif /* CONFIG_HARDLOCKUP_DETECTOR */ |
574 | 638 | ||
575 | static struct smp_hotplug_thread watchdog_threads = { | 639 | static struct smp_hotplug_thread watchdog_threads = { |
@@ -600,7 +664,7 @@ static void restart_watchdog_hrtimer(void *info) | |||
600 | HRTIMER_MODE_REL_PINNED); | 664 | HRTIMER_MODE_REL_PINNED); |
601 | } | 665 | } |
602 | 666 | ||
603 | static void update_timers(int cpu) | 667 | static void update_watchdog(int cpu) |
604 | { | 668 | { |
605 | /* | 669 | /* |
606 | * Make sure that the perf event counter will adapt to a new | 670 | * Make sure that the perf event counter will adapt to a new |
@@ -615,17 +679,17 @@ static void update_timers(int cpu) | |||
615 | watchdog_nmi_enable(cpu); | 679 | watchdog_nmi_enable(cpu); |
616 | } | 680 | } |
617 | 681 | ||
618 | static void update_timers_all_cpus(void) | 682 | static void update_watchdog_all_cpus(void) |
619 | { | 683 | { |
620 | int cpu; | 684 | int cpu; |
621 | 685 | ||
622 | get_online_cpus(); | 686 | get_online_cpus(); |
623 | for_each_online_cpu(cpu) | 687 | for_each_online_cpu(cpu) |
624 | update_timers(cpu); | 688 | update_watchdog(cpu); |
625 | put_online_cpus(); | 689 | put_online_cpus(); |
626 | } | 690 | } |
627 | 691 | ||
628 | static int watchdog_enable_all_cpus(bool sample_period_changed) | 692 | static int watchdog_enable_all_cpus(void) |
629 | { | 693 | { |
630 | int err = 0; | 694 | int err = 0; |
631 | 695 | ||
@@ -635,8 +699,12 @@ static int watchdog_enable_all_cpus(bool sample_period_changed) | |||
635 | pr_err("Failed to create watchdog threads, disabled\n"); | 699 | pr_err("Failed to create watchdog threads, disabled\n"); |
636 | else | 700 | else |
637 | watchdog_running = 1; | 701 | watchdog_running = 1; |
638 | } else if (sample_period_changed) { | 702 | } else { |
639 | update_timers_all_cpus(); | 703 | /* |
704 | * Enable/disable the lockup detectors or | ||
705 | * change the sample period 'on the fly'. | ||
706 | */ | ||
707 | update_watchdog_all_cpus(); | ||
640 | } | 708 | } |
641 | 709 | ||
642 | return err; | 710 | return err; |
@@ -654,58 +722,159 @@ static void watchdog_disable_all_cpus(void) | |||
654 | } | 722 | } |
655 | 723 | ||
656 | /* | 724 | /* |
657 | * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh | 725 | * Update the run state of the lockup detectors. |
658 | */ | 726 | */ |
727 | static int proc_watchdog_update(void) | ||
728 | { | ||
729 | int err = 0; | ||
659 | 730 | ||
660 | int proc_dowatchdog(struct ctl_table *table, int write, | 731 | /* |
661 | void __user *buffer, size_t *lenp, loff_t *ppos) | 732 | * Watchdog threads won't be started if they are already active. |
733 | * The 'watchdog_running' variable in watchdog_*_all_cpus() takes | ||
734 | * care of this. If those threads are already active, the sample | ||
735 | * period will be updated and the lockup detectors will be enabled | ||
736 | * or disabled 'on the fly'. | ||
737 | */ | ||
738 | if (watchdog_enabled && watchdog_thresh) | ||
739 | err = watchdog_enable_all_cpus(); | ||
740 | else | ||
741 | watchdog_disable_all_cpus(); | ||
742 | |||
743 | return err; | ||
744 | |||
745 | } | ||
746 | |||
747 | static DEFINE_MUTEX(watchdog_proc_mutex); | ||
748 | |||
749 | /* | ||
750 | * common function for watchdog, nmi_watchdog and soft_watchdog parameter | ||
751 | * | ||
752 | * caller | table->data points to | 'which' contains the flag(s) | ||
753 | * -------------------|-----------------------|----------------------------- | ||
754 | * proc_watchdog | watchdog_user_enabled | NMI_WATCHDOG_ENABLED or'ed | ||
755 | * | | with SOFT_WATCHDOG_ENABLED | ||
756 | * -------------------|-----------------------|----------------------------- | ||
757 | * proc_nmi_watchdog | nmi_watchdog_enabled | NMI_WATCHDOG_ENABLED | ||
758 | * -------------------|-----------------------|----------------------------- | ||
759 | * proc_soft_watchdog | soft_watchdog_enabled | SOFT_WATCHDOG_ENABLED | ||
760 | */ | ||
761 | static int proc_watchdog_common(int which, struct ctl_table *table, int write, | ||
762 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
662 | { | 763 | { |
663 | int err, old_thresh, old_enabled; | 764 | int err, old, new; |
664 | bool old_hardlockup; | 765 | int *watchdog_param = (int *)table->data; |
665 | static DEFINE_MUTEX(watchdog_proc_mutex); | ||
666 | 766 | ||
667 | mutex_lock(&watchdog_proc_mutex); | 767 | mutex_lock(&watchdog_proc_mutex); |
668 | old_thresh = ACCESS_ONCE(watchdog_thresh); | ||
669 | old_enabled = ACCESS_ONCE(watchdog_user_enabled); | ||
670 | old_hardlockup = watchdog_hardlockup_detector_is_enabled(); | ||
671 | 768 | ||
672 | err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); | ||
673 | if (err || !write) | ||
674 | goto out; | ||
675 | |||
676 | set_sample_period(); | ||
677 | /* | 769 | /* |
678 | * Watchdog threads shouldn't be enabled if they are | 770 | * If the parameter is being read, return the state of the corresponding |
679 | * disabled. The 'watchdog_running' variable check in | 771 | * bit(s) in 'watchdog_enabled', else update 'watchdog_enabled' and the |
680 | * watchdog_*_all_cpus() function takes care of this. | 772 | * run state of the lockup detectors. |
681 | */ | 773 | */ |
682 | if (watchdog_user_enabled && watchdog_thresh) { | 774 | if (!write) { |
775 | *watchdog_param = (watchdog_enabled & which) != 0; | ||
776 | err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); | ||
777 | } else { | ||
778 | err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); | ||
779 | if (err) | ||
780 | goto out; | ||
781 | |||
683 | /* | 782 | /* |
684 | * Prevent a change in watchdog_thresh accidentally overriding | 783 | * There is a race window between fetching the current value |
685 | * the enablement of the hardlockup detector. | 784 | * from 'watchdog_enabled' and storing the new value. During |
785 | * this race window, watchdog_nmi_enable() can sneak in and | ||
786 | * clear the NMI_WATCHDOG_ENABLED bit in 'watchdog_enabled'. | ||
787 | * The 'cmpxchg' detects this race and the loop retries. | ||
686 | */ | 788 | */ |
687 | if (watchdog_user_enabled != old_enabled) | 789 | do { |
688 | watchdog_enable_hardlockup_detector(true); | 790 | old = watchdog_enabled; |
689 | err = watchdog_enable_all_cpus(old_thresh != watchdog_thresh); | 791 | /* |
690 | } else | 792 | * If the parameter value is not zero set the |
691 | watchdog_disable_all_cpus(); | 793 | * corresponding bit(s), else clear it(them). |
794 | */ | ||
795 | if (*watchdog_param) | ||
796 | new = old | which; | ||
797 | else | ||
798 | new = old & ~which; | ||
799 | } while (cmpxchg(&watchdog_enabled, old, new) != old); | ||
692 | 800 | ||
693 | /* Restore old values on failure */ | 801 | /* |
694 | if (err) { | 802 | * Update the run state of the lockup detectors. |
695 | watchdog_thresh = old_thresh; | 803 | * Restore 'watchdog_enabled' on failure. |
696 | watchdog_user_enabled = old_enabled; | 804 | */ |
697 | watchdog_enable_hardlockup_detector(old_hardlockup); | 805 | err = proc_watchdog_update(); |
806 | if (err) | ||
807 | watchdog_enabled = old; | ||
698 | } | 808 | } |
699 | out: | 809 | out: |
700 | mutex_unlock(&watchdog_proc_mutex); | 810 | mutex_unlock(&watchdog_proc_mutex); |
701 | return err; | 811 | return err; |
702 | } | 812 | } |
813 | |||
814 | /* | ||
815 | * /proc/sys/kernel/watchdog | ||
816 | */ | ||
817 | int proc_watchdog(struct ctl_table *table, int write, | ||
818 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
819 | { | ||
820 | return proc_watchdog_common(NMI_WATCHDOG_ENABLED|SOFT_WATCHDOG_ENABLED, | ||
821 | table, write, buffer, lenp, ppos); | ||
822 | } | ||
823 | |||
824 | /* | ||
825 | * /proc/sys/kernel/nmi_watchdog | ||
826 | */ | ||
827 | int proc_nmi_watchdog(struct ctl_table *table, int write, | ||
828 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
829 | { | ||
830 | return proc_watchdog_common(NMI_WATCHDOG_ENABLED, | ||
831 | table, write, buffer, lenp, ppos); | ||
832 | } | ||
833 | |||
834 | /* | ||
835 | * /proc/sys/kernel/soft_watchdog | ||
836 | */ | ||
837 | int proc_soft_watchdog(struct ctl_table *table, int write, | ||
838 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
839 | { | ||
840 | return proc_watchdog_common(SOFT_WATCHDOG_ENABLED, | ||
841 | table, write, buffer, lenp, ppos); | ||
842 | } | ||
843 | |||
844 | /* | ||
845 | * /proc/sys/kernel/watchdog_thresh | ||
846 | */ | ||
847 | int proc_watchdog_thresh(struct ctl_table *table, int write, | ||
848 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
849 | { | ||
850 | int err, old; | ||
851 | |||
852 | mutex_lock(&watchdog_proc_mutex); | ||
853 | |||
854 | old = ACCESS_ONCE(watchdog_thresh); | ||
855 | err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); | ||
856 | |||
857 | if (err || !write) | ||
858 | goto out; | ||
859 | |||
860 | /* | ||
861 | * Update the sample period. | ||
862 | * Restore 'watchdog_thresh' on failure. | ||
863 | */ | ||
864 | set_sample_period(); | ||
865 | err = proc_watchdog_update(); | ||
866 | if (err) | ||
867 | watchdog_thresh = old; | ||
868 | out: | ||
869 | mutex_unlock(&watchdog_proc_mutex); | ||
870 | return err; | ||
871 | } | ||
703 | #endif /* CONFIG_SYSCTL */ | 872 | #endif /* CONFIG_SYSCTL */ |
704 | 873 | ||
705 | void __init lockup_detector_init(void) | 874 | void __init lockup_detector_init(void) |
706 | { | 875 | { |
707 | set_sample_period(); | 876 | set_sample_period(); |
708 | 877 | ||
709 | if (watchdog_user_enabled) | 878 | if (watchdog_enabled) |
710 | watchdog_enable_all_cpus(false); | 879 | watchdog_enable_all_cpus(); |
711 | } | 880 | } |
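The race comment in proc_watchdog_common() above boils down to a classic compare-and-swap retry loop: compute the new bit mask from a snapshot of 'watchdog_enabled' and retry if the word changed underneath, for example because the perf-event failure path cleared the NMI bit. A minimal user-space sketch with C11 atomics, assuming nothing beyond the bit definitions from this patch (the kernel's cmpxchg() is a different primitive):

#include <stdatomic.h>
#include <stdbool.h>

#define NMI_WATCHDOG_ENABLED  (1UL << 0)
#define SOFT_WATCHDOG_ENABLED (1UL << 1)

static _Atomic unsigned long watchdog_enabled =
	NMI_WATCHDOG_ENABLED | SOFT_WATCHDOG_ENABLED;

/*
 * Set or clear 'which' in watchdog_enabled, retrying if another thread
 * (e.g. the perf-event failure path) modified the word concurrently.
 */
static void update_enabled_bits(unsigned long which, bool set)
{
	unsigned long old, new;

	old = atomic_load(&watchdog_enabled);
	do {
		new = set ? (old | which) : (old & ~which);
		/* On failure, 'old' is reloaded with the current value. */
	} while (!atomic_compare_exchange_weak(&watchdog_enabled, &old, new));
}

int main(void)
{
	update_enabled_bits(NMI_WATCHDOG_ENABLED, false); /* echo 0 > nmi_watchdog */
	return 0;
}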
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 41ff75b478c6..586ad91300b0 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -159,6 +159,7 @@ struct worker_pool { | |||
159 | 159 | ||
160 | /* see manage_workers() for details on the two manager mutexes */ | 160 | /* see manage_workers() for details on the two manager mutexes */ |
161 | struct mutex manager_arb; /* manager arbitration */ | 161 | struct mutex manager_arb; /* manager arbitration */ |
162 | struct worker *manager; /* L: purely informational */ | ||
162 | struct mutex attach_mutex; /* attach/detach exclusion */ | 163 | struct mutex attach_mutex; /* attach/detach exclusion */ |
163 | struct list_head workers; /* A: attached workers */ | 164 | struct list_head workers; /* A: attached workers */ |
164 | struct completion *detach_completion; /* all workers detached */ | 165 | struct completion *detach_completion; /* all workers detached */ |
@@ -230,7 +231,7 @@ struct wq_device; | |||
230 | */ | 231 | */ |
231 | struct workqueue_struct { | 232 | struct workqueue_struct { |
232 | struct list_head pwqs; /* WR: all pwqs of this wq */ | 233 | struct list_head pwqs; /* WR: all pwqs of this wq */ |
233 | struct list_head list; /* PL: list of all workqueues */ | 234 | struct list_head list; /* PR: list of all workqueues */ |
234 | 235 | ||
235 | struct mutex mutex; /* protects this wq */ | 236 | struct mutex mutex; /* protects this wq */ |
236 | int work_color; /* WQ: current work color */ | 237 | int work_color; /* WQ: current work color */ |
@@ -257,6 +258,13 @@ struct workqueue_struct { | |||
257 | #endif | 258 | #endif |
258 | char name[WQ_NAME_LEN]; /* I: workqueue name */ | 259 | char name[WQ_NAME_LEN]; /* I: workqueue name */ |
259 | 260 | ||
261 | /* | ||
262 | * Destruction of workqueue_struct is sched-RCU protected to allow | ||
263 | * walking the workqueues list without grabbing wq_pool_mutex. | ||
264 | * This is used to dump all workqueues from sysrq. | ||
265 | */ | ||
266 | struct rcu_head rcu; | ||
267 | |||
260 | /* hot fields used during command issue, aligned to cacheline */ | 268 | /* hot fields used during command issue, aligned to cacheline */ |
261 | unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */ | 269 | unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */ |
262 | struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */ | 270 | struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */ |
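The rcu_head added above exists so that the sysrq dump introduced later in this patch can walk the global 'workqueues' list without taking wq_pool_mutex. Pieced together from the other workqueue.c hunks, the lifecycle is the usual RCU publish/walk/retire pattern (kernel-style sketch assembled from those hunks, not a standalone program):

/* publish: __alloc_workqueue_key() */
list_add_tail_rcu(&wq->list, &workqueues);

/* walk without wq_pool_mutex: show_workqueue_state() */
rcu_read_lock_sched();
list_for_each_entry_rcu(wq, &workqueues, list) {
	/* wq cannot be freed while we are inside the sched-RCU read side */
}
rcu_read_unlock_sched();

/* retire: destroy_workqueue() / pwq_unbound_release_workfn() */
list_del_rcu(&wq->list);
call_rcu_sched(&wq->rcu, rcu_free_wq);	/* freed only after a grace period */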
@@ -288,7 +296,7 @@ static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf; | |||
288 | static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */ | 296 | static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */ |
289 | static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ | 297 | static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ |
290 | 298 | ||
291 | static LIST_HEAD(workqueues); /* PL: list of all workqueues */ | 299 | static LIST_HEAD(workqueues); /* PR: list of all workqueues */ |
292 | static bool workqueue_freezing; /* PL: have wqs started freezing? */ | 300 | static bool workqueue_freezing; /* PL: have wqs started freezing? */ |
293 | 301 | ||
294 | /* the per-cpu worker pools */ | 302 | /* the per-cpu worker pools */ |
@@ -324,6 +332,7 @@ EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq); | |||
324 | static int worker_thread(void *__worker); | 332 | static int worker_thread(void *__worker); |
325 | static void copy_workqueue_attrs(struct workqueue_attrs *to, | 333 | static void copy_workqueue_attrs(struct workqueue_attrs *to, |
326 | const struct workqueue_attrs *from); | 334 | const struct workqueue_attrs *from); |
335 | static void workqueue_sysfs_unregister(struct workqueue_struct *wq); | ||
327 | 336 | ||
328 | #define CREATE_TRACE_POINTS | 337 | #define CREATE_TRACE_POINTS |
329 | #include <trace/events/workqueue.h> | 338 | #include <trace/events/workqueue.h> |
@@ -1911,9 +1920,11 @@ static bool manage_workers(struct worker *worker) | |||
1911 | */ | 1920 | */ |
1912 | if (!mutex_trylock(&pool->manager_arb)) | 1921 | if (!mutex_trylock(&pool->manager_arb)) |
1913 | return false; | 1922 | return false; |
1923 | pool->manager = worker; | ||
1914 | 1924 | ||
1915 | maybe_create_worker(pool); | 1925 | maybe_create_worker(pool); |
1916 | 1926 | ||
1927 | pool->manager = NULL; | ||
1917 | mutex_unlock(&pool->manager_arb); | 1928 | mutex_unlock(&pool->manager_arb); |
1918 | return true; | 1929 | return true; |
1919 | } | 1930 | } |
@@ -2303,6 +2314,7 @@ repeat: | |||
2303 | struct wq_barrier { | 2314 | struct wq_barrier { |
2304 | struct work_struct work; | 2315 | struct work_struct work; |
2305 | struct completion done; | 2316 | struct completion done; |
2317 | struct task_struct *task; /* purely informational */ | ||
2306 | }; | 2318 | }; |
2307 | 2319 | ||
2308 | static void wq_barrier_func(struct work_struct *work) | 2320 | static void wq_barrier_func(struct work_struct *work) |
@@ -2351,6 +2363,7 @@ static void insert_wq_barrier(struct pool_workqueue *pwq, | |||
2351 | INIT_WORK_ONSTACK(&barr->work, wq_barrier_func); | 2363 | INIT_WORK_ONSTACK(&barr->work, wq_barrier_func); |
2352 | __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); | 2364 | __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); |
2353 | init_completion(&barr->done); | 2365 | init_completion(&barr->done); |
2366 | barr->task = current; | ||
2354 | 2367 | ||
2355 | /* | 2368 | /* |
2356 | * If @target is currently being executed, schedule the | 2369 | * If @target is currently being executed, schedule the |
@@ -2989,323 +3002,6 @@ int execute_in_process_context(work_func_t fn, struct execute_work *ew) | |||
2989 | } | 3002 | } |
2990 | EXPORT_SYMBOL_GPL(execute_in_process_context); | 3003 | EXPORT_SYMBOL_GPL(execute_in_process_context); |
2991 | 3004 | ||
2992 | #ifdef CONFIG_SYSFS | ||
2993 | /* | ||
2994 | * Workqueues with WQ_SYSFS flag set are visible to userland via | ||
2995 | * /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the | ||
2996 | * following attributes. | ||
2997 | * | ||
2998 | * per_cpu RO bool : whether the workqueue is per-cpu or unbound | ||
2999 | * max_active RW int : maximum number of in-flight work items | ||
3000 | * | ||
3001 | * Unbound workqueues have the following extra attributes. | ||
3002 | * | ||
3003 | * id RO int : the associated pool ID | ||
3004 | * nice RW int : nice value of the workers | ||
3005 | * cpumask RW mask : bitmask of allowed CPUs for the workers | ||
3006 | */ | ||
3007 | struct wq_device { | ||
3008 | struct workqueue_struct *wq; | ||
3009 | struct device dev; | ||
3010 | }; | ||
3011 | |||
3012 | static struct workqueue_struct *dev_to_wq(struct device *dev) | ||
3013 | { | ||
3014 | struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); | ||
3015 | |||
3016 | return wq_dev->wq; | ||
3017 | } | ||
3018 | |||
3019 | static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr, | ||
3020 | char *buf) | ||
3021 | { | ||
3022 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
3023 | |||
3024 | return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND)); | ||
3025 | } | ||
3026 | static DEVICE_ATTR_RO(per_cpu); | ||
3027 | |||
3028 | static ssize_t max_active_show(struct device *dev, | ||
3029 | struct device_attribute *attr, char *buf) | ||
3030 | { | ||
3031 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
3032 | |||
3033 | return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active); | ||
3034 | } | ||
3035 | |||
3036 | static ssize_t max_active_store(struct device *dev, | ||
3037 | struct device_attribute *attr, const char *buf, | ||
3038 | size_t count) | ||
3039 | { | ||
3040 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
3041 | int val; | ||
3042 | |||
3043 | if (sscanf(buf, "%d", &val) != 1 || val <= 0) | ||
3044 | return -EINVAL; | ||
3045 | |||
3046 | workqueue_set_max_active(wq, val); | ||
3047 | return count; | ||
3048 | } | ||
3049 | static DEVICE_ATTR_RW(max_active); | ||
3050 | |||
3051 | static struct attribute *wq_sysfs_attrs[] = { | ||
3052 | &dev_attr_per_cpu.attr, | ||
3053 | &dev_attr_max_active.attr, | ||
3054 | NULL, | ||
3055 | }; | ||
3056 | ATTRIBUTE_GROUPS(wq_sysfs); | ||
3057 | |||
3058 | static ssize_t wq_pool_ids_show(struct device *dev, | ||
3059 | struct device_attribute *attr, char *buf) | ||
3060 | { | ||
3061 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
3062 | const char *delim = ""; | ||
3063 | int node, written = 0; | ||
3064 | |||
3065 | rcu_read_lock_sched(); | ||
3066 | for_each_node(node) { | ||
3067 | written += scnprintf(buf + written, PAGE_SIZE - written, | ||
3068 | "%s%d:%d", delim, node, | ||
3069 | unbound_pwq_by_node(wq, node)->pool->id); | ||
3070 | delim = " "; | ||
3071 | } | ||
3072 | written += scnprintf(buf + written, PAGE_SIZE - written, "\n"); | ||
3073 | rcu_read_unlock_sched(); | ||
3074 | |||
3075 | return written; | ||
3076 | } | ||
3077 | |||
3078 | static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr, | ||
3079 | char *buf) | ||
3080 | { | ||
3081 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
3082 | int written; | ||
3083 | |||
3084 | mutex_lock(&wq->mutex); | ||
3085 | written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice); | ||
3086 | mutex_unlock(&wq->mutex); | ||
3087 | |||
3088 | return written; | ||
3089 | } | ||
3090 | |||
3091 | /* prepare workqueue_attrs for sysfs store operations */ | ||
3092 | static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq) | ||
3093 | { | ||
3094 | struct workqueue_attrs *attrs; | ||
3095 | |||
3096 | attrs = alloc_workqueue_attrs(GFP_KERNEL); | ||
3097 | if (!attrs) | ||
3098 | return NULL; | ||
3099 | |||
3100 | mutex_lock(&wq->mutex); | ||
3101 | copy_workqueue_attrs(attrs, wq->unbound_attrs); | ||
3102 | mutex_unlock(&wq->mutex); | ||
3103 | return attrs; | ||
3104 | } | ||
3105 | |||
3106 | static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr, | ||
3107 | const char *buf, size_t count) | ||
3108 | { | ||
3109 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
3110 | struct workqueue_attrs *attrs; | ||
3111 | int ret; | ||
3112 | |||
3113 | attrs = wq_sysfs_prep_attrs(wq); | ||
3114 | if (!attrs) | ||
3115 | return -ENOMEM; | ||
3116 | |||
3117 | if (sscanf(buf, "%d", &attrs->nice) == 1 && | ||
3118 | attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE) | ||
3119 | ret = apply_workqueue_attrs(wq, attrs); | ||
3120 | else | ||
3121 | ret = -EINVAL; | ||
3122 | |||
3123 | free_workqueue_attrs(attrs); | ||
3124 | return ret ?: count; | ||
3125 | } | ||
3126 | |||
3127 | static ssize_t wq_cpumask_show(struct device *dev, | ||
3128 | struct device_attribute *attr, char *buf) | ||
3129 | { | ||
3130 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
3131 | int written; | ||
3132 | |||
3133 | mutex_lock(&wq->mutex); | ||
3134 | written = scnprintf(buf, PAGE_SIZE, "%*pb\n", | ||
3135 | cpumask_pr_args(wq->unbound_attrs->cpumask)); | ||
3136 | mutex_unlock(&wq->mutex); | ||
3137 | return written; | ||
3138 | } | ||
3139 | |||
3140 | static ssize_t wq_cpumask_store(struct device *dev, | ||
3141 | struct device_attribute *attr, | ||
3142 | const char *buf, size_t count) | ||
3143 | { | ||
3144 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
3145 | struct workqueue_attrs *attrs; | ||
3146 | int ret; | ||
3147 | |||
3148 | attrs = wq_sysfs_prep_attrs(wq); | ||
3149 | if (!attrs) | ||
3150 | return -ENOMEM; | ||
3151 | |||
3152 | ret = cpumask_parse(buf, attrs->cpumask); | ||
3153 | if (!ret) | ||
3154 | ret = apply_workqueue_attrs(wq, attrs); | ||
3155 | |||
3156 | free_workqueue_attrs(attrs); | ||
3157 | return ret ?: count; | ||
3158 | } | ||
3159 | |||
3160 | static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr, | ||
3161 | char *buf) | ||
3162 | { | ||
3163 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
3164 | int written; | ||
3165 | |||
3166 | mutex_lock(&wq->mutex); | ||
3167 | written = scnprintf(buf, PAGE_SIZE, "%d\n", | ||
3168 | !wq->unbound_attrs->no_numa); | ||
3169 | mutex_unlock(&wq->mutex); | ||
3170 | |||
3171 | return written; | ||
3172 | } | ||
3173 | |||
3174 | static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr, | ||
3175 | const char *buf, size_t count) | ||
3176 | { | ||
3177 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
3178 | struct workqueue_attrs *attrs; | ||
3179 | int v, ret; | ||
3180 | |||
3181 | attrs = wq_sysfs_prep_attrs(wq); | ||
3182 | if (!attrs) | ||
3183 | return -ENOMEM; | ||
3184 | |||
3185 | ret = -EINVAL; | ||
3186 | if (sscanf(buf, "%d", &v) == 1) { | ||
3187 | attrs->no_numa = !v; | ||
3188 | ret = apply_workqueue_attrs(wq, attrs); | ||
3189 | } | ||
3190 | |||
3191 | free_workqueue_attrs(attrs); | ||
3192 | return ret ?: count; | ||
3193 | } | ||
3194 | |||
3195 | static struct device_attribute wq_sysfs_unbound_attrs[] = { | ||
3196 | __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL), | ||
3197 | __ATTR(nice, 0644, wq_nice_show, wq_nice_store), | ||
3198 | __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store), | ||
3199 | __ATTR(numa, 0644, wq_numa_show, wq_numa_store), | ||
3200 | __ATTR_NULL, | ||
3201 | }; | ||
3202 | |||
3203 | static struct bus_type wq_subsys = { | ||
3204 | .name = "workqueue", | ||
3205 | .dev_groups = wq_sysfs_groups, | ||
3206 | }; | ||
3207 | |||
3208 | static int __init wq_sysfs_init(void) | ||
3209 | { | ||
3210 | return subsys_virtual_register(&wq_subsys, NULL); | ||
3211 | } | ||
3212 | core_initcall(wq_sysfs_init); | ||
3213 | |||
3214 | static void wq_device_release(struct device *dev) | ||
3215 | { | ||
3216 | struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); | ||
3217 | |||
3218 | kfree(wq_dev); | ||
3219 | } | ||
3220 | |||
3221 | /** | ||
3222 | * workqueue_sysfs_register - make a workqueue visible in sysfs | ||
3223 | * @wq: the workqueue to register | ||
3224 | * | ||
3225 | * Expose @wq in sysfs under /sys/bus/workqueue/devices. | ||
3226 | * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set | ||
3227 | * which is the preferred method. | ||
3228 | * | ||
3229 | * Workqueue user should use this function directly iff it wants to apply | ||
3230 | * workqueue_attrs before making the workqueue visible in sysfs; otherwise, | ||
3231 | * apply_workqueue_attrs() may race against userland updating the | ||
3232 | * attributes. | ||
3233 | * | ||
3234 | * Return: 0 on success, -errno on failure. | ||
3235 | */ | ||
3236 | int workqueue_sysfs_register(struct workqueue_struct *wq) | ||
3237 | { | ||
3238 | struct wq_device *wq_dev; | ||
3239 | int ret; | ||
3240 | |||
3241 | /* | ||
3242 | * Adjusting max_active or creating new pwqs by applying | ||
3243 | * attributes breaks ordering guarantee. Disallow exposing ordered | ||
3244 | * workqueues. | ||
3245 | */ | ||
3246 | if (WARN_ON(wq->flags & __WQ_ORDERED)) | ||
3247 | return -EINVAL; | ||
3248 | |||
3249 | wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL); | ||
3250 | if (!wq_dev) | ||
3251 | return -ENOMEM; | ||
3252 | |||
3253 | wq_dev->wq = wq; | ||
3254 | wq_dev->dev.bus = &wq_subsys; | ||
3255 | wq_dev->dev.init_name = wq->name; | ||
3256 | wq_dev->dev.release = wq_device_release; | ||
3257 | |||
3258 | /* | ||
3259 | * unbound_attrs are created separately. Suppress uevent until | ||
3260 | * everything is ready. | ||
3261 | */ | ||
3262 | dev_set_uevent_suppress(&wq_dev->dev, true); | ||
3263 | |||
3264 | ret = device_register(&wq_dev->dev); | ||
3265 | if (ret) { | ||
3266 | kfree(wq_dev); | ||
3267 | wq->wq_dev = NULL; | ||
3268 | return ret; | ||
3269 | } | ||
3270 | |||
3271 | if (wq->flags & WQ_UNBOUND) { | ||
3272 | struct device_attribute *attr; | ||
3273 | |||
3274 | for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) { | ||
3275 | ret = device_create_file(&wq_dev->dev, attr); | ||
3276 | if (ret) { | ||
3277 | device_unregister(&wq_dev->dev); | ||
3278 | wq->wq_dev = NULL; | ||
3279 | return ret; | ||
3280 | } | ||
3281 | } | ||
3282 | } | ||
3283 | |||
3284 | dev_set_uevent_suppress(&wq_dev->dev, false); | ||
3285 | kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD); | ||
3286 | return 0; | ||
3287 | } | ||
3288 | |||
3289 | /** | ||
3290 | * workqueue_sysfs_unregister - undo workqueue_sysfs_register() | ||
3291 | * @wq: the workqueue to unregister | ||
3292 | * | ||
3293 | * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister. | ||
3294 | */ | ||
3295 | static void workqueue_sysfs_unregister(struct workqueue_struct *wq) | ||
3296 | { | ||
3297 | struct wq_device *wq_dev = wq->wq_dev; | ||
3298 | |||
3299 | if (!wq->wq_dev) | ||
3300 | return; | ||
3301 | |||
3302 | wq->wq_dev = NULL; | ||
3303 | device_unregister(&wq_dev->dev); | ||
3304 | } | ||
3305 | #else /* CONFIG_SYSFS */ | ||
3306 | static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { } | ||
3307 | #endif /* CONFIG_SYSFS */ | ||
3308 | |||
3309 | /** | 3005 | /** |
3310 | * free_workqueue_attrs - free a workqueue_attrs | 3006 | * free_workqueue_attrs - free a workqueue_attrs |
3311 | * @attrs: workqueue_attrs to free | 3007 | * @attrs: workqueue_attrs to free |
@@ -3424,6 +3120,20 @@ static int init_worker_pool(struct worker_pool *pool) | |||
3424 | return 0; | 3120 | return 0; |
3425 | } | 3121 | } |
3426 | 3122 | ||
3123 | static void rcu_free_wq(struct rcu_head *rcu) | ||
3124 | { | ||
3125 | struct workqueue_struct *wq = | ||
3126 | container_of(rcu, struct workqueue_struct, rcu); | ||
3127 | |||
3128 | if (!(wq->flags & WQ_UNBOUND)) | ||
3129 | free_percpu(wq->cpu_pwqs); | ||
3130 | else | ||
3131 | free_workqueue_attrs(wq->unbound_attrs); | ||
3132 | |||
3133 | kfree(wq->rescuer); | ||
3134 | kfree(wq); | ||
3135 | } | ||
3136 | |||
3427 | static void rcu_free_pool(struct rcu_head *rcu) | 3137 | static void rcu_free_pool(struct rcu_head *rcu) |
3428 | { | 3138 | { |
3429 | struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu); | 3139 | struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu); |
@@ -3601,12 +3311,10 @@ static void pwq_unbound_release_workfn(struct work_struct *work) | |||
3601 | 3311 | ||
3602 | /* | 3312 | /* |
3603 | * If we're the last pwq going away, @wq is already dead and no one | 3313 | * If we're the last pwq going away, @wq is already dead and no one |
3604 | * is gonna access it anymore. Free it. | 3314 | * is gonna access it anymore. Schedule RCU free. |
3605 | */ | 3315 | */ |
3606 | if (is_last) { | 3316 | if (is_last) |
3607 | free_workqueue_attrs(wq->unbound_attrs); | 3317 | call_rcu_sched(&wq->rcu, rcu_free_wq); |
3608 | kfree(wq); | ||
3609 | } | ||
3610 | } | 3318 | } |
3611 | 3319 | ||
3612 | /** | 3320 | /** |
@@ -4143,7 +3851,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, | |||
4143 | pwq_adjust_max_active(pwq); | 3851 | pwq_adjust_max_active(pwq); |
4144 | mutex_unlock(&wq->mutex); | 3852 | mutex_unlock(&wq->mutex); |
4145 | 3853 | ||
4146 | list_add(&wq->list, &workqueues); | 3854 | list_add_tail_rcu(&wq->list, &workqueues); |
4147 | 3855 | ||
4148 | mutex_unlock(&wq_pool_mutex); | 3856 | mutex_unlock(&wq_pool_mutex); |
4149 | 3857 | ||
@@ -4199,24 +3907,20 @@ void destroy_workqueue(struct workqueue_struct *wq) | |||
4199 | * flushing is complete in case freeze races us. | 3907 | * flushing is complete in case freeze races us. |
4200 | */ | 3908 | */ |
4201 | mutex_lock(&wq_pool_mutex); | 3909 | mutex_lock(&wq_pool_mutex); |
4202 | list_del_init(&wq->list); | 3910 | list_del_rcu(&wq->list); |
4203 | mutex_unlock(&wq_pool_mutex); | 3911 | mutex_unlock(&wq_pool_mutex); |
4204 | 3912 | ||
4205 | workqueue_sysfs_unregister(wq); | 3913 | workqueue_sysfs_unregister(wq); |
4206 | 3914 | ||
4207 | if (wq->rescuer) { | 3915 | if (wq->rescuer) |
4208 | kthread_stop(wq->rescuer->task); | 3916 | kthread_stop(wq->rescuer->task); |
4209 | kfree(wq->rescuer); | ||
4210 | wq->rescuer = NULL; | ||
4211 | } | ||
4212 | 3917 | ||
4213 | if (!(wq->flags & WQ_UNBOUND)) { | 3918 | if (!(wq->flags & WQ_UNBOUND)) { |
4214 | /* | 3919 | /* |
4215 | * The base ref is never dropped on per-cpu pwqs. Directly | 3920 | * The base ref is never dropped on per-cpu pwqs. Directly |
4216 | * free the pwqs and wq. | 3921 | * schedule RCU free. |
4217 | */ | 3922 | */ |
4218 | free_percpu(wq->cpu_pwqs); | 3923 | call_rcu_sched(&wq->rcu, rcu_free_wq); |
4219 | kfree(wq); | ||
4220 | } else { | 3924 | } else { |
4221 | /* | 3925 | /* |
4222 | * We're the sole accessor of @wq at this point. Directly | 3926 | * We're the sole accessor of @wq at this point. Directly |
@@ -4437,6 +4141,166 @@ void print_worker_info(const char *log_lvl, struct task_struct *task) | |||
4437 | } | 4141 | } |
4438 | } | 4142 | } |
4439 | 4143 | ||
4144 | static void pr_cont_pool_info(struct worker_pool *pool) | ||
4145 | { | ||
4146 | pr_cont(" cpus=%*pbl", nr_cpumask_bits, pool->attrs->cpumask); | ||
4147 | if (pool->node != NUMA_NO_NODE) | ||
4148 | pr_cont(" node=%d", pool->node); | ||
4149 | pr_cont(" flags=0x%x nice=%d", pool->flags, pool->attrs->nice); | ||
4150 | } | ||
4151 | |||
4152 | static void pr_cont_work(bool comma, struct work_struct *work) | ||
4153 | { | ||
4154 | if (work->func == wq_barrier_func) { | ||
4155 | struct wq_barrier *barr; | ||
4156 | |||
4157 | barr = container_of(work, struct wq_barrier, work); | ||
4158 | |||
4159 | pr_cont("%s BAR(%d)", comma ? "," : "", | ||
4160 | task_pid_nr(barr->task)); | ||
4161 | } else { | ||
4162 | pr_cont("%s %pf", comma ? "," : "", work->func); | ||
4163 | } | ||
4164 | } | ||
4165 | |||
4166 | static void show_pwq(struct pool_workqueue *pwq) | ||
4167 | { | ||
4168 | struct worker_pool *pool = pwq->pool; | ||
4169 | struct work_struct *work; | ||
4170 | struct worker *worker; | ||
4171 | bool has_in_flight = false, has_pending = false; | ||
4172 | int bkt; | ||
4173 | |||
4174 | pr_info(" pwq %d:", pool->id); | ||
4175 | pr_cont_pool_info(pool); | ||
4176 | |||
4177 | pr_cont(" active=%d/%d%s\n", pwq->nr_active, pwq->max_active, | ||
4178 | !list_empty(&pwq->mayday_node) ? " MAYDAY" : ""); | ||
4179 | |||
4180 | hash_for_each(pool->busy_hash, bkt, worker, hentry) { | ||
4181 | if (worker->current_pwq == pwq) { | ||
4182 | has_in_flight = true; | ||
4183 | break; | ||
4184 | } | ||
4185 | } | ||
4186 | if (has_in_flight) { | ||
4187 | bool comma = false; | ||
4188 | |||
4189 | pr_info(" in-flight:"); | ||
4190 | hash_for_each(pool->busy_hash, bkt, worker, hentry) { | ||
4191 | if (worker->current_pwq != pwq) | ||
4192 | continue; | ||
4193 | |||
4194 | pr_cont("%s %d%s:%pf", comma ? "," : "", | ||
4195 | task_pid_nr(worker->task), | ||
4196 | worker == pwq->wq->rescuer ? "(RESCUER)" : "", | ||
4197 | worker->current_func); | ||
4198 | list_for_each_entry(work, &worker->scheduled, entry) | ||
4199 | pr_cont_work(false, work); | ||
4200 | comma = true; | ||
4201 | } | ||
4202 | pr_cont("\n"); | ||
4203 | } | ||
4204 | |||
4205 | list_for_each_entry(work, &pool->worklist, entry) { | ||
4206 | if (get_work_pwq(work) == pwq) { | ||
4207 | has_pending = true; | ||
4208 | break; | ||
4209 | } | ||
4210 | } | ||
4211 | if (has_pending) { | ||
4212 | bool comma = false; | ||
4213 | |||
4214 | pr_info(" pending:"); | ||
4215 | list_for_each_entry(work, &pool->worklist, entry) { | ||
4216 | if (get_work_pwq(work) != pwq) | ||
4217 | continue; | ||
4218 | |||
4219 | pr_cont_work(comma, work); | ||
4220 | comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED); | ||
4221 | } | ||
4222 | pr_cont("\n"); | ||
4223 | } | ||
4224 | |||
4225 | if (!list_empty(&pwq->delayed_works)) { | ||
4226 | bool comma = false; | ||
4227 | |||
4228 | pr_info(" delayed:"); | ||
4229 | list_for_each_entry(work, &pwq->delayed_works, entry) { | ||
4230 | pr_cont_work(comma, work); | ||
4231 | comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED); | ||
4232 | } | ||
4233 | pr_cont("\n"); | ||
4234 | } | ||
4235 | } | ||
4236 | |||
4237 | /** | ||
4238 | * show_workqueue_state - dump workqueue state | ||
4239 | * | ||
4240 | * Called from a sysrq handler and prints out all busy workqueues and | ||
4241 | * pools. | ||
4242 | */ | ||
4243 | void show_workqueue_state(void) | ||
4244 | { | ||
4245 | struct workqueue_struct *wq; | ||
4246 | struct worker_pool *pool; | ||
4247 | unsigned long flags; | ||
4248 | int pi; | ||
4249 | |||
4250 | rcu_read_lock_sched(); | ||
4251 | |||
4252 | pr_info("Showing busy workqueues and worker pools:\n"); | ||
4253 | |||
4254 | list_for_each_entry_rcu(wq, &workqueues, list) { | ||
4255 | struct pool_workqueue *pwq; | ||
4256 | bool idle = true; | ||
4257 | |||
4258 | for_each_pwq(pwq, wq) { | ||
4259 | if (pwq->nr_active || !list_empty(&pwq->delayed_works)) { | ||
4260 | idle = false; | ||
4261 | break; | ||
4262 | } | ||
4263 | } | ||
4264 | if (idle) | ||
4265 | continue; | ||
4266 | |||
4267 | pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags); | ||
4268 | |||
4269 | for_each_pwq(pwq, wq) { | ||
4270 | spin_lock_irqsave(&pwq->pool->lock, flags); | ||
4271 | if (pwq->nr_active || !list_empty(&pwq->delayed_works)) | ||
4272 | show_pwq(pwq); | ||
4273 | spin_unlock_irqrestore(&pwq->pool->lock, flags); | ||
4274 | } | ||
4275 | } | ||
4276 | |||
4277 | for_each_pool(pool, pi) { | ||
4278 | struct worker *worker; | ||
4279 | bool first = true; | ||
4280 | |||
4281 | spin_lock_irqsave(&pool->lock, flags); | ||
4282 | if (pool->nr_workers == pool->nr_idle) | ||
4283 | goto next_pool; | ||
4284 | |||
4285 | pr_info("pool %d:", pool->id); | ||
4286 | pr_cont_pool_info(pool); | ||
4287 | pr_cont(" workers=%d", pool->nr_workers); | ||
4288 | if (pool->manager) | ||
4289 | pr_cont(" manager: %d", | ||
4290 | task_pid_nr(pool->manager->task)); | ||
4291 | list_for_each_entry(worker, &pool->idle_list, entry) { | ||
4292 | pr_cont(" %s%d", first ? "idle: " : "", | ||
4293 | task_pid_nr(worker->task)); | ||
4294 | first = false; | ||
4295 | } | ||
4296 | pr_cont("\n"); | ||
4297 | next_pool: | ||
4298 | spin_unlock_irqrestore(&pool->lock, flags); | ||
4299 | } | ||
4300 | |||
4301 | rcu_read_unlock_sched(); | ||
4302 | } | ||
4303 | |||
4440 | /* | 4304 | /* |
4441 | * CPU hotplug. | 4305 | * CPU hotplug. |
4442 | * | 4306 | * |
@@ -4834,6 +4698,323 @@ out_unlock: | |||
4834 | } | 4698 | } |
4835 | #endif /* CONFIG_FREEZER */ | 4699 | #endif /* CONFIG_FREEZER */ |
4836 | 4700 | ||
4701 | #ifdef CONFIG_SYSFS | ||
4702 | /* | ||
4703 | * Workqueues with WQ_SYSFS flag set is visible to userland via | ||
4704 | * /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the | ||
4705 | * following attributes. | ||
4706 | * | ||
4707 | * per_cpu RO bool : whether the workqueue is per-cpu or unbound | ||
4708 | * max_active RW int : maximum number of in-flight work items | ||
4709 | * | ||
4710 | * Unbound workqueues have the following extra attributes. | ||
4711 | * | ||
4712 | * id RO int : the associated pool ID | ||
4713 | * nice RW int : nice value of the workers | ||
4714 | * cpumask RW mask : bitmask of allowed CPUs for the workers | ||
4715 | */ | ||
4716 | struct wq_device { | ||
4717 | struct workqueue_struct *wq; | ||
4718 | struct device dev; | ||
4719 | }; | ||
4720 | |||
4721 | static struct workqueue_struct *dev_to_wq(struct device *dev) | ||
4722 | { | ||
4723 | struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); | ||
4724 | |||
4725 | return wq_dev->wq; | ||
4726 | } | ||
4727 | |||
4728 | static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr, | ||
4729 | char *buf) | ||
4730 | { | ||
4731 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
4732 | |||
4733 | return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND)); | ||
4734 | } | ||
4735 | static DEVICE_ATTR_RO(per_cpu); | ||
4736 | |||
4737 | static ssize_t max_active_show(struct device *dev, | ||
4738 | struct device_attribute *attr, char *buf) | ||
4739 | { | ||
4740 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
4741 | |||
4742 | return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active); | ||
4743 | } | ||
4744 | |||
4745 | static ssize_t max_active_store(struct device *dev, | ||
4746 | struct device_attribute *attr, const char *buf, | ||
4747 | size_t count) | ||
4748 | { | ||
4749 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
4750 | int val; | ||
4751 | |||
4752 | if (sscanf(buf, "%d", &val) != 1 || val <= 0) | ||
4753 | return -EINVAL; | ||
4754 | |||
4755 | workqueue_set_max_active(wq, val); | ||
4756 | return count; | ||
4757 | } | ||
4758 | static DEVICE_ATTR_RW(max_active); | ||
4759 | |||
4760 | static struct attribute *wq_sysfs_attrs[] = { | ||
4761 | &dev_attr_per_cpu.attr, | ||
4762 | &dev_attr_max_active.attr, | ||
4763 | NULL, | ||
4764 | }; | ||
4765 | ATTRIBUTE_GROUPS(wq_sysfs); | ||
4766 | |||
4767 | static ssize_t wq_pool_ids_show(struct device *dev, | ||
4768 | struct device_attribute *attr, char *buf) | ||
4769 | { | ||
4770 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
4771 | const char *delim = ""; | ||
4772 | int node, written = 0; | ||
4773 | |||
4774 | rcu_read_lock_sched(); | ||
4775 | for_each_node(node) { | ||
4776 | written += scnprintf(buf + written, PAGE_SIZE - written, | ||
4777 | "%s%d:%d", delim, node, | ||
4778 | unbound_pwq_by_node(wq, node)->pool->id); | ||
4779 | delim = " "; | ||
4780 | } | ||
4781 | written += scnprintf(buf + written, PAGE_SIZE - written, "\n"); | ||
4782 | rcu_read_unlock_sched(); | ||
4783 | |||
4784 | return written; | ||
4785 | } | ||
4786 | |||
4787 | static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr, | ||
4788 | char *buf) | ||
4789 | { | ||
4790 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
4791 | int written; | ||
4792 | |||
4793 | mutex_lock(&wq->mutex); | ||
4794 | written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice); | ||
4795 | mutex_unlock(&wq->mutex); | ||
4796 | |||
4797 | return written; | ||
4798 | } | ||
4799 | |||
4800 | /* prepare workqueue_attrs for sysfs store operations */ | ||
4801 | static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq) | ||
4802 | { | ||
4803 | struct workqueue_attrs *attrs; | ||
4804 | |||
4805 | attrs = alloc_workqueue_attrs(GFP_KERNEL); | ||
4806 | if (!attrs) | ||
4807 | return NULL; | ||
4808 | |||
4809 | mutex_lock(&wq->mutex); | ||
4810 | copy_workqueue_attrs(attrs, wq->unbound_attrs); | ||
4811 | mutex_unlock(&wq->mutex); | ||
4812 | return attrs; | ||
4813 | } | ||
4814 | |||
4815 | static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr, | ||
4816 | const char *buf, size_t count) | ||
4817 | { | ||
4818 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
4819 | struct workqueue_attrs *attrs; | ||
4820 | int ret; | ||
4821 | |||
4822 | attrs = wq_sysfs_prep_attrs(wq); | ||
4823 | if (!attrs) | ||
4824 | return -ENOMEM; | ||
4825 | |||
4826 | if (sscanf(buf, "%d", &attrs->nice) == 1 && | ||
4827 | attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE) | ||
4828 | ret = apply_workqueue_attrs(wq, attrs); | ||
4829 | else | ||
4830 | ret = -EINVAL; | ||
4831 | |||
4832 | free_workqueue_attrs(attrs); | ||
4833 | return ret ?: count; | ||
4834 | } | ||
4835 | |||
4836 | static ssize_t wq_cpumask_show(struct device *dev, | ||
4837 | struct device_attribute *attr, char *buf) | ||
4838 | { | ||
4839 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
4840 | int written; | ||
4841 | |||
4842 | mutex_lock(&wq->mutex); | ||
4843 | written = scnprintf(buf, PAGE_SIZE, "%*pb\n", | ||
4844 | cpumask_pr_args(wq->unbound_attrs->cpumask)); | ||
4845 | mutex_unlock(&wq->mutex); | ||
4846 | return written; | ||
4847 | } | ||
4848 | |||
4849 | static ssize_t wq_cpumask_store(struct device *dev, | ||
4850 | struct device_attribute *attr, | ||
4851 | const char *buf, size_t count) | ||
4852 | { | ||
4853 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
4854 | struct workqueue_attrs *attrs; | ||
4855 | int ret; | ||
4856 | |||
4857 | attrs = wq_sysfs_prep_attrs(wq); | ||
4858 | if (!attrs) | ||
4859 | return -ENOMEM; | ||
4860 | |||
4861 | ret = cpumask_parse(buf, attrs->cpumask); | ||
4862 | if (!ret) | ||
4863 | ret = apply_workqueue_attrs(wq, attrs); | ||
4864 | |||
4865 | free_workqueue_attrs(attrs); | ||
4866 | return ret ?: count; | ||
4867 | } | ||
4868 | |||
4869 | static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr, | ||
4870 | char *buf) | ||
4871 | { | ||
4872 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
4873 | int written; | ||
4874 | |||
4875 | mutex_lock(&wq->mutex); | ||
4876 | written = scnprintf(buf, PAGE_SIZE, "%d\n", | ||
4877 | !wq->unbound_attrs->no_numa); | ||
4878 | mutex_unlock(&wq->mutex); | ||
4879 | |||
4880 | return written; | ||
4881 | } | ||
4882 | |||
4883 | static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr, | ||
4884 | const char *buf, size_t count) | ||
4885 | { | ||
4886 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
4887 | struct workqueue_attrs *attrs; | ||
4888 | int v, ret; | ||
4889 | |||
4890 | attrs = wq_sysfs_prep_attrs(wq); | ||
4891 | if (!attrs) | ||
4892 | return -ENOMEM; | ||
4893 | |||
4894 | ret = -EINVAL; | ||
4895 | if (sscanf(buf, "%d", &v) == 1) { | ||
4896 | attrs->no_numa = !v; | ||
4897 | ret = apply_workqueue_attrs(wq, attrs); | ||
4898 | } | ||
4899 | |||
4900 | free_workqueue_attrs(attrs); | ||
4901 | return ret ?: count; | ||
4902 | } | ||
4903 | |||
4904 | static struct device_attribute wq_sysfs_unbound_attrs[] = { | ||
4905 | __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL), | ||
4906 | __ATTR(nice, 0644, wq_nice_show, wq_nice_store), | ||
4907 | __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store), | ||
4908 | __ATTR(numa, 0644, wq_numa_show, wq_numa_store), | ||
4909 | __ATTR_NULL, | ||
4910 | }; | ||
4911 | |||
4912 | static struct bus_type wq_subsys = { | ||
4913 | .name = "workqueue", | ||
4914 | .dev_groups = wq_sysfs_groups, | ||
4915 | }; | ||
4916 | |||
4917 | static int __init wq_sysfs_init(void) | ||
4918 | { | ||
4919 | return subsys_virtual_register(&wq_subsys, NULL); | ||
4920 | } | ||
4921 | core_initcall(wq_sysfs_init); | ||
4922 | |||
4923 | static void wq_device_release(struct device *dev) | ||
4924 | { | ||
4925 | struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); | ||
4926 | |||
4927 | kfree(wq_dev); | ||
4928 | } | ||
4929 | |||
4930 | /** | ||
4931 | * workqueue_sysfs_register - make a workqueue visible in sysfs | ||
4932 | * @wq: the workqueue to register | ||
4933 | * | ||
4934 | * Expose @wq in sysfs under /sys/bus/workqueue/devices. | ||
4935 | * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set, | ||
4936 | * which is the preferred method. | ||
4937 | * | ||
4938 | * A workqueue user should call this function directly only if it wants to | ||
4939 | * apply workqueue_attrs before making the workqueue visible in sysfs; | ||
4940 | * otherwise, apply_workqueue_attrs() may race against userland updating | ||
4941 | * the attributes. | ||
4942 | * | ||
4943 | * Return: 0 on success, -errno on failure. | ||
4944 | */ | ||
4945 | int workqueue_sysfs_register(struct workqueue_struct *wq) | ||
4946 | { | ||
4947 | struct wq_device *wq_dev; | ||
4948 | int ret; | ||
4949 | |||
4950 | /* | ||
4951 | * Adjusting max_active or creating new pwqs by applying | ||
4952 | * attributes breaks the ordering guarantee. Disallow exposing | ||
4953 | * ordered workqueues. | ||
4954 | */ | ||
4955 | if (WARN_ON(wq->flags & __WQ_ORDERED)) | ||
4956 | return -EINVAL; | ||
4957 | |||
4958 | wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL); | ||
4959 | if (!wq_dev) | ||
4960 | return -ENOMEM; | ||
4961 | |||
4962 | wq_dev->wq = wq; | ||
4963 | wq_dev->dev.bus = &wq_subsys; | ||
4964 | wq_dev->dev.init_name = wq->name; | ||
4965 | wq_dev->dev.release = wq_device_release; | ||
4966 | |||
4967 | /* | ||
4968 | * unbound_attrs are created separately. Suppress uevent until | ||
4969 | * everything is ready. | ||
4970 | */ | ||
4971 | dev_set_uevent_suppress(&wq_dev->dev, true); | ||
4972 | |||
4973 | ret = device_register(&wq_dev->dev); | ||
4974 | if (ret) { | ||
4975 | kfree(wq_dev); | ||
4976 | wq->wq_dev = NULL; | ||
4977 | return ret; | ||
4978 | } | ||
4979 | |||
4980 | if (wq->flags & WQ_UNBOUND) { | ||
4981 | struct device_attribute *attr; | ||
4982 | |||
4983 | for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) { | ||
4984 | ret = device_create_file(&wq_dev->dev, attr); | ||
4985 | if (ret) { | ||
4986 | device_unregister(&wq_dev->dev); | ||
4987 | wq->wq_dev = NULL; | ||
4988 | return ret; | ||
4989 | } | ||
4990 | } | ||
4991 | } | ||
4992 | |||
4993 | dev_set_uevent_suppress(&wq_dev->dev, false); | ||
4994 | kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD); | ||
4995 | return 0; | ||
4996 | } | ||
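The kerneldoc above says callers should invoke workqueue_sysfs_register() directly only when they need their workqueue_attrs applied before the knobs become visible to userland. A minimal in-kernel sketch of that pattern follows; it is not part of the patch, the workqueue name and attribute values are invented, and it assumes the alloc_workqueue()/alloc_workqueue_attrs()/apply_workqueue_attrs() APIs of this kernel.

/* Hypothetical built-in user of workqueue_sysfs_register(). */
#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/workqueue.h>

static struct workqueue_struct *example_wq;	/* hypothetical */

static int __init example_wq_init(void)
{
	struct workqueue_attrs *attrs;
	int ret = -ENOMEM;

	/* WQ_SYSFS deliberately omitted; registration happens below */
	example_wq = alloc_workqueue("example_wq", WQ_UNBOUND, 0);
	if (!example_wq)
		return -ENOMEM;

	attrs = alloc_workqueue_attrs(GFP_KERNEL);
	if (!attrs)
		goto err;

	attrs->nice = -5;
	cpumask_copy(attrs->cpumask, cpu_possible_mask);
	ret = apply_workqueue_attrs(example_wq, attrs);
	free_workqueue_attrs(attrs);
	if (ret)
		goto err;

	/* attrs are settled; now expose them under /sys/bus/workqueue/devices/ */
	ret = workqueue_sysfs_register(example_wq);
	if (ret)
		goto err;
	return 0;

err:
	destroy_workqueue(example_wq);
	return ret;
}
late_initcall(example_wq_init);

Because workqueue_sysfs_register() suppresses uevents until the unbound attribute files are created, userland only sees KOBJ_ADD once every knob exists, so a sequence like the above never exposes half-initialized attributes.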
4997 | |||
4998 | /** | ||
4999 | * workqueue_sysfs_unregister - undo workqueue_sysfs_register() | ||
5000 | * @wq: the workqueue to unregister | ||
5001 | * | ||
5002 | * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister it. | ||
5003 | */ | ||
5004 | static void workqueue_sysfs_unregister(struct workqueue_struct *wq) | ||
5005 | { | ||
5006 | struct wq_device *wq_dev = wq->wq_dev; | ||
5007 | |||
5008 | if (!wq->wq_dev) | ||
5009 | return; | ||
5010 | |||
5011 | wq->wq_dev = NULL; | ||
5012 | device_unregister(&wq_dev->dev); | ||
5013 | } | ||
5014 | #else /* CONFIG_SYSFS */ | ||
5015 | static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { } | ||
5016 | #endif /* CONFIG_SYSFS */ | ||
5017 | |||
4837 | static void __init wq_numa_init(void) | 5018 | static void __init wq_numa_init(void) |
4838 | { | 5019 | { |
4839 | cpumask_var_t *tbl; | 5020 | cpumask_var_t *tbl; |